aboutsummaryrefslogtreecommitdiff
path: root/arm_compute
diff options
context:
space:
mode:
authorGian Marco <gianmarco.iodice@arm.com>2017-11-28 09:10:03 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:41:58 +0000
commit58c5794b917dae10ff115dd85ec69e2ca41136c1 (patch)
treef2cea2d94e6566be720256dc6105056798723699 /arm_compute
parent754e9526a7caf50876c2db9563dc72f096093b34 (diff)
downloadComputeLibrary-58c5794b917dae10ff115dd85ec69e2ca41136c1.tar.gz
COMPMID-706 - Add GEMMLowp output stage for scaling by a fixed point number
DoD: - Implement NEON kernel for quantizing down the gemmlowp result. The result should be scaled by a fixedpoint number - Implement OpenCL kernel for quantizing down the gemmlowp result. The result should be scaled by a fixedpoint number - Add test for validating the result Required for: - Integration of GEMMLowp in Android NN - Convolution quantized - Fully connected quantized Change-Id: Ia963d25d695471e963961fb49a5600e78374ac4f Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110981 Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'arm_compute')
-rw-r--r--arm_compute/core/CL/CLKernels.h1
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h96
-rw-r--r--arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h19
-rw-r--r--arm_compute/core/NEON/NEAsymm.h43
-rw-r--r--arm_compute/core/NEON/NEAsymm.inl33
-rw-r--r--arm_compute/core/NEON/NEKernels.h1
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h2
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h116
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h8
-rw-r--r--arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h28
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h77
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h66
12 files changed, 466 insertions, 24 deletions
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index e80df6aada..1ffbad90cf 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -59,6 +59,7 @@
#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
new file mode 100644
index 0000000000..a1c6a1f7e1
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ */
+class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICLKernel
+{
+public:
+ /** Constructor */
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8
+ * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
+ * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
+ * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return an error status
+ */
+ static Error validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ const ICLTensor *_bias;
+ ICLTensor *_output;
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
index 75f3750714..08554983d6 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__
-#define __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__
+#ifndef __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__
+#define __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__
#include "arm_compute/core/CL/ICLKernel.h"
@@ -70,6 +70,19 @@ public:
* Along with @p min, this value can be used to implement "rectified linear unit" activation functions
*/
void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return an error status
+ */
+ static Error validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
// Inherited methods overridden:
void run(const Window &window, cl::CommandQueue &queue) override;
@@ -81,4 +94,4 @@ private:
};
} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ */ \ No newline at end of file
+#endif /* __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__ */ \ No newline at end of file
diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
new file mode 100644
index 0000000000..d227d3ccbe
--- /dev/null
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEASYMM_H__
+#define __ARM_COMPUTE_NEASYMM_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+/** Round to the nearest division by a power-of-two using exponent
+ *
+ * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent
+ *
+ * @param[in] x Vector of 4 elements
+ * @param[in] exponent Integer value used to round to nearest division by a power-of-two
+ *
+ * @return the nearest division by a power-of-two using exponent
+ */
+int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent);
+} // namespace arm_compute
+#include "arm_compute/core/NEON/NEAsymm.inl"
+#endif // __ARM_COMPUTE_NEASYMM_H__ \ No newline at end of file
diff --git a/arm_compute/core/NEON/NEAsymm.inl b/arm_compute/core/NEON/NEAsymm.inl
new file mode 100644
index 0000000000..bbce308b35
--- /dev/null
+++ b/arm_compute/core/NEON/NEAsymm.inl
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+namespace arm_compute
+{
+inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent)
+{
+ const int32x4_t shift_vec = vdupq_n_s32(-exponent);
+ const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
+ const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
+ return vrshlq_s32(fixed_up_x, shift_vec);
+}
+} // namespace arm_compute \ No newline at end of file
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 281f06305f..b23e2ac5a3 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -66,6 +66,7 @@
#include "arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
index 989260de11..ac0af7cff3 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
@@ -77,6 +77,8 @@ public:
* Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
* @param[in] a_offset Offset to be added to each element of the matrix A.
* @param[in] b_offset Offset to be added to each element of the matrix B.
+ *
+ * @return an error status
*/
static Error validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset);
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
new file mode 100644
index 0000000000..24ba54ebdf
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
+ *
+ * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * The following computations will be performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
+ *
+ */
+class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public INEKernel
+{
+public:
+ /** Constructor */
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8
+ * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
+ * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
+ * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return an error status
+ */
+ static Error validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ /** Template function to run the NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ *
+ * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+ */
+ template <bool is_bounded_relu>
+ void run(const Window &window);
+
+ /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel functions
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+ using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)(const Window &window);
+
+ QuantizeDownFunctionPtr _func;
+ const ITensor *_input;
+ const ITensor *_bias;
+ ITensor *_output;
+ int _result_fixedpoint_multiplier;
+ int _result_shift;
+ int _result_offset_after_shift;
+ int _min;
+ int _max;
+};
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
index a522069330..d873a889d2 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__
-#define __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__
+#ifndef __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__
+#define __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__
#include "arm_compute/core/NEON/INEKernel.h"
@@ -79,6 +79,8 @@ public:
* @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
* @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
* Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return an error status
*/
static Error validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
@@ -111,4 +113,4 @@ private:
};
} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALE_H__ */
+#endif /* __ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEKERNEL_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index 26f23ce5f3..2cac06c1c9 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -87,20 +87,20 @@ private:
void configure_conv_fc(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output);
void configure_mm(const ICLTensor *input, const ICLTensor *weights, ICLTensor *output, bool is_interleaved_transposed = true);
- CLMemoryGroup _memory_group;
- CLIm2ColKernel _im2col_kernel;
- CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
- CLGEMMMatrixMultiplyKernel _mm_kernel;
- CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
- CLGEMMLowpQuantizeDownInt32ToUint8Scale _gemmlowp_output_stage;
- CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
- CLTensor _im2col_output;
- CLTensor _gemmlowp_output;
- CLTensor _reshape_weights_output;
- bool _are_weights_reshaped;
- bool _is_fc_after_conv;
- bool _accumulate_biases;
- bool _is_quantized;
+ CLMemoryGroup _memory_group;
+ CLIm2ColKernel _im2col_kernel;
+ CLFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+ CLGEMMMatrixMultiplyKernel _mm_kernel;
+ CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
+ CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
+ CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
+ CLTensor _im2col_output;
+ CLTensor _gemmlowp_output;
+ CLTensor _reshape_weights_output;
+ bool _are_weights_reshaped;
+ bool _is_fc_after_conv;
+ bool _accumulate_biases;
+ bool _is_quantized;
};
}
#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
index 5c176a960b..c7e0c991d9 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
@@ -47,14 +47,14 @@ class ITensor;
*
* In case the bias tensor is provided, the final result is:
*
- * ((input[i][k] + result_offset) * result_mult_int + bias[k]) >> result_shift
+ * ((input[i][k] + bias[k] + result_offset) * result_mult_int) >> result_shift
*
* This function calls the following OpenCL kernels:
*
* -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
*
* @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * before the result is shifted right by result_shift
+ * after the result is shifted right by result_shift
*/
class CLGEMMLowpQuantizeDownInt32ToUint8Scale : public ICLSimpleFunction
{
@@ -73,6 +73,79 @@ public:
* Along with @p min, this value can be used to implement "rectified linear unit" activation functions
*/
void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8Scale
+ *
+ * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return an error status
+ */
+ static Error validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+};
+
+/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on OpenCL.
+ *
+ * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
+ *
+ * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
+ *
+ * The final result is:
+ *
+ * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
+ *
+ * where FixedPointMul(x, y) is the nearest integer to the following
+ * mathematical expression, evaluated without overflow or intermediate rounding:
+ *
+ * (x * y) / 2^31
+ *
+ * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
+ *
+ * In case the bias tensor is provided, the final result is:
+ *
+ * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
+ *
+ * This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ *
+ * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
+ * after the result is shifted right by result_shift
+*/
+class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public ICLSimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8
+ * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
+ * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
+ * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
+ *
+ * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return an error status
+ */
+ static Error validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
};
}
#endif /*__ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__ */ \ No newline at end of file
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
index 533a41c888..8a3d3e73d4 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -47,14 +47,14 @@ class ITensor;
*
* In case the bias tensor is provided, the final result is:
*
- * ((input[i][k] + result_offset) * result_mult_int + bias[k]) >> result_shift
+ * ((input[i][k] + bias[k] + result_offset) * result_mult_int) >> result_shift
*
* This function calls the following NEON kernels:
*
* -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
*
* @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * before the result is shifted right by result_shift
+ * after the result is shifted right by result_shift
*/
class NEGEMMLowpQuantizeDownInt32ToUint8Scale : public INESimpleFunction
{
@@ -82,6 +82,68 @@ public:
* @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
* @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
* Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return an error status
+ */
+ static Error validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
+};
+
+/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on NEON.
+ *
+ * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
+ *
+ * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
+ *
+ * The final result is:
+ *
+ * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
+ *
+ * where FixedPointMul(x, y) is the nearest integer to the following
+ * mathematical expression, evaluated without overflow or intermediate rounding:
+ *
+ * (x * y) / 2^31
+ *
+ * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
+ *
+ * In case the bias tensor is provided, the final result is:
+ *
+ * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
+ *
+ * This function calls the following NEON kernels:
+ *
+ * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ *
+ * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
+ * after the result is shifted right by result_shift
+*/
+class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public INESimpleFunction
+{
+public:
+ /** Initialise the kernel's inputs, output
+ *
+ * @param[in] input Input tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8
+ * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
+ * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
+ * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ */
+ void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
+ *
+ * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+ * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8
+ * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+ * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+ * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+ *
+ * @return an error status
*/
static Error validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
};