path: root/src/runtime
author    Gian Marco <gianmarco.iodice@arm.com>    2017-11-21 10:57:50 +0000
committer Anthony Barbier <anthony.barbier@arm.com>    2018-11-02 16:41:17 +0000
commit    05288a2b871ef99f544771621c3bba409b2f70df (patch)
tree      21e3d2a9927ef31f6d5bcdd5523c4c8e933047a6 /src/runtime
parent    c82799003fbfdc5bb9526ff944e41eaae23e3f03 (diff)
download  ComputeLibrary-05288a2b871ef99f544771621c3bba409b2f70df.tar.gz
COMPMID-697 - Rework GEMMLowp interface on OpenCL
Reworked the interface of GEMMLowp in order to ease the integration in Android NN:
- Added support for different output stages
- Added validation for both matrix multiplication and output stage
- Added bounded relu support in the output stage
- Added int32_t bias support
- Added an optimized path for the vector-by-matrix case

This rework is required for:
- Convolution quantized
- Fully connected quantized

Change-Id: I512283d406099cf8c614dd89d0a97ed411143afc
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110625
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
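For readers integrating the reworked interface, a minimal end-to-end usage sketch follows. The two configure() signatures are taken from this patch; everything else — shapes, quantization parameters, output-stage constants, the default-constructed functions and the map()/unmap() fill step — is illustrative assumption, not part of the commit.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Illustrative shapes: A is MxK = 8x32, B is KxN = 32x16 (ACL shapes are [width, height])
    CLTensor a, b, acc, bias, out;
    a.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    b.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 5)));
    acc.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::S32));
    bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::S32));
    out.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::QASYMM8));

    // Stage 1: QASYMM8 x QASYMM8 -> S32 accumulators
    CLGEMMLowpMatrixMultiplyCore mm_core;
    mm_core.configure(&a, &b, &acc);

    // Stage 2: requantize S32 -> U8, adding the new int32_t bias and
    // clamping to [min, max] (a bounded relu narrows this range)
    CLGEMMLowpQuantizeDownInt32ToUint8Scale output_stage;
    output_stage.configure(&acc, &bias, &out, /*result_offset=*/2, /*result_mult_int=*/1, /*result_shift=*/8, /*min=*/0, /*max=*/255);

    for(CLTensor *t : { &a, &b, &acc, &bias, &out })
    {
        t->allocator()->allocate();
    }
    // ... fill a, b and bias via map()/unmap() ...

    mm_core.run();
    output_stage.run();
    CLScheduler::get().sync();
    return 0;
}

Note that A here has only 8 rows (not more than 16), so the core would take the new non-reshaped path added for the vector-by-matrix case.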
Diffstat (limited to 'src/runtime')
-rw-r--r--  src/runtime/CL/functions/CLGEMMLowp.cpp                       93
-rw-r--r--  src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp    178
-rw-r--r--  src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp            37
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp   10
4 files changed, 223 insertions, 95 deletions
diff --git a/src/runtime/CL/functions/CLGEMMLowp.cpp b/src/runtime/CL/functions/CLGEMMLowp.cpp
deleted file mode 100644
index db6d11c2c3..0000000000
--- a/src/runtime/CL/functions/CLGEMMLowp.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/CL/functions/CLGEMMLowp.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-using namespace arm_compute;
-
-CLGEMMLowp::CLGEMMLowp(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _tmp_a(), _tmp_b()
-{
-}
-
-void CLGEMMLowp::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
- ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
-
- // Create shape for interleaved temporary tensor
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
- _tmp_a.allocator()->init(info_a);
-
- // Create shape for transposed temporary tensor
- TensorShape shape_tmp_b = b->info()->tensor_shape();
- shape_tmp_b.set(0, b->info()->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(static_cast<float>(b->info()->dimension(0)) / 16));
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
- _tmp_b.allocator()->init(info_b);
-
- // Manage intermediate buffers
- _memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
-
- // Configure kernels
- _interleave_kernel.configure(a, &_tmp_a);
- _transpose_kernel.configure(b, &_tmp_b);
- _mm_kernel.configure(&_tmp_a, &_tmp_b, output, a_offset, b_offset, output_offset, output_mult_int, shift);
-
- // Allocate intermediate buffers
- _tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
-}
-
-void CLGEMMLowp::run()
-{
- _memory_group.acquire();
-
- /* Run interleave kernel */
- CLScheduler::get().enqueue(_interleave_kernel, false);
-
- /* Run transpose kernel */
- CLScheduler::get().enqueue(_transpose_kernel, false);
-
- /* Run matrix multiply kernel */
- CLScheduler::get().enqueue(_mm_kernel, false);
-
- _memory_group.release();
-}
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
new file mode 100644
index 0000000000..5d2d13e243
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+using namespace arm_compute;
+
+CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
+ _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true)
+{
+}
+
+void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+ ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
+ ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+
+ _a_offset = a->info()->quantization_info().offset;
+ _b_offset = b->info()->quantization_info().offset;
+
+ // If the input tensor has less than 16 rows, we run a special version of GEMMLowp without reshaping the input tensors
+ _is_interleaved_transposed = a->info()->dimension(1) > 16;
+
+ const ICLTensor *matrix_a = a;
+ const ICLTensor *matrix_b = b;
+
+ if(_is_interleaved_transposed)
+ {
+ matrix_a = &_tmp_a;
+ matrix_b = &_tmp_b;
+
+ // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+ TensorShape shape_tmp_a = a->info()->tensor_shape();
+ shape_tmp_a.set(0, a->info()->dimension(0) * 4);
+ shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
+
+ // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+ TensorShape shape_tmp_b = b->info()->tensor_shape();
+ shape_tmp_b.set(0, b->info()->dimension(1) * 16);
+ shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
+
+ TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
+ TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
+ _tmp_a.allocator()->init(info_a);
+ _tmp_b.allocator()->init(info_b);
+ _memory_group.manage(&_tmp_a);
+ _memory_group.manage(&_tmp_b);
+
+ // Configure interleave kernel
+ _mtx_a_reshape_kernel.configure(a, &_tmp_a);
+
+ // Configure transpose kernel
+ _mtx_b_reshape_kernel.configure(b, &_tmp_b);
+ }
+
+ // Configure matrix multiply kernel
+ _mm_kernel.configure(matrix_a, matrix_b, output, _is_interleaved_transposed);
+
+ // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0)
+ {
+ TensorShape shape_vector_sum_col = b->info()->tensor_shape();
+ if(b->info()->num_dimensions() > 1)
+ {
+ shape_vector_sum_col.remove_dimension(1);
+ }
+ TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);
+ _vector_sum_col.allocator()->init(info_vector_sum_col);
+ _memory_group.manage(&_vector_sum_col);
+
+ // Configure Matrix B reduction kernel
+ _mtx_b_reduction_kernel.configure(b, &_vector_sum_col);
+ }
+
+ // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+ if(_b_offset != 0)
+ {
+ TensorShape shape_vector_sum_row = a->info()->tensor_shape();
+ shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));
+ if(a->info()->num_dimensions() > 1)
+ {
+ shape_vector_sum_row.remove_dimension(1);
+ }
+ TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);
+ _vector_sum_row.allocator()->init(info_vector_sum_row);
+ _memory_group.manage(&_vector_sum_row);
+
+ // Configure matrix A reduction kernel
+ _mtx_a_reduction_kernel.configure(a, &_vector_sum_row);
+ }
+
+ // Configure offset contribution kernel
+ _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+
+ // Allocate tensors
+ if(_is_interleaved_transposed)
+ {
+ _tmp_a.allocator()->allocate();
+ _tmp_b.allocator()->allocate();
+ }
+
+ if(_a_offset != 0)
+ {
+ _vector_sum_col.allocator()->allocate();
+ }
+
+ if(_b_offset != 0)
+ {
+ _vector_sum_row.allocator()->allocate();
+ }
+}
+
+void CLGEMMLowpMatrixMultiplyCore::run()
+{
+ _memory_group.acquire();
+
+ if(_is_interleaved_transposed)
+ {
+ // Run reshape matrix A
+ CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
+
+ // Run reshape matrix B
+ CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+ }
+
+ // Run matrix multiply
+ CLScheduler::get().enqueue(_mm_kernel, false);
+
+ // Run matrix A reduction kernel only if _b_offset is not equal to 0
+ if(_b_offset != 0)
+ {
+ CLScheduler::get().enqueue(_mtx_a_reduction_kernel, false);
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0)
+ {
+ CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
+ }
+
+ // Run offset contribution kernel
+ CLScheduler::get().enqueue(_offset_contribution_kernel, true);
+
+ _memory_group.release();
+}
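For context (this derivation is editorial, not part of the patch): the kernels above implement the standard quantized-GEMM expansion. With zero-point offsets z_a and z_b, inner dimension K, and up to the library's sign convention for offsets:

\[
\sum_{k=0}^{K-1}(a_{ik}-z_a)(b_{kj}-z_b) \;=\; \sum_{k=0}^{K-1}a_{ik}b_{kj} \;-\; z_b\sum_{k=0}^{K-1}a_{ik} \;-\; z_a\sum_{k=0}^{K-1}b_{kj} \;+\; K\,z_a z_b
\]

The second term needs the row sums of A, which is why _mtx_a_reduction_kernel runs only when _b_offset != 0; the third needs the column sums of B, which is why _mtx_b_reduction_kernel runs only when _a_offset != 0; the raw product and the remaining terms are then combined by _offset_contribution_kernel.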
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
new file mode 100644
index 0000000000..b1d620d8a2
--- /dev/null
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
+{
+ auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel>();
+ k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max);
+ _kernel = std::move(k);
+}
\ No newline at end of file
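For reference, the quantize-down kernel wrapped here computes, roughly (exact rounding and saturation behaviour are defined by the kernel itself):

\[
\text{out}_{ij} \;=\; \operatorname{clamp}_{[\min,\,\max]}\!\Big(\big(\text{acc}_{ij} + \text{bias}_{j} + \text{result\_offset}\big)\cdot \text{result\_mult\_int} \gg \text{result\_shift}\Big)
\]

with the result saturating-cast to uint8; the [min, max] clamp implements the bounded relu mentioned in the commit message.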
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 92c911c370..da5ac22fdc 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -133,7 +133,10 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
if(_a_offset != 0)
{
TensorShape shape_vector_sum_col = b->info()->tensor_shape();
- shape_vector_sum_col.remove_dimension(1);
+ if(b->info()->num_dimensions() > 1)
+ {
+ shape_vector_sum_col.remove_dimension(1);
+ }
TensorInfo info_vector_sum_col(shape_vector_sum_col, 1, DataType::S32);
_vector_sum_col.allocator()->init(info_vector_sum_col);
_memory_group.manage(&_vector_sum_col);
@@ -147,7 +150,10 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
{
TensorShape shape_vector_sum_row = a->info()->tensor_shape();
shape_vector_sum_row.set(Window::DimX, a->info()->dimension(1));
- shape_vector_sum_row.remove_dimension(1);
+ if(a->info()->num_dimensions() > 1)
+ {
+ shape_vector_sum_row.remove_dimension(1);
+ }
TensorInfo info_vector_sum_row(shape_vector_sum_row, 1, DataType::S32);
_vector_sum_row.allocator()->init(info_vector_sum_row);
_memory_group.manage(&_vector_sum_row);
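The NEON hunks above guard remove_dimension(1) so that the 1-D shapes arising from the new vector-by-matrix path do not try to drop a dimension that is not there. A minimal sketch of the guarded pattern, assuming only the arm_compute core types used in this diff (illustrative, not from the patch):

#include "arm_compute/core/TensorShape.h"

using namespace arm_compute;

int main()
{
    TensorShape shape(64U);            // 1-D case: a row vector with K = 64
    // Without the guard, remove_dimension(1) would target a dimension
    // that does not exist on a 1-D shape
    if(shape.num_dimensions() > 1)
    {
        shape.remove_dimension(1);
    }
    return 0;
}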