aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/convolution
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2018-09-12 16:45:53 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:54:54 +0000
commit20c246a60869bada4051bd14eb9a3862be5330d7 (patch)
tree7ae712b57aa052890588ef18a5c972edf352786d /src/core/NEON/kernels/convolution
parentac314c25f41e3b2be2ef9073377079584fc88861 (diff)
downloadComputeLibrary-20c246a60869bada4051bd14eb9a3862be5330d7.tar.gz
COMPMID-1532: Add DepthwiseConvolution3x3 FP16 on NEON
Change-Id: I780970f317b979b3230e2b471ac01df7fda9ee14 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/148168 Tested-by: bsgcomp <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/convolution')
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp2
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp2
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp2
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp2
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp130
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp2
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp168
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp2
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp290
-rw-r--r--src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp290
10 files changed, 884 insertions, 6 deletions
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
index 9b3a60db59..c5a056560b 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
namespace depthwise
{
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
index dba2330507..9ce43f949b 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
namespace depthwise
{
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
index b946e5d19e..0c96bebc02 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
namespace depthwise
{
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
index 2510941f35..941c8e9248 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
namespace depthwise
{
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp
new file mode 100644
index 0000000000..33b55df449
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp16_fp16.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_fp16_fp16.hpp"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+ ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+ ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+
+template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t>;
+} // namespace depthwise
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
index 44b93a12d6..1cbd6d5623 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
namespace depthwise
{
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp
new file mode 100644
index 0000000000..09722d0d2d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp16_fp16.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_fp16_fp16.hpp"
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+
+template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, float16_t, float16_t>;
+} // namespace depthwise
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
index 8eb53a66b4..05315eeab3 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_fp32_fp32.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp"
+#include "impl_fp32_fp32.hpp"
namespace depthwise
{
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
new file mode 100644
index 0000000000..ed4cfb86b9
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ *
+ * NOTE: Header to be included by implementation files only.
+ *
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+
+#pragma once
+
+namespace depthwise
+{
+// Partial specialisation for FP16 to FP16
+template <int OutputTileRows, int OutputTileCols,
+ int KernelRows, int KernelCols,
+ int StrideRows, int StrideCols>
+struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float16_t, float16_t>
+{
+ typedef DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float16_t, float16_t
+ > DWC;
+
+ template <
+ bool Specialize=false, // Specialize (or not) the method
+ int InPadTop=0, // If specialized, top padding
+ int InPadLeft=0, // If specialized, left padding
+ int InPadBottom=0, // If specialized, bottom padding
+ int InPadRight=0, // If specialized, right padding
+ int OutPadBottom=0, // If specialized, bottom output padding
+ int OutPadRight=0 // If specialized, bottom right padding
+ >
+ static void process_tile(
+ const int n_channels,
+ const float16_t* const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const float16_t* const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ float16_t* const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+ const int in_pad_top=0,
+ const int in_pad_left=0,
+ const int in_pad_bottom=0,
+ const int in_pad_right=0,
+ const int out_pad_bottom=0,
+ const int out_pad_right=0
+ );
+};
+
+
+template <int OTR, int OTC, int KR, int KC, int SR, int SC>
+template <
+ bool Specialize,
+ int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
+ int OutPadBottom, int OutPadRight
+>
+void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float16_t, float16_t>::process_tile(
+ const int n_channels,
+ const float16_t *__restrict__ const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const float16_t *__restrict__ const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ float16_t *__restrict__ const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+ const int _in_pad_top,
+ const int _in_pad_left,
+ const int _in_pad_bottom,
+ const int _in_pad_right,
+ const int _out_pad_bottom,
+ const int _out_pad_right
+)
+{
+ constexpr auto inner_tile_rows = DWC::inner_tile_rows;
+ constexpr auto inner_tile_cols = DWC::inner_tile_cols;
+ constexpr auto kernel_rows = DWC::kernel_rows;
+ constexpr auto kernel_cols = DWC::kernel_cols;
+ constexpr auto output_tile_rows = DWC::output_tile_rows;
+ constexpr auto output_tile_cols = DWC::output_tile_cols;
+ constexpr auto stride_rows = DWC::stride_rows;
+ constexpr auto stride_cols = DWC::stride_cols;
+
+ // Extract parameters
+ const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
+ const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
+ const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
+ const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
+ const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
+ const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
+
+ // Compute valid ranges of the tile
+ const int in_cells_i = inner_tile_rows - in_pad_bottom;
+ const int in_cells_j = inner_tile_cols - in_pad_right;
+ const int out_cells_i = output_tile_rows - out_pad_bottom;
+ const int out_cells_j = output_tile_cols - out_pad_right;
+
+ // Instantiate pointers
+ const float16_t* __restrict__ inptr_base = inptr;
+ const float16_t* __restrict__ wptr_base = weights;
+ float16_t* __restrict__ outptr_base = outptr;
+
+ // Perform the depthwise convolution
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 8; channels_remaining -= 8)
+ {
+ // Load input tile
+ float16x8_t u[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ const float16_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ if (i < in_pad_top || in_cells_i <= i ||
+ j < in_pad_left || in_cells_j <= j)
+ {
+ u[i][j] = vdupq_n_f16(0.0f);
+ }
+ else
+ {
+ u[i][j] = vld1q_f16(inptr_row + (j - in_pad_left)*in_col_stride);
+ }
+ }
+ }
+ inptr_base += 8;
+
+ // Load weights tile
+ float16x8_t w[kernel_rows][kernel_cols];
+ for (int i = 0; i < kernel_rows; i++)
+ {
+ const float16_t* const wptr_row = wptr_base + i*weight_row_stride;
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[i][j] = vld1q_f16(wptr_row + j*weight_col_stride);
+ }
+ }
+ wptr_base += 8;
+
+ // Perform the convolution
+ float16x8_t v[output_tile_rows][output_tile_cols];
+ for (int out_i = 0; out_i < out_cells_i; out_i++)
+ {
+ for (int out_j = 0; out_j < out_cells_j; out_j++)
+ {
+ // Base co-ordinate
+ const int base_i = out_i * stride_rows;
+ const int base_j = out_j * stride_cols;
+
+ // Fill the accumulator
+ for (int in_i = 0; in_i < kernel_rows; in_i++)
+ {
+ const int i = base_i + in_i;
+ for (int in_j = 0; in_j < kernel_cols; in_j++)
+ {
+ const int j = base_j + in_j;
+ if (in_i == 0 && in_j == 0)
+ {
+ // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
+ v[out_i][out_j] = vmulq_f16(w[in_i][in_j], u[i][j]);
+ }
+ else
+ {
+ // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
+ }
+ }
+ }
+ }
+ }
+
+ // Store the output tile
+ for (int i = 0; i < out_cells_i; i++)
+ {
+ float16_t* const outptr_row = outptr_base + i*out_row_stride;
+ for (int j = 0; j < out_cells_j; j++)
+ {
+ vst1q_f16(outptr_row + j*out_col_stride, v[i][j]);
+ }
+ }
+ outptr_base += 8;
+ }
+#endif // __aarch64__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load input tile
+ float16_t u[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ const float16_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ if (i < in_pad_top || in_cells_i <= i ||
+ j < in_pad_left || in_cells_j <= j)
+ {
+ u[i][j] = static_cast<float16_t>(0);
+ }
+ else
+ {
+ u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
+ }
+ }
+ }
+ inptr_base++;
+
+ // Load weights tile
+ float16_t w[kernel_rows][kernel_cols];
+ for (int i = 0; i < kernel_rows; i++)
+ {
+ const float16_t* const wptr_row = wptr_base + i*weight_row_stride;
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[i][j] = *(wptr_row + j*weight_col_stride);
+ }
+ }
+ wptr_base++;
+
+ // Perform the convolution
+ float16_t v[output_tile_rows][output_tile_cols];
+ for (int out_i = 0; out_i < out_cells_i; out_i++)
+ {
+ for (int out_j = 0; out_j < out_cells_j; out_j++)
+ {
+ // Clear the accumulator
+ v[out_i][out_j] = static_cast<float16_t>(0);
+
+ // Base co-ordinate
+ const int base_i = out_i * stride_rows;
+ const int base_j = out_j * stride_cols;
+
+ // Fill the accumulator
+ for (int in_i = 0; in_i < kernel_rows; in_i++)
+ {
+ const int i = base_i + in_i;
+ for (int in_j = 0; in_j < kernel_cols; in_j++)
+ {
+ const int j = base_j + in_j;
+ v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ }
+ }
+ }
+ }
+
+ // Store the output tile
+ for (int i = 0; i < out_cells_i; i++)
+ {
+ float16_t* const outptr_row = outptr_base + i*out_row_stride;
+ for (int j = 0; j < out_cells_j; j++)
+ {
+ *(outptr_row + j*out_col_stride) = v[i][j];
+ }
+ }
+ outptr_base++;
+ }
+}
+} // namespace depthwise
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
new file mode 100644
index 0000000000..7a216ed518
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ *
+ * NOTE: Header to be included by implementation files only.
+ *
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+
+#pragma once
+
+namespace depthwise
+{
+// Partial specialisation for FP32 to FP32
+template <int OutputTileRows, int OutputTileCols,
+ int KernelRows, int KernelCols,
+ int StrideRows, int StrideCols>
+struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float, float>
+{
+ typedef DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ float, float
+ > DWC;
+
+ template <
+ bool Specialize=false, // Specialize (or not) the method
+ int InPadTop=0, // If specialized, top padding
+ int InPadLeft=0, // If specialized, left padding
+ int InPadBottom=0, // If specialized, bottom padding
+ int InPadRight=0, // If specialized, right padding
+ int OutPadBottom=0, // If specialized, bottom output padding
+ int OutPadRight=0 // If specialized, bottom right padding
+ >
+ static void process_tile(
+ const int n_channels,
+ const float* const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const float* const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ float* const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+ const int in_pad_top=0,
+ const int in_pad_left=0,
+ const int in_pad_bottom=0,
+ const int in_pad_right=0,
+ const int out_pad_bottom=0,
+ const int out_pad_right=0
+ );
+};
+
+
+template <int OTR, int OTC, int KR, int KC, int SR, int SC>
+template <
+ bool Specialize,
+ int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
+ int OutPadBottom, int OutPadRight
+>
+void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float, float>::process_tile(
+ const int n_channels,
+ const float *__restrict__ const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const float *__restrict__ const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ float *__restrict__ const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+ const int _in_pad_top,
+ const int _in_pad_left,
+ const int _in_pad_bottom,
+ const int _in_pad_right,
+ const int _out_pad_bottom,
+ const int _out_pad_right
+)
+{
+ constexpr auto inner_tile_rows = DWC::inner_tile_rows;
+ constexpr auto inner_tile_cols = DWC::inner_tile_cols;
+ constexpr auto kernel_rows = DWC::kernel_rows;
+ constexpr auto kernel_cols = DWC::kernel_cols;
+ constexpr auto output_tile_rows = DWC::output_tile_rows;
+ constexpr auto output_tile_cols = DWC::output_tile_cols;
+ constexpr auto stride_rows = DWC::stride_rows;
+ constexpr auto stride_cols = DWC::stride_cols;
+
+ // Extract parameters
+ const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
+ const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
+ const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
+ const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
+ const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
+ const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
+
+ // Compute valid ranges of the tile
+ const int in_cells_i = inner_tile_rows - in_pad_bottom;
+ const int in_cells_j = inner_tile_cols - in_pad_right;
+ const int out_cells_i = output_tile_rows - out_pad_bottom;
+ const int out_cells_j = output_tile_cols - out_pad_right;
+
+ // Instantiate pointers
+ const float* __restrict__ inptr_base = inptr;
+ const float* __restrict__ wptr_base = weights;
+ float* __restrict__ outptr_base = outptr;
+
+ // Perform the depthwise convolution
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ for (; channels_remaining >= 4; channels_remaining -= 4)
+ {
+ // Load input tile
+ float32x4_t u[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ if (i < in_pad_top || in_cells_i <= i ||
+ j < in_pad_left || in_cells_j <= j)
+ {
+ u[i][j] = vdupq_n_f32(0.0f);
+ }
+ else
+ {
+ u[i][j] = vld1q_f32(inptr_row + (j - in_pad_left)*in_col_stride);
+ }
+ }
+ }
+ inptr_base += 4;
+
+ // Load weights tile
+ float32x4_t w[kernel_rows][kernel_cols];
+ for (int i = 0; i < kernel_rows; i++)
+ {
+ const float* const wptr_row = wptr_base + i*weight_row_stride;
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[i][j] = vld1q_f32(wptr_row + j*weight_col_stride);
+ }
+ }
+ wptr_base += 4;
+
+ // Perform the convolution
+ float32x4_t v[output_tile_rows][output_tile_cols];
+ for (int out_i = 0; out_i < out_cells_i; out_i++)
+ {
+ for (int out_j = 0; out_j < out_cells_j; out_j++)
+ {
+ // Base co-ordinate
+ const int base_i = out_i * stride_rows;
+ const int base_j = out_j * stride_cols;
+
+ // Fill the accumulator
+ for (int in_i = 0; in_i < kernel_rows; in_i++)
+ {
+ const int i = base_i + in_i;
+ for (int in_j = 0; in_j < kernel_cols; in_j++)
+ {
+ const int j = base_j + in_j;
+ if (in_i == 0 && in_j == 0)
+ {
+ // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
+ v[out_i][out_j] = vmulq_f32(w[in_i][in_j], u[i][j]);
+ }
+ else
+ {
+ // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
+ }
+ }
+ }
+ }
+ }
+
+ // Store the output tile
+ for (int i = 0; i < out_cells_i; i++)
+ {
+ float* const outptr_row = outptr_base + i*out_row_stride;
+ for (int j = 0; j < out_cells_j; j++)
+ {
+ vst1q_f32(outptr_row + j*out_col_stride, v[i][j]);
+ }
+ }
+ outptr_base += 4;
+ }
+#endif // __aarch64__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load input tile
+ float u[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ if (i < in_pad_top || in_cells_i <= i ||
+ j < in_pad_left || in_cells_j <= j)
+ {
+ u[i][j] = static_cast<float>(0);
+ }
+ else
+ {
+ u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
+ }
+ }
+ }
+ inptr_base++;
+
+ // Load weights tile
+ float w[kernel_rows][kernel_cols];
+ for (int i = 0; i < kernel_rows; i++)
+ {
+ const float* const wptr_row = wptr_base + i*weight_row_stride;
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[i][j] = *(wptr_row + j*weight_col_stride);
+ }
+ }
+ wptr_base++;
+
+ // Perform the convolution
+ float v[output_tile_rows][output_tile_cols];
+ for (int out_i = 0; out_i < out_cells_i; out_i++)
+ {
+ for (int out_j = 0; out_j < out_cells_j; out_j++)
+ {
+ // Clear the accumulator
+ v[out_i][out_j] = static_cast<float>(0);
+
+ // Base co-ordinate
+ const int base_i = out_i * stride_rows;
+ const int base_j = out_j * stride_cols;
+
+ // Fill the accumulator
+ for (int in_i = 0; in_i < kernel_rows; in_i++)
+ {
+ const int i = base_i + in_i;
+ for (int in_j = 0; in_j < kernel_cols; in_j++)
+ {
+ const int j = base_j + in_j;
+ v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ }
+ }
+ }
+ }
+
+ // Store the output tile
+ for (int i = 0; i < out_cells_i; i++)
+ {
+ float* const outptr_row = outptr_base + i*out_row_stride;
+ for (int j = 0; j < out_cells_j; j++)
+ {
+ *(outptr_row + j*out_col_stride) = v[i][j];
+ }
+ }
+ outptr_base++;
+ }
+}
+
+} // namespace depthwise