From a4bba9c594c4022c9f85192bb8fd3593ad1a8d3c Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Tue, 2 Apr 2019 15:27:52 +0100
Subject: COMPMID-1995: Fix 32-bit NEDepthwiseConvolution errors.

-Updates padding handling in assembly depthwise kernels.
-Fixes 32-bit run issues for depthwise convolution.

Change-Id: I3fe6369397c1d13f5629dd34c068ce4af53c95cd
Signed-off-by: Georgios Pinitas
Reviewed-on: https://review.mlplatform.org/c/939
Reviewed-by: Giuseppe Rossini
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 .../convolution/depthwise/impl_fp32_fp32.hpp | 169 ++++++++++++++++++++-
 1 file changed, 167 insertions(+), 2 deletions(-)

diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
index 10d110feb8..264576137c 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
@@ -92,7 +92,6 @@ void DepthwiseConvolution<
 
   // Perform the depthwise convolution
   int channels_remaining = n_channels;
-#ifdef __aarch64__
   for (; channels_remaining >= 4; channels_remaining -= 4)
   {
     // Load input tile
@@ -170,7 +169,6 @@ void DepthwiseConvolution<
     }
     outptr_base += 4;
   }
-#endif  // __aarch64__
   for (; channels_remaining; channels_remaining--)
   {
     // Load input tile
@@ -246,4 +244,171 @@ void DepthwiseConvolution<
   }
 }
+
+template <
+  unsigned int OutputTileRows, unsigned int OutputTileCols,
+  unsigned int KernelRows, unsigned int KernelCols,
+  unsigned int StrideRows, unsigned int StrideCols
+>
+template <ActivationFunction Activation>
+void DepthwiseConvolution<
+  OutputTileRows, OutputTileCols,
+  KernelRows, KernelCols, StrideRows, StrideCols,
+  float, float, float
+>::execute_tile(
+  int n_channels,
+  const void *weights_biases_ptr,
+  const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
+  float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
+)
+{
+  const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);
+
+  // Perform the depthwise convolution
+  int channels_remaining = n_channels;
+  int n = 0;
+  for (; channels_remaining >= 4; channels_remaining -= 4, n += 4)
+  {
+    // Load input tile
+    float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (int i = 0; i < Base::inner_tile_rows; i++)
+    {
+      for (int j = 0; j < Base::inner_tile_cols; j++)
+      {
+        u[i][j] = vld1q_f32(inptrs[i][j] + n);
+      }
+    }
+
+    // Load weights tile
+    float32x4_t vbias = vld1q_f32(params);
+    params += 4;
+
+    float32x4_t w[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
+    {
+      for (unsigned int j = 0; j < KernelCols; j++)
+      {
+        w[i][j] = vld1q_f32(params);
+        params += 4;
+      }
+    }
+
+    // Perform the convolution
+    float32x4_t v[OutputTileRows][OutputTileCols];
+    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
+    {
+      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
+      {
+        v[out_i][out_j] = vbias;
+
+        // Base co-ordinate
+        const int base_i = out_i * StrideRows;
+        const int base_j = out_j * StrideCols;
+
+        // Fill the accumulator
+        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
+        {
+          const unsigned int i = base_i + in_i;
+          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
+          {
+            const unsigned int j = base_j + in_j;
+
+            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+            v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
+          }
+        }
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
+        }
+      }
+    }
+
+    // Store the output tile
+    for (unsigned int i = 0; i < OutputTileRows; i++)
+    {
+      for (unsigned int j = 0; j < OutputTileCols; j++)
+      {
+        vst1q_f32(outptrs[i][j] + n, v[i][j]);
+      }
+    }
+  }
+  for (; channels_remaining; channels_remaining--, n++)
+  {
+    // Load input tile
+    float u[Base::inner_tile_rows][Base::inner_tile_cols];
+    for (int i = 0; i < Base::inner_tile_rows; i++)
+    {
+      for (int j = 0; j < Base::inner_tile_cols; j++)
+      {
+        u[i][j] = *(inptrs[i][j] + n);
+      }
+    }
+
+    // Load weights tile
+    float bias = *(params++);
+    float w[KernelRows][KernelCols];
+    for (unsigned int i = 0; i < KernelRows; i++)
+    {
+      for (unsigned int j = 0; j < KernelCols; j++)
+      {
+        w[i][j] = *(params++);
+      }
+    }
+
+    // Perform the convolution
+    float v[OutputTileRows][OutputTileCols];
+    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
+    {
+      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
+      {
+        // Clear the accumulator
+        v[out_i][out_j] = bias;
+
+        // Base co-ordinate
+        const int base_i = out_i * StrideRows;
+        const int base_j = out_j * StrideCols;
+
+        // Fill the accumulator
+        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
+        {
+          const unsigned int i = base_i + in_i;
+          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
+          {
+            const int j = base_j + in_j;
+            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+          }
+        }
+
+        // Apply the activation function
+        if (Activation == ActivationFunction::ReLU ||
+            Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
+        }
+        if (Activation == ActivationFunction::ReLU6)
+        {
+          v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
+        }
+      }
+    }
+
+    // Store the output tile
+    for (unsigned int i = 0; i < OutputTileRows; i++)
+    {
+      for (unsigned int j = 0; j < OutputTileCols; j++)
+      {
+        *(outptrs[i][j] + n) = v[i][j];
+      }
+    }
+  }
+}
+
 } // namespace depthwise
-- 
cgit v1.2.1
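
The pattern behind the new execute_tile overload is worth isolating: a main loop that consumes four channels per iteration in NEON float32x4_t registers, followed by a scalar tail for the remaining channels. None of the intrinsics involved (vld1q_f32, vmlaq_f32, vmaxq_f32, vminq_f32, vdupq_n_f32, vst1q_f32) is AArch64-only, which is why the #ifdef __aarch64__ guards around the existing four-channel loop could be dropped for 32-bit builds. Below is a minimal standalone sketch of the same structure, reduced to one multiply-accumulate across channels; the function and parameter names are illustrative only, not part of this patch or of the Compute Library API.

    #include <arm_neon.h>

    // Illustrative sketch: same vector-main-loop / scalar-tail shape as
    // execute_tile above, for a single per-channel update v += w * u.
    void madd_channels(const float *u, const float *w, float *v, int n_channels)
    {
      int n = 0;
      // Main loop: four channels per iteration in one NEON register.
      for (; n_channels >= 4; n_channels -= 4, n += 4)
      {
        const float32x4_t uu = vld1q_f32(u + n);
        const float32x4_t ww = vld1q_f32(w + n);
        float32x4_t vv = vld1q_f32(v + n);
        vv = vmlaq_f32(vv, ww, uu); // vv += ww * uu, four lanes at once
        vst1q_f32(v + n, vv);
      }
      // Scalar tail: the remaining (n_channels % 4) channels.
      for (; n_channels; n_channels--, n++)
      {
        v[n] += w[n] * u[n];
      }
    }

Note also that, unlike the existing kernel, which walks a base pointer with computed strides, the new overload receives per-element pointer arrays (inptrs[i][j], outptrs[i][j]). A caller can aim tile positions that fall in the padded border at a separate zero-filled buffer, so the inner loops never branch on padding; this appears to be what the commit message's "padding handling" item refers to.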