about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h           5
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp  1
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp                16
-rw-r--r--  src/core/NEON/kernels/NEFillBorderKernel.cpp                            4
-rw-r--r--  src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp                 72
5 files changed, 54 insertions, 44 deletions
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index 6d8ce1f6f4..09a54968bb 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -62,7 +62,7 @@ public:
* Supported sizes: 1x1, 3x3 and 5x5.
* The 3rd dimension must be the same as the input's volume 3rd dimension.
* Data type supported: Same as @p input.
- * @param[in] bias Set of biases. Data type supported: Same as @p input.
+ * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p input.
* @param[out] output Output tensor.
* The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
@@ -80,7 +80,7 @@ public:
* Supported sizes: 1x1, 3x3 and 5x5.
* The 3rd dimension must be the same as the input's volume 3rd dimension.
* Data type supported: Same as @p input.
- * @param[in] bias Set of biases. Data type supported: Same as @p input.
+ * @param[in] bias Set of biases. Can be nullptr. Data type supported: Same as @p input.
* @param[in] output Output tensor.
* The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
@@ -98,6 +98,7 @@ private:
NEDirectConvolutionLayerKernel _conv_kernel;
NEFillBorderKernel _input_border_handler;
Tensor _accumulator;
+ bool _has_bias;
};
}
#endif /* __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H__ */
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
index a6585ade12..65b7087d7e 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
@@ -244,6 +244,7 @@ void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, con
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, bias);
+ // Auto-initialize output if required
if(output != nullptr)
{
// Output tensor auto initialization if not yet initialized
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 1ca213b04a..2ba0ef2e69 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -1048,7 +1048,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
}
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row,
- unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration)
+ unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
{
// Calculate right and bottom border
unsigned int kernel_size = weights->dimension(0);
@@ -1056,7 +1056,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
- BorderSize border_size = BorderSize(conv_pad_y, conv_pad_x);
const int input_width = input->dimension(0);
const int input_height = input->dimension(1);
@@ -1182,7 +1181,7 @@ void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITens
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, _num_weight_elems_read_per_row,
- _num_elems_read_per_iteration, _num_elems_written_per_iteration);
+ _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size);
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
}
@@ -1192,9 +1191,16 @@ Status NEDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const
unsigned int num_weight_elems_read_per_row = 0;
unsigned int num_elems_read_per_iteration = 0;
unsigned int num_elems_written_per_iteration = 0;
+ BorderSize border_size(conv_info.pad().second, conv_info.pad().first);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, num_weight_elems_read_per_row, num_elems_read_per_iteration,
- num_elems_written_per_iteration)
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ weights->clone().get(),
+ output->clone().get(),
+ conv_info,
+ num_weight_elems_read_per_row,
+ num_elems_read_per_iteration,
+ num_elems_written_per_iteration,
+ border_size)
.first);
return Status{};
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index c66e057f23..af04955608 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -47,8 +47,8 @@ inline void fill_constant_value_single_channel_special<float, 1u, 1u>(ITensor *t
float border_value;
constant_border_value.get(border_value);
uint8_t *const start_valid_region = tensor->ptr_to_element(tensor->info()->valid_region().anchor);
- const size_t &width = tensor->info()->valid_region().shape[0];
- const size_t &height = tensor->info()->valid_region().shape[1];
+ const size_t width = tensor->info()->valid_region().shape[0];
+ const size_t height = tensor->info()->valid_region().shape[1];
const int stridey = tensor->info()->strides_in_bytes()[1];
// Left and right border
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 2eabe459a5..ef5d987832 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -34,7 +34,7 @@
using namespace arm_compute;
NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator()
+ : _memory_group(std::move(memory_manager)), _accumulate_bias_kernel(), _conv_kernel(), _input_border_handler(), _accumulator(), _has_bias(false)
{
}
@@ -46,38 +46,29 @@ void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights,
_accumulator.allocator()->free();
}
+ // Check if bias should be added in the convolution result
+ _has_bias = (bias != nullptr);
+
// Allocate the intermediate accumulator tensor in case of fixed point input
- switch(output->info()->data_type())
+ if(is_data_type_fixed_point(input->info()->data_type()))
{
- case DataType::QS8:
- {
- _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS16, output->info()->fixed_point_position()));
- _memory_group.manage(&_accumulator);
- _conv_kernel.configure(input, weights, &_accumulator, conv_info);
- _accumulate_bias_kernel.configure(&_accumulator, bias, output);
- _accumulator.allocator()->allocate();
- break;
- }
- case DataType::QS16:
+ const DataType promoted_dt = (input->info()->data_type() == DataType::QS8) ? DataType::QS16 : DataType::QS32;
+ _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, promoted_dt, output->info()->fixed_point_position()));
+ _memory_group.manage(&_accumulator);
+ _conv_kernel.configure(input, weights, &_accumulator, conv_info);
+ // TODO (COMPMID-746): Fix accumulate biases to just down-cast when no bias is provided
+ if(_has_bias)
{
- _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::QS32, output->info()->fixed_point_position()));
- _memory_group.manage(&_accumulator);
- _conv_kernel.configure(input, weights, &_accumulator, conv_info);
_accumulate_bias_kernel.configure(&_accumulator, bias, output);
- _accumulator.allocator()->allocate();
- break;
}
- case DataType::F16:
- case DataType::F32:
+ _accumulator.allocator()->allocate();
+ }
+ else
+ {
+ _conv_kernel.configure(input, weights, output, conv_info);
+ if(_has_bias)
{
- _conv_kernel.configure(input, weights, output, conv_info);
_accumulate_bias_kernel.configure(output, bias);
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Data type not supported");
- break;
}
}
@@ -87,7 +78,7 @@ void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights,
Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, bias, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
DataType data_type = output->data_type();
if(is_data_type_fixed_point(data_type))
@@ -97,14 +88,22 @@ Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITenso
}
TensorInfo accumulator(output->clone()->set_is_resizable(true).reset_padding().set_data_type(data_type));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
- "Biases size and number of input feature maps should match");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1,
- "Biases should be one dimensional");
-
+ // Validate Convolution kernel
ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerKernel::validate(input, weights, &accumulator, conv_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerBiasAccumulateKernel::validate(&accumulator, bias, output));
+
+ // Validate bias
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias == nullptr) && is_data_type_fixed_point(data_type),
+ "Biases should be provided for fixed point inputs");
+ if(bias != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
+ "Biases size and number of input feature maps should match");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->num_dimensions() > 1, "Biases should be one dimensional");
+
+ // Validate bias kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerBiasAccumulateKernel::validate(&accumulator, bias, output));
+ }
return Status{};
}
@@ -116,7 +115,10 @@ void NEDirectConvolutionLayer::run()
_memory_group.acquire();
NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
- NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+ if(_has_bias)
+ {
+ NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
+ }
_memory_group.release();
}