author    Georgios Pinitas <georgios.pinitas@arm.com>    2018-09-12 20:11:34 +0100
committer Anthony Barbier <anthony.barbier@arm.com>      2018-11-02 16:54:54 +0000
commit    a799ce0ad775829862891dd98d1232638ec8761e (patch)
tree      4b7bb9b080a44aa5cfff67b2ce7177929b22405f
parent    d63dfa2fc61a33b4e675ec6bc7458d8700174134 (diff)
download  ComputeLibrary-a799ce0ad775829862891dd98d1232638ec8761e.tar.gz
COMPMID-1564: Add NEDepthwiseConvolution3x3 for QASYMM8
Change-Id: I1f55508af6f220e5f41df7b56daffb4761ed0591
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/148253
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Isabella Gottardi <isabella.gottardi@arm.com>
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp                   17
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp                   26
-rw-r--r--  examples/graph_mobilenet.cpp                                                         33
-rw-r--r--  src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp                       37
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp                 104
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp       2
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp       2
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp       2
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp       2
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp        128
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp        166
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp                        8
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp                        8
-rw-r--r--  src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp                         315
-rw-r--r--  src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp                           40
-rw-r--r--  tests/datasets/DepthwiseConvolutionLayerDataset.h                                      2
-rw-r--r--  tests/validation/NEON/DepthwiseConvolutionLayer.cpp                                   10
-rw-r--r--  utils/Utils.h                                                                          4
18 files changed, 846 insertions, 60 deletions
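
[Editorial sketch, not part of the commit] The heart of the patch is the new u8->s32 tile implementation (impl_u8_s32.hpp, instantiated for stride 1 and stride 2 in the two new depthwise_4x4_3x3_*_u8_s32.cpp files): inputs and weights are widened to 32 bits, the offsets passed in through the new set_offsets() hook are added, and the products are accumulated into an S32 tensor that the output stage later requantizes to QASYMM8. A minimal scalar sketch of that accumulation scheme, using plain arrays instead of the kernel's strided pointers:

#include <cstdint>

// Sketch: offset-corrected accumulation for one output element of a 3x3
// depthwise convolution, per channel, as done lane-wise by the u8->s32 tiles.
inline int32_t depthwise_acc_3x3(const uint8_t in[3][3], const uint8_t w[3][3],
                                 int32_t input_offset, int32_t weights_offset)
{
    int32_t acc = 0;
    for (int i = 0; i < 3; ++i)
    {
        for (int j = 0; j < 3; ++j)
        {
            // Widen to s32 and apply the quantization offsets before multiplying
            acc += (static_cast<int32_t>(in[i][j]) + input_offset) *
                   (static_cast<int32_t>(w[i][j]) + weights_offset);
        }
    }
    return acc; // written to the intermediate S32 accumulator tensor
}

NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver() passes the negated quantization offsets to set_offsets(), so the additions above effectively compute (value - offset) for both operands, i.e. the real-valued convolution scaled by input_scale * weights_scale.
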
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
index 4ca68116db..472c44f97a 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
@@ -33,6 +33,7 @@ class IDepthwiseConvolution
virtual ~IDepthwiseConvolution() = default;
virtual int output_size(const int dim_size, const bool padding_same) const = 0;
virtual unsigned int get_window(void) const = 0;
+ virtual void set_offsets(int input_offset, int weights_offset) = 0;
virtual void run(const unsigned int start, const unsigned int stop) = 0;
};
@@ -179,6 +180,13 @@ class DepthwiseConvolution : public IDepthwiseConvolution
>::get_output_size(dim_size, padding_same);
}
+ /** Sets quantization offsets
+ *
+ * @param[in] input_offset Input offset
+ * @param[in] weights_offset Weights offset
+ */
+ void set_offsets(int input_offset, int weights_offset) override;
+
/** Get the window of work to be performed by an instance of the operator.
*/
unsigned int get_window(void) const override;
@@ -212,7 +220,9 @@ class DepthwiseConvolution : public IDepthwiseConvolution
const int row_pad_out_bottom,
const int n_tiles,
const int n_input_cols,
- const int n_output_cols
+ const int n_output_cols,
+ const int input_offset,
+ const int weights_offset
);
// Determine the maximum (and minimum) padding values which can be applied
@@ -272,7 +282,9 @@ class DepthwiseConvolution : public IDepthwiseConvolution
const int _in_pad_bottom,
const int _in_pad_right,
const int _out_pad_bottom,
- const int _out_pad_right
+ const int _out_pad_right,
+ const int _input_offset,
+ const int _weights_offset
);
/* Arrays of methods to process tensor tiles.
@@ -300,6 +312,7 @@ class DepthwiseConvolution : public IDepthwiseConvolution
const int _weight_col_stride, _weight_row_stride;
const int _input_col_stride, _input_row_stride, _input_batch_stride;
const int _output_col_stride, _output_row_stride, _output_batch_stride;
+ int _input_offset, _weights_offset;
};
} // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
index 17889849db..e262817a3c 100644
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp
@@ -82,7 +82,8 @@ DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::DepthwiseConvolution(
_input_batch_stride(input_batch_stride ? input_batch_stride : _n_input_rows * _input_row_stride),
_output_col_stride(output_col_stride ? output_col_stride : _n_channels),
_output_row_stride(output_row_stride ? output_row_stride : _n_output_cols * _output_col_stride),
- _output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride)
+ _output_batch_stride(output_batch_stride ? output_batch_stride : _n_output_rows * _output_row_stride),
+ _input_offset(0), _weights_offset(0)
{
}
@@ -94,6 +95,12 @@ unsigned int DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::get_wind
return iceildiv(_n_channels, CHANNEL_BLOCK);
}
+template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
+void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::set_offsets(int input_offset, int weights_offset)
+{
+ _input_offset = input_offset;
+ _weights_offset = weights_offset;
+}
template <int OTR, int OTC, int KR, int KC, int SR, int SC, typename TIn, typename TOut>
void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run(
@@ -145,7 +152,8 @@ void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::run(
outptr_row + start_channel, _output_row_stride, _output_col_stride,
input_row_pad_top, input_pad_left, input_row_pad_bottom,
output_row_pad_bottom,
- _n_tile_cols, _n_input_cols, _n_output_cols
+ _n_tile_cols, _n_input_cols, _n_output_cols,
+ _input_offset, _weights_offset
);
}
}
@@ -170,7 +178,9 @@ void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row
const int row_pad_out_bottom,
const int n_tiles,
const int n_input_cols,
- const int n_output_cols
+ const int n_output_cols,
+ const int input_offset,
+ const int weights_offset
)
{
constexpr int tile_overlap = kernel_cols - stride_cols;
@@ -242,7 +252,7 @@ void DepthwiseConvolution<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile_row
inptr_col, in_row_stride, in_col_stride,
outptr_col, out_row_stride, out_col_stride,
row_pad_in_top, t_pad_in_left, row_pad_in_bottom, t_pad_in_right,
- row_pad_out_bottom, t_pad_out_right
+ row_pad_out_bottom, t_pad_out_right, input_offset, weights_offset
);
}
}
@@ -313,7 +323,9 @@ struct DepthwiseConvolutionImpl : public DepthwiseConvolution<
const int in_pad_bottom=0,
const int in_pad_right=0,
const int out_pad_bottom=0,
- const int out_pad_right=0
+ const int out_pad_right=0,
+ const int input_offset=0,
+ const int weights_offset=0
);
};
@@ -340,7 +352,9 @@ void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, TIn, TOut>::process_tile
const int _in_pad_bottom,
const int _in_pad_right,
const int _out_pad_bottom,
- const int _out_pad_right
+ const int _out_pad_right,
+ const int _input_offset,
+ const int _weights_offset
)
{
constexpr auto inner_tile_rows = DWC::inner_tile_rows;
diff --git a/examples/graph_mobilenet.cpp b/examples/graph_mobilenet.cpp
index 35ab224700..ab6a4a842f 100644
--- a/examples/graph_mobilenet.cpp
+++ b/examples/graph_mobilenet.cpp
@@ -183,6 +183,12 @@ private:
// Get trainable parameters data path
std::string data_path = common_params.data_path;
+ // Add model path to data path
+ if(!data_path.empty())
+ {
+ data_path += "/cnn_data/mobilenet_qasymm8_model/";
+ }
+
// Quantization info taken from the AndroidNN QASYMM8 MobileNet example
const QuantizationInfo in_quant_info = QuantizationInfo(0.0078125f, 128);
const QuantizationInfo mid_quant_info = QuantizationInfo(0.0784313753247f, 128);
@@ -228,14 +234,15 @@ private:
};
graph << InputLayer(input_descriptor.set_quantization_info(in_quant_info),
- get_weights_accessor(data_path, "/cnn_data/mobilenet_qasymm8_model/" + common_params.image))
+ get_weights_accessor(data_path, common_params.image))
<< ConvolutionLayer(
3U, 3U, 32U,
- get_weights_accessor(data_path, "/cnn_data/mobilenet_qasymm8_model/Conv2d_0_weights.npy"),
- get_weights_accessor(data_path, "/cnn_data/mobilenet_qasymm8_model/Conv2d_0_bias.npy"),
+ get_weights_accessor(data_path, "Conv2d_0_weights.npy"),
+ get_weights_accessor(data_path, "Conv2d_0_bias.npy"),
PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR),
1, conv_weights_quant_info.at(0), mid_quant_info)
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f));
+ .set_name("Conv2d_0")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name("Conv2d_0/Relu6");
graph << get_dwsc_node_qasymm(data_path, "Conv2d_1", 64U, PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(0), point_weights_quant_info.at(0));
graph << get_dwsc_node_qasymm(data_path, "Conv2d_2", 128U, PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(1),
point_weights_quant_info.at(1));
@@ -261,12 +268,14 @@ private:
point_weights_quant_info.at(11));
graph << get_dwsc_node_qasymm(data_path, "Conv2d_13", 1024U, PadStrideInfo(1U, 1U, 1U, 1U, 1U, 1U, DimensionRoundingType::FLOOR), PadStrideInfo(1U, 1U, 0U, 0U), depth_weights_quant_info.at(12),
point_weights_quant_info.at(12))
- << PoolingLayer(PoolingLayerInfo(PoolingType::AVG))
+ << PoolingLayer(PoolingLayerInfo(PoolingType::AVG)).set_name("Logits/AvgPool_1a")
<< ConvolutionLayer(
1U, 1U, 1001U,
- get_weights_accessor(data_path, "/cnn_data/mobilenet_qasymm8_model/Logits_Conv2d_1c_1x1_weights.npy"),
- get_weights_accessor(data_path, "/cnn_data/mobilenet_qasymm8_model/Logits_Conv2d_1c_1x1_bias.npy"),
- PadStrideInfo(1U, 1U, 0U, 0U), 1, conv_weights_quant_info.at(1));
+ get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_weights.npy"),
+ get_weights_accessor(data_path, "Logits_Conv2d_1c_1x1_bias.npy"),
+ PadStrideInfo(1U, 1U, 0U, 0U), 1, conv_weights_quant_info.at(1))
+ .set_name("Logits/Conv2d_1c_1x1");
+ ;
}
ConcatLayer get_dwsc_node_float(const std::string &data_path, std::string &&param_path,
@@ -312,7 +321,7 @@ private:
PadStrideInfo dwc_pad_stride_info, PadStrideInfo conv_pad_stride_info,
QuantizationInfo depth_weights_quant_info, QuantizationInfo point_weights_quant_info)
{
- std::string total_path = "/cnn_data/mobilenet_qasymm8_model/" + param_path + "_";
+ std::string total_path = param_path + "_";
SubStream sg(graph);
sg << DepthwiseConvolutionLayer(
@@ -320,13 +329,15 @@ private:
get_weights_accessor(data_path, total_path + "depthwise_weights.npy"),
get_weights_accessor(data_path, total_path + "depthwise_bias.npy"),
dwc_pad_stride_info, depth_weights_quant_info)
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f))
+ .set_name(total_path + "depthwise/depthwise")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name(total_path + "depthwise/Relu6")
<< ConvolutionLayer(
1U, 1U, conv_filt,
get_weights_accessor(data_path, total_path + "pointwise_weights.npy"),
get_weights_accessor(data_path, total_path + "pointwise_bias.npy"),
conv_pad_stride_info, 1, point_weights_quant_info)
- << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f));
+ .set_name(total_path + "pointwise/Conv2D")
+ << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)).set_name(total_path + "pointwise/Relu6");
return ConcatLayer(std::move(sg));
}
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 7029b06615..99bdb7a70e 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -198,8 +198,10 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
output_shape.set(1, convolver->output_size(output_shape.y(), same_padding)); // Set width
output_shape.set(2, convolver->output_size(output_shape.z(), same_padding)); // Set height
+ const DataType output_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+
// Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+ auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
// Configure window (optimised)
// Set padding in channels
@@ -324,7 +326,7 @@ bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(Tenso
}
// Check supported data type
- bool supported_datatype = is_data_type_float(dt);
+ bool supported_datatype = is_data_type_float(dt) || is_data_type_quantized(dt);
// Check for supported strides
const auto &strides = conv_info.stride();
@@ -345,11 +347,15 @@ bool NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(Tenso
void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver()
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(_input, _weights);
ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
_convolver = create_convolver_object(_conv_info, _weights, _input, _output, true);
+ if(_convolver)
+ {
+ _convolver->set_offsets(-_input->info()->quantization_info().offset, -_weights->info()->quantization_info().offset);
+ }
}
void NEDepthwiseConvolutionLayer3x3Kernel::configure_generic()
@@ -433,6 +439,31 @@ std::unique_ptr<depthwise::IDepthwiseConvolution> NEDepthwiseConvolutionLayer3x3
const auto stride_x = conv_info.stride().first;
switch(dt)
{
+ case DataType::QASYMM8:
+ {
+ switch(stride_x)
+ {
+ case 1:
+ return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>>(
+ n_batches, in_rows, in_cols, n_channels, padding_same,
+ reinterpret_cast<const uint8_t *>(w->ptr_to_element(Coordinates())),
+ in->ptr_to_element(Coordinates()),
+ reinterpret_cast<int32_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+ weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+ output_col_stride, output_row_stride, output_batch_stride);
+ case 2:
+ return arm_compute::support::cpp14::make_unique<DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>>(
+ n_batches, in_rows, in_cols, n_channels, padding_same,
+ reinterpret_cast<const uint8_t *>(w->ptr_to_element(Coordinates())),
+ in->ptr_to_element(Coordinates()),
+ reinterpret_cast<int32_t *>(out->ptr_to_element(Coordinates())), weight_col_stride,
+ weight_row_stride, input_col_stride, input_row_stride, input_batch_stride,
+ output_col_stride, output_row_stride, output_batch_stride);
+ default:
+ return nullptr;
+ }
+ break;
+ }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 864c63f731..a571d54501 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -194,8 +194,8 @@ inline float16x8_t internal_vqaddq(const float16x8_t &x, const float16x8_t &y)
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
template <typename T1, typename T2, bool in_place, bool has_bias>
-void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
@@ -304,14 +304,14 @@ void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window
internal_vst1q(out_ptr, internal_vld1q(in_ptr));
}
},
- in, bi);
+ in, bi, out);
}
}
// QASYMM8 specializations
template <>
-void output_stage<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
uint8x16_t min = vdupq_n_u8(0);
@@ -352,8 +352,8 @@ void output_stage<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *
in, out);
}
template <>
-void output_stage<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+void output_stage_nchw<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
ARM_COMPUTE_UNUSED(bias);
@@ -382,6 +382,85 @@ void output_stage<int32_t, uint8_t, false, false>(ITensor *input, const ITensor
},
in, out);
}
+template <>
+void output_stage_nhwc<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+ uint8x16_t min = vdupq_n_u8(0);
+ uint8x16_t max = vdupq_n_u8(255);
+
+ Window window_bias = window;
+ window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ window_bias.set(3, Window::Dimension(0, 0, 0));
+
+ Iterator in(input, window);
+ Iterator bi(bias, window_bias);
+
+ Iterator out(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
+ const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr());
+
+ // Accumulate bias
+ int32x4x4_t v_in =
+ {
+ {
+ vaddq_s32(vld1q_s32(in_ptr), vld1q_s32(bias_ptr)),
+ vaddq_s32(vld1q_s32(in_ptr + 4), vld1q_s32(bias_ptr + 4)),
+ vaddq_s32(vld1q_s32(in_ptr + 8), vld1q_s32(bias_ptr + 8)),
+ vaddq_s32(vld1q_s32(in_ptr + 12), vld1q_s32(bias_ptr + 12))
+ }
+ };
+
+ const auto out_ptr = out.ptr();
+ vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+ },
+ in, bi, out);
+}
+template <>
+void output_stage_nhwc<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+ ARM_COMPUTE_UNUSED(bias);
+
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+ uint8x16_t min = vdupq_n_u8(0);
+ uint8x16_t max = vdupq_n_u8(255);
+
+ Window window_bias = window;
+ window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
+ window_bias.set(3, Window::Dimension(0, 0, 0));
+
+ Iterator in(input, window);
+ Iterator bi(bias, window_bias);
+
+ Iterator out(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
+
+ // Accumulate bias
+ int32x4x4_t v_in =
+ {
+ {
+ vld1q_s32(in_ptr),
+ vld1q_s32(in_ptr + 4),
+ vld1q_s32(in_ptr + 8),
+ vld1q_s32(in_ptr + 12)
+ }
+ };
+
+ const auto out_ptr = out.ptr();
+ vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+ },
+ in, bi, out);
+}
} // namespace
NEDirectConvolutionLayerOutputStageKernel::NEDirectConvolutionLayerOutputStageKernel()
@@ -426,19 +505,19 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
{
case DataType::S32:
{
- _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
+ _func = (bias == nullptr) ? &output_stage_nchw<int32_t, uint8_t, false, false> : &output_stage_nchw<int32_t, uint8_t, false, true>;
break;
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
- _func = (output == nullptr) ? &output_stage<float16_t, float16_t, true, true> : &output_stage<float16_t, float16_t, false, true>;
+ _func = (output == nullptr) ? &output_stage_nchw<float16_t, float16_t, true, true> : &output_stage_nchw<float16_t, float16_t, false, true>;
break;
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
case DataType::F32:
{
- _func = (output == nullptr) ? &output_stage<float, float, true, true> : &output_stage<float, float, false, true>;
+ _func = (output == nullptr) ? &output_stage_nchw<float, float, true, true> : &output_stage_nchw<float, float, false, true>;
break;
}
default:
@@ -451,6 +530,11 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
{
switch(input->info()->data_type())
{
+ case DataType::S32:
+ {
+ _func = (output == nullptr) ? &output_stage_nhwc<int32_t, uint8_t, false, false> : &output_stage_nhwc<int32_t, uint8_t, false, true>;
+ break;
+ }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
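
[Editorial sketch, not part of the commit] The two new output_stage_nhwc<int32_t, uint8_t, ...> specializations mirror the existing NCHW ones: they add the bias (when present) to the S32 accumulator and hand the result to finalize_quantization, which applies a fixed-point multiplier and shift and adds the output offset before clamping to [0, 255]. A scalar sketch of that requantization step; the rounding of the NEON helper differs in detail, and the multiplier is assumed to be the Q0.31 value produced by calculate_quantized_multiplier_less_than_one:

#include <algorithm>
#include <cstdint>

// Sketch: per-element requantization from the S32 accumulator to QASYMM8.
inline uint8_t requantize(int32_t acc, int32_t bias,
                          int32_t result_fixedpoint_multiplier, // ~(in_scale * w_scale / out_scale) in Q0.31
                          int32_t result_shift,
                          int32_t result_offset_after_shift)
{
    int64_t v = static_cast<int64_t>(acc) + bias;
    v = (v * result_fixedpoint_multiplier + (static_cast<int64_t>(1) << 30)) >> 31; // fixed-point multiply with rounding
    v >>= result_shift;              // scale down to the output grid (rounding simplified here)
    v += result_offset_after_shift;  // re-apply the output zero point
    return static_cast<uint8_t>(std::min<int64_t>(255, std::max<int64_t>(0, v)));
}
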
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
index c5a056560b..ca1de26ed7 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp
@@ -43,7 +43,7 @@ void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
float* const outptr,
const int out_row_stride,
const int out_col_stride,
- const int, const int, const int, const int, const int, const int
+ const int, const int, const int, const int, const int, const int, const int, const int
)
{
// Copy pointers
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
index 0c96bebc02..21e8f04774 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp
@@ -43,7 +43,7 @@ void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
float* const outptr,
const int out_row_stride,
const int out_col_stride,
- const int, const int, const int, const int, const int, const int
+ const int, const int, const int, const int, const int, const int, const int, const int
)
{
// Copy pointers
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
index 941c8e9248..c7113d05b3 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp
@@ -43,7 +43,7 @@ void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
float* const outptr,
const int out_row_stride,
const int out_col_stride,
- const int, const int, const int, const int, const int, const int
+ const int, const int, const int, const int, const int, const int, const int, const int
)
{
// Copy pointers
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
index 1cbd6d5623..c36c24ec0f 100644
--- a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp
@@ -43,7 +43,7 @@ void ConvImpl::process_tile<true, 0, 0, 0, 0, 0, 0>(
float* const outptr,
const int out_row_stride,
const int out_col_stride,
- const int, const int, const int, const int, const int, const int
+ const int, const int, const int, const int, const int, const int, const int, const int
)
{
constexpr auto inner_tile_rows = DWC::inner_tile_rows;
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp
new file mode 100644
index 0000000000..8f22a64ea6
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_u8_s32.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_u8_s32.hpp"
+
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+ ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+ ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+
+template class DepthwiseConvolution<4, 4, 3, 3, 1, 1, uint8_t, int32_t>;
+} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp
new file mode 100644
index 0000000000..cf515504c7
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_2x2_u8_s32.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl_u8_s32.hpp"
+
+namespace depthwise
+{
+using Conv = DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
+using ConvImpl = DepthwiseConvolutionImpl<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
+
+template <>
+const Conv::TileFn Conv::tilefn_unpadded = ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>;
+
+template <>
+const Conv::TileFn Conv::tilefn_top[n_in_pad_top_fns] = {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 1, 0, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_left[n_in_pad_left_fns] = {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 1, 0, 0, 0, 0>,
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_bottom[n_in_pad_bottom_fns][n_out_pad_bottom_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 1, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 2, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 3, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 4, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 5, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 6, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 7, 0, 3, 0>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 1, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 2, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 8, 0, 3, 0>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_right[n_in_pad_right_fns][n_out_pad_right_fns] = {
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 0, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 1, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 2, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 3, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 4, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 5, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 6, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 7, 0, 3>,
+ },
+ {
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 0>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 1>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 2>,
+ ConvImpl::template process_tile<true, 0, 0, 0, 8, 0, 3>,
+ },
+};
+
+template <>
+const Conv::TileFn Conv::tilefn_generic = ConvImpl::template process_tile<false>;
+
+template class DepthwiseConvolution<4, 4, 3, 3, 2, 2, uint8_t, int32_t>;
+} // namespace depthwise
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
index ed4cfb86b9..dacfb24c89 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp
@@ -75,7 +75,9 @@ struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, Kern
const int in_pad_bottom=0,
const int in_pad_right=0,
const int out_pad_bottom=0,
- const int out_pad_right=0
+ const int out_pad_right=0,
+ const int input_offset=0,
+ const int weights_offset=0
);
};
@@ -102,7 +104,9 @@ void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float16_t, float16_t>::p
const int _in_pad_bottom,
const int _in_pad_right,
const int _out_pad_bottom,
- const int _out_pad_right
+ const int _out_pad_right,
+ const int _input_offset,
+ const int _weights_offset
)
{
constexpr auto inner_tile_rows = DWC::inner_tile_rows;
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
index 7a216ed518..840086f917 100644
--- a/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp
@@ -75,7 +75,9 @@ struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, Kern
const int in_pad_bottom=0,
const int in_pad_right=0,
const int out_pad_bottom=0,
- const int out_pad_right=0
+ const int out_pad_right=0,
+ const int input_offset=0,
+ const int weights_offset=0
);
};
@@ -102,7 +104,9 @@ void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float, float>::process_t
const int _in_pad_bottom,
const int _in_pad_right,
const int _out_pad_bottom,
- const int _out_pad_right
+ const int _out_pad_right,
+ const int _input_offset,
+ const int _weights_offset
)
{
constexpr auto inner_tile_rows = DWC::inner_tile_rows;
diff --git a/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp b/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp
new file mode 100644
index 0000000000..d0d8de538d
--- /dev/null
+++ b/src/core/NEON/kernels/convolution/depthwise/impl_u8_s32.hpp
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ *
+ * NOTE: Header to be included by implementation files only.
+ *
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ */
+
+#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
+#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
+
+#pragma once
+
+namespace depthwise
+{
+// Partial specialisation for U8 to S32
+template <int OutputTileRows, int OutputTileCols,
+ int KernelRows, int KernelCols,
+ int StrideRows, int StrideCols>
+struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, uint8_t, int32_t>
+{
+ typedef DepthwiseConvolution<
+ OutputTileRows, OutputTileCols,
+ KernelRows, KernelCols,
+ StrideRows, StrideCols,
+ uint8_t, int32_t
+ > DWC;
+
+ template <
+ bool Specialize=false, // Specialize (or not) the method
+ int InPadTop=0, // If specialized, top padding
+ int InPadLeft=0, // If specialized, left padding
+ int InPadBottom=0, // If specialized, bottom padding
+ int InPadRight=0, // If specialized, right padding
+ int OutPadBottom=0, // If specialized, bottom output padding
+ int OutPadRight=0 // If specialized, bottom right padding
+ >
+ static void process_tile(
+ const int n_channels,
+ const uint8_t* const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const uint8_t* const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ int32_t* const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+ const int in_pad_top=0,
+ const int in_pad_left=0,
+ const int in_pad_bottom=0,
+ const int in_pad_right=0,
+ const int out_pad_bottom=0,
+ const int out_pad_right=0,
+ const int input_offset=0,
+ const int weights_offset=0);
+};
+
+
+template <int OTR, int OTC, int KR, int KC, int SR, int SC>
+template <
+ bool Specialize,
+ int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
+ int OutPadBottom, int OutPadRight
+>
+void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, uint8_t, int32_t>::process_tile(
+ const int n_channels,
+ const uint8_t *__restrict__ const weights,
+ const int weight_row_stride,
+ const int weight_col_stride,
+ const uint8_t *__restrict__ const inptr,
+ const int in_row_stride,
+ const int in_col_stride,
+ int32_t *__restrict__ const outptr,
+ const int out_row_stride,
+ const int out_col_stride,
+ const int _in_pad_top,
+ const int _in_pad_left,
+ const int _in_pad_bottom,
+ const int _in_pad_right,
+ const int _out_pad_bottom,
+ const int _out_pad_right,
+ const int _input_offset,
+ const int _weights_offset
+)
+{
+ constexpr auto inner_tile_rows = DWC::inner_tile_rows;
+ constexpr auto inner_tile_cols = DWC::inner_tile_cols;
+ constexpr auto kernel_rows = DWC::kernel_rows;
+ constexpr auto kernel_cols = DWC::kernel_cols;
+ constexpr auto output_tile_rows = DWC::output_tile_rows;
+ constexpr auto output_tile_cols = DWC::output_tile_cols;
+ constexpr auto stride_rows = DWC::stride_rows;
+ constexpr auto stride_cols = DWC::stride_cols;
+
+ // Extract parameters
+ const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
+ const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
+ const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
+ const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
+ const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
+ const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
+
+ // Compute valid ranges of the tile
+ const int in_cells_i = inner_tile_rows - in_pad_bottom;
+ const int in_cells_j = inner_tile_cols - in_pad_right;
+ const int out_cells_i = output_tile_rows - out_pad_bottom;
+ const int out_cells_j = output_tile_cols - out_pad_right;
+
+ // Instantiate pointers
+ const uint8_t* __restrict__ inptr_base = inptr;
+ const uint8_t* __restrict__ wptr_base = weights;
+ int32_t* __restrict__ outptr_base = outptr;
+
+ // Perform the depthwise convolution
+ int channels_remaining = n_channels;
+#ifdef __aarch64__
+ const int32x4_t v_input_offset = vdupq_n_s32(_input_offset);
+ const int32x4_t v_weights_offset = vdupq_n_s32(_weights_offset);
+ for (; channels_remaining >= 16; channels_remaining -= 16)
+ {
+ // Load input tile
+ int32x4x4_t u[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ const uint8_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ if (i < in_pad_top || in_cells_i <= i ||
+ j < in_pad_left || in_cells_j <= j)
+ {
+ u[i][j].val[0] = vdupq_n_s32(0);
+ u[i][j].val[1] = vdupq_n_s32(0);
+ u[i][j].val[2] = vdupq_n_s32(0);
+ u[i][j].val[3] = vdupq_n_s32(0);
+ }
+ else
+ {
+ const uint8x16_t uv = vld1q_u8(inptr_row + (j - in_pad_left)*in_col_stride);
+ u[i][j].val[0] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(uv)))));
+ u[i][j].val[1] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(uv)))));
+ u[i][j].val[2] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(uv)))));
+ u[i][j].val[3] = vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(uv)))));
+ }
+ }
+ }
+ inptr_base += 16;
+
+ // Load weights tile
+ int32x4x4_t w[kernel_rows][kernel_cols];
+ for (int i = 0; i < kernel_rows; i++)
+ {
+ const uint8_t* const wptr_row = wptr_base + i*weight_row_stride;
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ const uint8x16_t wv = vld1q_u8(wptr_row + j*weight_col_stride);
+ w[i][j].val[0] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(wv)))));
+ w[i][j].val[1] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(wv)))));
+ w[i][j].val[2] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(wv)))));
+ w[i][j].val[3] = vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(wv)))));
+ }
+ }
+ wptr_base += 16;
+
+ // Perform the convolution
+ int32x4x4_t v[output_tile_rows][output_tile_cols];
+ for (int out_i = 0; out_i < out_cells_i; out_i++)
+ {
+ for (int out_j = 0; out_j < out_cells_j; out_j++)
+ {
+ // Base co-ordinate
+ const int base_i = out_i * stride_rows;
+ const int base_j = out_j * stride_cols;
+
+ // Fill the accumulator
+ for (int in_i = 0; in_i < kernel_rows; in_i++)
+ {
+ const int i = base_i + in_i;
+ for (int in_j = 0; in_j < kernel_cols; in_j++)
+ {
+ const int j = base_j + in_j;
+ if (in_i == 0 && in_j == 0)
+ {
+ // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
+ v[out_i][out_j].val[0] = vmulq_s32(w[in_i][in_j].val[0], u[i][j].val[0]);
+ v[out_i][out_j].val[1] = vmulq_s32(w[in_i][in_j].val[1], u[i][j].val[1]);
+ v[out_i][out_j].val[2] = vmulq_s32(w[in_i][in_j].val[2], u[i][j].val[2]);
+ v[out_i][out_j].val[3] = vmulq_s32(w[in_i][in_j].val[3], u[i][j].val[3]);
+ }
+ else
+ {
+ // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ v[out_i][out_j].val[0] = vmlaq_s32(v[out_i][out_j].val[0], w[in_i][in_j].val[0], u[i][j].val[0]);
+ v[out_i][out_j].val[1] = vmlaq_s32(v[out_i][out_j].val[1], w[in_i][in_j].val[1], u[i][j].val[1]);
+ v[out_i][out_j].val[2] = vmlaq_s32(v[out_i][out_j].val[2], w[in_i][in_j].val[2], u[i][j].val[2]);
+ v[out_i][out_j].val[3] = vmlaq_s32(v[out_i][out_j].val[3], w[in_i][in_j].val[3], u[i][j].val[3]);
+ }
+ }
+ }
+ }
+ }
+
+ // Store the output tile
+ for (int i = 0; i < out_cells_i; i++)
+ {
+ int32_t* const outptr_row = outptr_base + i*out_row_stride;
+ for (int j = 0; j < out_cells_j; j++)
+ {
+ vst1q_s32(outptr_row + j*out_col_stride, v[i][j].val[0]);
+ vst1q_s32(outptr_row + j*out_col_stride + 4, v[i][j].val[1]);
+ vst1q_s32(outptr_row + j*out_col_stride + 8, v[i][j].val[2]);
+ vst1q_s32(outptr_row + j*out_col_stride + 12, v[i][j].val[3]);
+ }
+ }
+ outptr_base += 16;
+ }
+#endif // __aarch64__
+ for (; channels_remaining; channels_remaining--)
+ {
+ // Load input tile
+ int32_t u[inner_tile_rows][inner_tile_cols];
+ for (int i = 0; i < inner_tile_rows; i++)
+ {
+ const uint8_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
+ for (int j = 0; j < inner_tile_cols; j++)
+ {
+ if (i < in_pad_top || in_cells_i <= i ||
+ j < in_pad_left || in_cells_j <= j)
+ {
+ u[i][j] = static_cast<uint8_t>(0);
+ }
+ else
+ {
+ u[i][j] = static_cast<int32_t >(*(inptr_row + (j - in_pad_left)*in_col_stride)) + _input_offset;
+ }
+ }
+ }
+ inptr_base++;
+
+ // Load weights tile
+ int32_t w[kernel_rows][kernel_cols];
+ for (int i = 0; i < kernel_rows; i++)
+ {
+ const uint8_t* const wptr_row = wptr_base + i*weight_row_stride;
+ for (int j = 0; j < kernel_cols; j++)
+ {
+ w[i][j] = static_cast<int32_t >(*(wptr_row + j*weight_col_stride)) + _weights_offset;
+ }
+ }
+ wptr_base++;
+
+ // Perform the convolution
+ int32_t v[output_tile_rows][output_tile_cols];
+ for (int out_i = 0; out_i < out_cells_i; out_i++)
+ {
+ for (int out_j = 0; out_j < out_cells_j; out_j++)
+ {
+ // Clear the accumulator
+ v[out_i][out_j] = static_cast<int32_t>(0);
+
+ // Base co-ordinate
+ const int base_i = out_i * stride_rows;
+ const int base_j = out_j * stride_cols;
+
+ // Fill the accumulator
+ for (int in_i = 0; in_i < kernel_rows; in_i++)
+ {
+ const int i = base_i + in_i;
+ for (int in_j = 0; in_j < kernel_cols; in_j++)
+ {
+ const int j = base_j + in_j;
+ v[out_i][out_j] += w[in_i][in_j] * u[i][j];
+ }
+ }
+ }
+ }
+
+ // Store the output tile
+ for (int i = 0; i < out_cells_i; i++)
+ {
+ int32_t* const outptr_row = outptr_base + i*out_row_stride;
+ for (int j = 0; j < out_cells_j; j++)
+ {
+ *(outptr_row + j*out_col_stride) = v[i][j];
+ }
+ }
+ outptr_base++;
+ }
+}
+
+} // namespace depthwise
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index a46be2ec92..9dcbc99332 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -59,8 +59,25 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
_is_nchw = input->info()->data_layout() == DataLayout::NCHW;
_permute = _is_optimized == _is_nchw;
+ // Initialize the intermediate accumulator tensor in case of quantized input
+ if(_is_quantized)
+ {
+ TensorShape accum_shape = output->info()->tensor_shape();
+ DataLayout accum_layout = output->info()->data_layout();
+ if(!_is_optimized && !_is_nchw)
+ {
+ permute(accum_shape, PermutationVector(1U, 2U, 0U));
+ accum_layout = DataLayout::NCHW;
+ }
+
+ _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32, input->info()->quantization_info()));
+ _accumulator.info()->set_data_layout(accum_layout);
+ zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
+ }
+
if(_is_optimized)
{
+ ITensor *optimized_output = (_is_quantized) ? &_accumulator : output;
if(_is_nchw)
{
// Configure the function to transform the input tensor from NCHW -> NHWC
@@ -75,8 +92,8 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
_dwc_kernel.configure(&_permuted_input, &_permuted_weights, &_permuted_output, conv_info, depth_multiplier, DataLayout::NHWC);
// Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
- _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+ _permuted_output.info()->set_data_layout(DataLayout::NHWC);
+ _permute_output.configure(&_permuted_output, optimized_output, PermutationVector(1U, 2U, 0U));
// Allocate tensors
_permuted_input.allocator()->allocate();
@@ -85,26 +102,11 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
}
else
{
- _dwc_kernel.configure(input, weights, output, conv_info, depth_multiplier, DataLayout::NHWC);
+ _dwc_kernel.configure(input, weights, optimized_output, conv_info, depth_multiplier, DataLayout::NHWC);
}
}
else
{
- // Allocate the intermediate accumulator tensor in case of quantized input
- if(_is_quantized)
- {
- TensorShape accum_shape = output->info()->tensor_shape();
-
- if(!_is_nchw)
- {
- permute(accum_shape, PermutationVector(1U, 2U, 0U));
- }
-
- _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32));
- _accumulator.info()->set_quantization_info(input->info()->quantization_info());
- zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
- }
-
if(!_is_nchw)
{
// Configure the function to transform the input tensor from NHWC -> NCHW
@@ -143,7 +145,7 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
int output_multiplier, output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
+ _output_stage_kernel.configure(&_accumulator, biases, (_is_nchw || _is_optimized) ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
_accumulator.allocator()->allocate();
}
else if(_has_bias)
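
[Editorial sketch, not part of the commit] With the runtime changes above, the optimized 3x3 function can be configured directly with QASYMM8 tensors; it creates the S32 accumulator internally and appends the output stage kernel. A hedged usage sketch follows, with shapes, scales and offsets invented for illustration and the configure() overload with a default depth multiplier assumed:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_qasymm8_depthwise_3x3_example()
{
    Tensor input, weights, biases, output;

    // Hypothetical 28x28x16 input, 3x3 depthwise weights, stride 1, padding 1
    input.allocator()->init(TensorInfo(TensorShape(28U, 28U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.0078125f, 128)));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.02f, 120)));
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::S32));
    output.allocator()->init(TensorInfo(TensorShape(28U, 28U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 128)));

    NEDepthwiseConvolutionLayer3x3 dwc;
    dwc.configure(&input, &weights, &biases, &output, PadStrideInfo(1, 1, 1, 1));

    input.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill input/weights/biases, e.g. via the NPY accessors used in the examples ...

    dwc.run();
}
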
diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h
index 889473ecbc..5ef6e112aa 100644
--- a/tests/datasets/DepthwiseConvolutionLayerDataset.h
+++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h
@@ -204,6 +204,7 @@ public:
// Stride 1
add_config(TensorShape(7U, 7U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL));
add_config(TensorShape(7U, 7U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
+ add_config(TensorShape(7U, 7U, 21U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
add_config(TensorShape(28U, 28U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 0, 0, DimensionRoundingType::CEIL));
add_config(TensorShape(28U, 28U, 16U), Size2D(3U, 3U), PadStrideInfo(1, 1, 1, 1, DimensionRoundingType::CEIL));
// Stride 2
@@ -211,6 +212,7 @@ public:
add_config(TensorShape(7U, 7U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 1, 1, 1, 1, DimensionRoundingType::CEIL));
add_config(TensorShape(8U, 8U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 0, DimensionRoundingType::FLOOR));
add_config(TensorShape(8U, 8U, 32U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL));
+ add_config(TensorShape(8U, 8U, 33U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL));
add_config(TensorShape(64U, 64U, 128U), Size2D(3U, 3U), PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::CEIL));
}
};
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index 54bce0252e..8f87a7d636 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -304,6 +304,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEDepthwiseConvolutionLayerQuantizedFixture3x3<
{
validate(Accessor(_target), _reference, tolerance_qasymm8);
}
+FIXTURE_DATA_TEST_CASE(RunOptimized, NEDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(datasets::OptimizedDepthwiseConvolutionLayerDataset3x3(),
+ framework::dataset::make("DepthMultiplier", 1)),
+ framework::dataset::make("DataType",
+ DataType::QASYMM8)),
+ framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })),
+ framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
+{
+ validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
FIXTURE_DATA_TEST_CASE(RunLarge, NEDepthwiseConvolutionLayerQuantizedFixture3x3<uint8_t>, framework::DatasetMode::NIGHTLY,
combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset3x3(),
depth_multipliers),
diff --git a/utils/Utils.h b/utils/Utils.h
index 0bbdcc25d1..130e1f72fe 100644
--- a/utils/Utils.h
+++ b/utils/Utils.h
@@ -357,7 +357,7 @@ public:
void fill_tensor(T &tensor)
{
ARM_COMPUTE_ERROR_ON(!is_open());
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(&tensor, arm_compute::DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(&tensor, arm_compute::DataType::QASYMM8, arm_compute::DataType::S32, arm_compute::DataType::F32);
try
{
// Map buffer if creating a CLTensor
@@ -413,6 +413,8 @@ public:
switch(tensor.info()->data_type())
{
+ case arm_compute::DataType::QASYMM8:
+ case arm_compute::DataType::S32:
case arm_compute::DataType::F32:
{
// Read data