-rw-r--r--  arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h              2
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp    29
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp                 270
-rw-r--r--  src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp                    2
-rw-r--r--  tests/benchmark_new/NEON/DirectConvolutionLayer.cpp                       13
-rw-r--r--  tests/validation/NEON/ConvolutionLayerDirect.cpp                           56
6 files changed, 337 insertions, 35 deletions
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index a356cac7c8..a66cab3013 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -48,7 +48,7 @@ public:
NEDirectConvolutionLayer();
/** Set the input, weights, biases and output tensors.
*
- * @param[in, out] input Input tensor. Data types supported: QS8/F32.
+ * @param[in, out] input Input tensor. Data types supported: QS8/F16/F32.
* @param[in] weights Set of kernels to convolve the input volume.
* The 3rd dimension must be the same as the input's volume 3rd dimension.
* Data type supported: Same as @p input.
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
index effc50e7c0..fb16c8dcc1 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp
@@ -100,6 +100,25 @@ inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y)
return vqaddq_qs16(x, y);
}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+inline float16x8_t internal_vld1q(const float16_t *in)
+{
+ return vld1q_f16(in);
+}
+inline void internal_vst1q(float16_t *p, const float16x8_t &v)
+{
+ vst1q_f16(p, v);
+}
+inline float16x8_t internal_vdupq_n(float16_t v)
+{
+ return vdupq_n_f16(v);
+}
+inline float16x8_t internal_vqaddq(const float16x8_t &x, const float16x8_t &y)
+{
+ return vaddq_f16(x, y);
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
template <typename T1, typename T2, bool in_place>
void accumulate_bias(ITensor *input, const ITensor *bias, const Window window, ITensor *output)
{
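
The new F16 overloads above let the templated accumulate_bias stay type-agnostic: the same loop body now compiles for QS8, QS16, F16 and F32. A minimal sketch of that pattern, assuming a contiguous buffer whose length is a multiple of the vector width; add_bias_contiguous is a hypothetical helper for illustration, not the kernel itself, which walks an arm_compute::Window and carries separate input/output types:

    template <typename T>
    void add_bias_contiguous(T *data, T bias, size_t len)
    {
        const auto   vbias = internal_vdupq_n(bias); // broadcast the per-channel bias into every lane
        const size_t step  = 16 / sizeof(T);         // one 128-bit NEON register: 8 halves or 4 floats
        for(size_t i = 0; i < len; i += step)
        {
            internal_vst1q(data + i, internal_vqaddq(internal_vld1q(data + i), vbias));
        }
    }
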
@@ -143,8 +162,8 @@ NEDirectConvolutionLayerBiasAccumulateKernel::NEDirectConvolutionLayerBiasAccumu
void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != bias->info()->fixed_point_position());
if(output != nullptr)
{
@@ -183,6 +202,12 @@ void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, con
{
_func = (output == nullptr) ? &accumulate_bias<float, float, true> : &accumulate_bias<float, float, false>;
}
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ else if(input->info()->data_type() == DataType::F16)
+ {
+ _func = (output == nullptr) ? &accumulate_bias<float16_t, float16_t, true> : &accumulate_bias<float16_t, float16_t, false>;
+ }
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
else if(input->info()->data_type() == DataType::QS8)
{
_func = (output == nullptr) ? &accumulate_bias<qint8_t, qint8_t, true> : &accumulate_bias<qint8_t, qint8_t, false>;
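
Passing output == nullptr to configure() selects the in-place instantiation through the third template parameter; spelled out element-wise, the two variants differ only in where the sum is written (illustrative scalar form, not the library loop):

    // in_place == true  (output == nullptr): input[x] += bias[channel];
    // in_place == false (separate output)  : output[x] = input[x] + bias[channel];
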
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index d6088981aa..09d8dd5642 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -39,6 +39,53 @@ using namespace arm_compute;
namespace
{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+template <unsigned int stridex>
+float16x8_t internal_vld1q(const float16_t *in);
+
+template <>
+float16x8_t internal_vld1q<1>(const float16_t *in)
+{
+ return vld1q_f16(in);
+}
+
+template <>
+float16x8_t internal_vld1q<2>(const float16_t *in)
+{
+ const float16x8x2_t tmp = vld2q_f16(in);
+ return tmp.val[0];
+}
+
+template <>
+float16x8_t internal_vld1q<3>(const float16_t *in)
+{
+ const float16x8x3_t tmp = vld3q_f16(in);
+ return tmp.val[0];
+}
+
+inline float16x8_t internal_vdupq_n(float16_t v)
+{
+ return vdupq_n_f16(v);
+}
+
+inline void internal_vst1q(float16_t *p, const float16x8_t &v)
+{
+ vst1q_f16(p, v);
+}
+
+float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vmulq_f16(x, y);
+}
+
+inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z, int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+ return vaddq_f16(x, vmulq_f16(y, z));
+}
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
template <unsigned int stridex>
float32x4_t internal_vld1q(const float *in);
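
With these overloads the templated 1x1 convolution body is shared between F32 and F16; fixed_point_position is only meaningful for the fixed-point types and is discarded here. A hedged sketch of the accumulation pattern across kernel depth for the floating-point paths, assuming stride 1, one weight per input plane and a contiguous plane stride; the name and signature are illustrative only, and it relies on the internal_* overloads above plus their existing F32 counterparts:

    template <typename T>
    void convolve_1x1_sketch(const T *in, const T *weights, T *out,
                             int kernel_depth, size_t plane_stride, int fixed_point_position)
    {
        // The first input plane initialises the accumulator...
        auto acc = internal_vmull(internal_vld1q<1>(in), internal_vdupq_n(weights[0]), fixed_point_position);
        // ...the remaining planes are multiply-accumulated on top of it.
        for(int d = 1; d < kernel_depth; ++d)
        {
            acc = internal_vmlal(acc, internal_vld1q<1>(in + d * plane_stride),
                                 internal_vdupq_n(weights[d]), fixed_point_position);
        }
        internal_vst1q(out, acc);
    }
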
@@ -226,6 +273,148 @@ public:
}
};
+#ifdef ARM_COMPUTE_ENABLE_FP16
+inline float16x8x3_t load_matrix_row(const float16_t *ptr)
+{
+ /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
+ r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
+ const float16x8x3_t r =
+ {
+ {
+ vld1q_dup_f16(ptr),
+ vld1q_dup_f16(1 + ptr),
+ vld1q_dup_f16(2 + ptr)
+ }
+ };
+ return r;
+}
+
+template <unsigned int stridex>
+float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position);
+
+template <>
+float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position)
+{
+ ARM_COMPUTE_UNUSED(fixed_point_position);
+
+ const float16x8x3_t vtop =
+ {
+ {
+ vld1q_f16(in_top),
+ vld1q_f16(in_top + 8),
+ vld1q_f16(in_top + 16)
+ }
+ };
+ const float16x8x3_t vmid =
+ {
+ {
+ vld1q_f16(in_mid),
+ vld1q_f16(in_mid + 8),
+ vld1q_f16(in_mid + 16)
+ }
+ };
+ const float16x8x3_t vlow =
+ {
+ {
+ vld1q_f16(in_low),
+ vld1q_f16(in_low + 8),
+ vld1q_f16(in_low + 16)
+ }
+ };
+ float16x8x2_t out =
+ {
+ {
+ vmulq_f16(vtop.val[0], m0.val[0]),
+ vmulq_f16(vtop.val[1], m0.val[0])
+ }
+ };
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 1), m1.val[1]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 2), m1.val[2]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 1), m2.val[1]));
+ out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 2), m2.val[2]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 1), m0.val[1]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 2), m0.val[2]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vmid.val[1], m1.val[0]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 1), m1.val[1]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 2), m1.val[2]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vlow.val[1], m2.val[0]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
+ out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
+ return out;
+}
+
+template <>
+inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position)
+{
+ float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 2);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 3);
+ return out;
+}
+
+template <>
+inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
+ int fixed_point_position)
+{
+ float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
+ return out;
+}
+
+template <unsigned int stridex>
+void store_results(float16_t *buffer, const float16x8x2_t &values);
+
+template <>
+void store_results<1>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1q_f16(buffer, values.val[0]);
+ vst1q_f16(buffer + 8, values.val[1]);
+}
+
+template <>
+void store_results<2>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1q_f16(buffer, values.val[0]);
+}
+
+template <>
+void store_results<3>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1_f16(buffer, vget_low_f16(values.val[0]));
+}
+
+template <unsigned int stridex>
+void accumulate_results(float16_t *buffer, const float16x8x2_t &values);
+
+template <>
+void accumulate_results<1>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
+ vst1q_f16(buffer + 8, vaddq_f16(vld1q_f16(buffer + 8), values.val[1]));
+}
+
+template <>
+void accumulate_results<2>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
+}
+
+template <>
+void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values)
+{
+ vst1_f16(buffer, vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0])));
+}
+
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
inline float32x4x3_t load_matrix_row(const float *ptr)
{
const float32x4x3_t r =
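
convolve_3x3<1> above follows the same scheme as the existing F32 version, only with eight lanes per register instead of four: load_matrix_row() broadcasts each kernel coefficient into all lanes, and vextq_f16 shifts the input window by one and two elements so the three taps of a row line up with the same eight output pixels. A reduced sketch of one row's contribution under those assumptions; row_contribution is a hypothetical helper, not part of the kernel:

    float16x8_t row_contribution(const float16_t *in, const float16x8x3_t &w)
    {
        // w comes from load_matrix_row(): w.val[k] holds kernel coefficient k in every lane.
        const float16x8_t v0 = vld1q_f16(in);     // x[0..7]
        const float16x8_t v1 = vld1q_f16(in + 8); // x[8..15], needed once the window is shifted
        float16x8_t acc = vmulq_f16(v0, w.val[0]);                        // w0 * x[i]
        acc = vaddq_f16(acc, vmulq_f16(vextq_f16(v0, v1, 1), w.val[1]));  // w1 * x[i + 1]
        acc = vaddq_f16(acc, vmulq_f16(vextq_f16(v0, v1, 2), w.val[2]));  // w2 * x[i + 2]
        return acc;
    }
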
@@ -590,12 +779,13 @@ public:
for(int oz = 0; oz < num_planes_z; ++oz)
{
+ const int zoffset = id.z() + oz;
uint8_t *p_out_base = out_ptr + oz * output_stride_z;
// Step 1
{
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
const auto vk_r0 = load_matrix_row(ptr_k_r0);
const auto vk_r1 = load_matrix_row(ptr_k_r1);
const auto vk_r2 = load_matrix_row(ptr_k_r2);
@@ -616,9 +806,9 @@ public:
// Step 2
for(int p = 1; p < kernel_depth; ++p)
{
- const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
- const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
+ const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
const auto vk_r0 = load_matrix_row(ptr_k_r0);
const auto vk_r1 = load_matrix_row(ptr_k_r1);
const auto vk_r2 = load_matrix_row(ptr_k_r2);
@@ -697,9 +887,9 @@ BorderSize NEDirectConvolutionLayerKernel::border_size() const
void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())),
"Pad > 0 not supported for 1x1 weights");
ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1),
@@ -723,10 +913,24 @@ void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITens
{
case 1:
{
- _num_elems_written_per_iteration = (input->info()->data_type() == DataType::QS8) ? 8 : 4;
- _num_elems_read_per_iteration = conv_stride_x * _num_elems_written_per_iteration;
+ switch(input->info()->data_type())
+ {
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ case DataType::QS8:
+ _num_elems_written_per_iteration = 8;
+ break;
+ case DataType::F32:
+ _num_elems_written_per_iteration = 4;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported.");
+ break;
+ }
- win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
+ _num_elems_read_per_iteration = conv_stride_x * _num_elems_written_per_iteration;
+ win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
AccessWindowHorizontal input_access(input->info(), 0, _num_elems_read_per_iteration);
AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
update_window_and_padding(win, input_access, output_access);
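
The read size follows directly from the write size and the horizontal stride: for example, an F16 input with conv_stride_x = 2 writes 8 outputs per iteration and therefore reads 2 * 8 = 16 input elements, which is exactly what the input AccessWindowHorizontal above must cover.
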
@@ -786,25 +990,43 @@ void NEDirectConvolutionLayerKernel::run(const Window &window)
{
case 1:
{
- if(_input->info()->data_type() == DataType::QS8)
- {
- convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- }
- else
+ switch(_input->info()->data_type())
{
- convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ case DataType::QS8:
+ convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ case DataType::F32:
+ convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
}
break;
}
case 3:
{
- if(_input->info()->data_type() == DataType::QS8)
- {
- convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
- }
- else
+ switch(_input->info()->data_type())
{
- convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ case DataType::QS8:
+ convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+ case DataType::F32:
+ convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+#ifdef ARM_COMPUTE_ENABLE_FP16
+ case DataType::F16:
+ convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
+ break;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
}
break;
}
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 3f3e7710fb..d5f03fcc41 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -40,7 +40,7 @@ NEDirectConvolutionLayer::NEDirectConvolutionLayer()
void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32);
// Free accumulator
if(_accumulator.buffer() != nullptr)
diff --git a/tests/benchmark_new/NEON/DirectConvolutionLayer.cpp b/tests/benchmark_new/NEON/DirectConvolutionLayer.cpp
index 5588321cc6..dcefbc7512 100644
--- a/tests/benchmark_new/NEON/DirectConvolutionLayer.cpp
+++ b/tests/benchmark_new/NEON/DirectConvolutionLayer.cpp
@@ -37,14 +37,21 @@ namespace arm_compute
{
namespace test
{
+namespace
+{
+#ifdef ARM_COMPUTE_ENABLE_FP16
+const auto data_types = framework::dataset::make("DataType", { DataType::QS8, DataType::F16, DataType::F32 });
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+const auto data_types = framework::dataset::make("DataType", { DataType::QS8, DataType::F32 });
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+} // namespace
+
using NEDirectConvolutionLayerFixture = ConvolutionLayerFixture<Tensor, NEDirectConvolutionLayer, Accessor>;
TEST_SUITE(NEON)
REGISTER_FIXTURE_DATA_TEST_CASE(DirectConvolutionLayer, NEDirectConvolutionLayerFixture, framework::DatasetMode::ALL,
- framework::dataset::combine(framework::dataset::combine(datasets::DirectConvolutionLayerDataset(),
- framework::dataset::make("DataType", { DataType::F32, DataType::QS8 })),
- framework::dataset::make("Batches", { 1, 4, 8 })));
+ framework::dataset::combine(framework::dataset::combine(datasets::DirectConvolutionLayerDataset(), data_types), framework::dataset::make("Batches", { 1, 4, 8 })));
TEST_SUITE_END()
} // namespace test
diff --git a/tests/validation/NEON/ConvolutionLayerDirect.cpp b/tests/validation/NEON/ConvolutionLayerDirect.cpp
index 4949f387f7..034a8b2045 100644
--- a/tests/validation/NEON/ConvolutionLayerDirect.cpp
+++ b/tests/validation/NEON/ConvolutionLayerDirect.cpp
@@ -48,8 +48,11 @@ using namespace arm_compute::test::validation;
namespace
{
-const float tolerance_fp = 1e-3f; /**< Tolerance for floating point tests */
-const float tolerance_qs8 = 1; /**< Tolerance for fixed point tests */
+const float tolerance_fp32 = 1e-3f; /**< Tolerance for floating point tests */
+#ifdef ARM_COMPUTE_ENABLE_FP16
+const float tolerance_fp16 = 0.01f; /**< Tolerance for half precision floating point tests */
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+const float tolerance_qs8 = 1; /**< Tolerance for fixed point tests */
/** Compute NEON direct convolution layer function.
*
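
The looser F16 bound reflects half precision itself: with a 10-bit mantissa the machine epsilon is 2^-10 ≈ 9.8e-4, so after a chain of multiply-accumulates an absolute error on the order of 0.01 is plausible, whereas the F32 results are expected to stay within 1e-3.
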
@@ -88,7 +91,7 @@ Tensor compute_convolution_layer(const TensorShape &src_shape, const TensorShape
BOOST_TEST(!dst.info()->is_resizable());
// Fill tensors
- if(dt == DataType::F32)
+ if(dt == DataType::F16 || dt == DataType::F32)
{
std::uniform_real_distribution<> distribution(-1.f, 1.f);
library->fill(Accessor(src), distribution, 0);
@@ -129,6 +132,51 @@ BOOST_AUTO_TEST_SUITE(NEON)
BOOST_AUTO_TEST_SUITE(ConvolutionLayer)
BOOST_AUTO_TEST_SUITE(Direct)
+#ifdef ARM_COMPUTE_ENABLE_FP16
+BOOST_AUTO_TEST_SUITE(Float16)
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(W1x1,
+ DirectConvolutionShapes() * boost::unit_test::data::make(DataType::F16) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::make({ 1, 4, 8, 16 }),
+ input_shape, dt, sx, sy, num_kernels)
+{
+ const unsigned int kernel_size = 1;
+ const PadStrideInfo conv_info(sx, sy, 0, 0, DimensionRoundingType::FLOOR);
+ const TensorShape w_shape(kernel_size, kernel_size, input_shape.z(), static_cast<unsigned int>(num_kernels));
+ const TensorShape b_shape(static_cast<unsigned int>(num_kernels));
+ const TensorShape d_shape(get_output_shape(input_shape, w_shape, conv_info));
+
+ Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info);
+
+ RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
+BOOST_DATA_TEST_CASE(W3x3, DirectConvolutionShapes() * boost::unit_test::data::make(DataType::F16) * boost::unit_test::data::xrange(1, 3, 1) * boost::unit_test::data::xrange(1, 3,
+ 1)
+ * boost::unit_test::data::xrange(0, 2,
+ 1)
+ * boost::unit_test::data::xrange(0, 2, 1) * boost::unit_test::data::make({ 1, 4, 8, 16 }),
+ input_shape, dt, sx, sy, px, py, num_kernels)
+{
+ const unsigned int kernel_size = 3;
+ const PadStrideInfo conv_info(sx, sy, px, py, DimensionRoundingType::FLOOR);
+ const TensorShape w_shape(kernel_size, kernel_size, input_shape.z(), static_cast<unsigned int>(num_kernels));
+ const TensorShape b_shape(static_cast<unsigned int>(num_kernels));
+ const TensorShape d_shape(get_output_shape(input_shape, w_shape, conv_info));
+
+ Tensor dst = compute_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info);
+
+ RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, 0);
+
+ // Validate output
+ validate(NEAccessor(dst), ref, tolerance_fp16);
+}
+BOOST_AUTO_TEST_SUITE_END()
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
BOOST_AUTO_TEST_SUITE(Float)
BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit"))
BOOST_DATA_TEST_CASE(W1x1,
@@ -166,7 +214,7 @@ BOOST_DATA_TEST_CASE(W3x3, DirectConvolutionShapes() * CNNFloatDataTypes() * boo
RawTensor ref = Reference::compute_reference_convolution_layer(input_shape, w_shape, b_shape, d_shape, dt, conv_info, 0);
// Validate output
- validate(Accessor(dst), ref, tolerance_fp);
+ validate(Accessor(dst), ref, tolerance_fp32);
}
BOOST_AUTO_TEST_SUITE_END()