about summary refs log tree commit diff
path: root/src/core
diff options
context:
space:
mode:
authorMichalis Spyrou <michalis.spyrou@arm.com>2019-05-13 17:41:01 +0100
committerMichalis Spyrou <michalis.spyrou@arm.com>2019-05-20 13:59:51 +0000
commitb9626ab169a168a7c1ca57edd1996e1e80938bf1 (patch)
tree57ce41fff5e2ece1e7d8f2a6f332c67e4534e752 /src/core
parent0af4418f4d4b6bceaea64fa21eaf127b1b8fed35 (diff)
downloadComputeLibrary-b9626ab169a168a7c1ca57edd1996e1e80938bf1.tar.gz
COMPMID-2243 ArgMinMaxLayer: support new datatypes
Change-Id: I846e833e0c94090cbbdcd6aee6061cea8295f4f9 Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com> Reviewed-on: https://review.mlplatform.org/c/1131 Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core')
-rw-r--r--src/core/CL/cl_kernels/reduction_operation.cl67
-rw-r--r--src/core/CL/kernels/CLReductionOperationKernel.cpp6
-rw-r--r--src/core/NEON/kernels/NEReductionOperationKernel.cpp25
3 files changed, 61 insertions(+), 37 deletions(-)
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index 2651123cf5..749e3cdaa3 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -23,6 +23,19 @@
*/
#include "helpers.h"
+#if FLOAT_DATA_TYPE
+#define ISGREATER(x, y) isgreater(x, y)
+#define ISLESS(x, y) isless(x, y)
+#else // !FLOAT_DATA_TYPE
+#if defined(WIDTH)
+#define ISGREATER(x, y) (x > y) ? 1 : 0
+#define ISLESS(x, y) (x < y) ? 1 : 0
+#else // !defined(WIDTH)
+#define ISGREATER(x, y) select((int16)0, (int16)-1, x > y)
+#define ISLESS(x, y) select((int16)0, (int16)-1, x < y)
+#endif // defined(WIDTH)
+#endif // FLOAT_DATA_TYPE
+
/** Calculate square sum of a vector
*
* @param[in] input Pointer to the first pixel.
@@ -124,9 +137,9 @@ __kernel void reduction_operation_x(
{
#if defined(PROD)
local_results[lid] *= local_results[lid + i];
-#else //!defined(PROD)
+#else // !defined(PROD)
local_results[lid] += local_results[lid + i];
-#endif //defined(PROD)
+#endif // defined(PROD)
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -138,7 +151,7 @@ __kernel void reduction_operation_x(
{
local_results[0] /= WIDTH;
}
-#endif /* defined(MEAN) && defined(WIDTH) */
+#endif // defined(MEAN) && defined(WIDTH)
((__global DATA_TYPE *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
}
}
@@ -153,7 +166,7 @@ __kernel void reduction_operation_x(
* @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
* @note In case of ARG_MIN and ARG_MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 and QASYMM8 for operation MEAN
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8 for operation MEAN
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
@@ -179,11 +192,11 @@ __kernel void reduction_operation_non_parallel_x(
{
DATA_TYPE_PROMOTED in = *((__global DATA_TYPE *)vector_offset(&src, x));
#if defined(ARG_MAX)
- indx = select(indx, x, isgreater(in, res));
- res = select(res, in, CONVERT(isgreater(in, res), COND_DATA_TYPE));
+ indx = select(indx, x, ISGREATER(in, res));
+ res = select(res, in, CONVERT(ISGREATER(in, res), COND_DATA_TYPE));
#elif defined(ARG_MIN)
- indx = select(indx, x, isless(in, res));
- res = select(res, in, CONVERT(isless(in, res), COND_DATA_TYPE));
+ indx = select(indx, x, ISLESS(in, res));
+ res = select(res, in, CONVERT(ISLESS(in, res), COND_DATA_TYPE));
#else // !(defined(ARG_MAX) || defined(ARG_MIN))
res += in;
#endif // defined(ARG_MAX) || defined(ARG_MIN)
@@ -199,7 +212,7 @@ __kernel void reduction_operation_non_parallel_x(
*((__global uchar *)output.ptr) = convert_uchar(res);
#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
-#endif /* defined(WIDTH) */
+#endif // defined(WIDTH)
#if defined(HEIGHT)
/** This kernel performs reduction on y-axis.
@@ -207,7 +220,7 @@ __kernel void reduction_operation_non_parallel_x(
* @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
*
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -243,22 +256,22 @@ __kernel void reduction_operation_y(
VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
#if defined(ARG_MAX)
- uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
+ uint16 cond_conv = CONVERT(ISGREATER(in, res), uint16);
indx = select(indx, y, cond_conv);
- res = select(res, in, isgreater(in, res));
+ res = select(res, in, ISGREATER(in, res));
#elif defined(ARG_MIN)
- uint16 cond_conv = CONVERT(isless(in, res), uint16);
+ uint16 cond_conv = CONVERT(ISLESS(in, res), uint16);
indx = select(indx, y, cond_conv);
- res = select(res, in, isless(in, res));
+ res = select(res, in, ISLESS(in, res));
#else // !(defined(ARG_MAX) || defined(ARG_MIN))
#if defined(SUM_SQUARE)
in *= in;
#endif // defined(SUM_SQUARE)
#if defined(PROD)
res *= in;
-#else //!defined(PROD)
+#else // !defined(PROD)
res += in;
-#endif //defined(PROD)
+#endif // defined(PROD)
#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
@@ -272,7 +285,7 @@ __kernel void reduction_operation_y(
vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
-#endif /* defined(HEIGHT) */
+#endif // defined(HEIGHT)
#if defined(DEPTH)
/** This kernel performs reduction on z-axis.
@@ -280,7 +293,7 @@ __kernel void reduction_operation_y(
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
*
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -330,13 +343,13 @@ __kernel void reduction_operation_z(
#endif // defined(COMPLEX)
#if defined(ARG_MAX)
- uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
+ uint16 cond_conv = CONVERT(ISGREATER(in, res), uint16);
indx = select(indx, z, cond_conv);
- res = select(res, in, isgreater(in, res));
+ res = select(res, in, ISGREATER(in, res));
#elif defined(ARG_MIN)
- uint16 cond_conv = CONVERT(isless(in, res), uint16);
+ uint16 cond_conv = CONVERT(ISLESS(in, res), uint16);
indx = select(indx, z, cond_conv);
- res = select(res, in, isless(in, res));
+ res = select(res, in, ISLESS(in, res));
#else // !(defined(ARG_MAX) || defined(ARG_MIN))
#if defined(SUM_SQUARE)
in *= in;
@@ -374,7 +387,7 @@ __kernel void reduction_operation_z(
* @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
* @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
*
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/S32/F16/F32
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -419,13 +432,13 @@ __kernel void reduction_operation_w(
in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
#if defined(ARG_MAX)
- uint16 cond_conv = CONVERT(isgreater(in, res), uint16);
+ uint16 cond_conv = CONVERT(ISGREATER(in, res), uint16);
indx = select(indx, w, cond_conv);
- res = select(res, in, isgreater(in, res));
+ res = select(res, in, ISGREATER(in, res));
#elif defined(ARG_MIN)
- uint16 cond_conv = CONVERT(isless(in, res), uint16);
+ uint16 cond_conv = CONVERT(ISLESS(in, res), uint16);
indx = select(indx, w, cond_conv);
- res = select(res, in, isless(in, res));
+ res = select(res, in, ISLESS(in, res));
#else // !(defined(ARG_MAX) || defined(ARG_MIN))
#if defined(SUM_SQUARE)
in *= in;
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index db4850f14e..cb57070612 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -49,7 +49,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
if(input->num_channels() == 1)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
}
else
{
@@ -160,8 +160,10 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
{
data_type_promoted = "uint";
}
+
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
+ build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX");
@@ -199,7 +201,7 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
if(is_serial_op)
{
build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option_if_else(_input->info()->data_type() == DataType::F32, "-DCOND_DATA_TYPE=int", "-DCOND_DATA_TYPE=short");
+ build_opts.add_option_if_else(_input->info()->data_type() == DataType::F16, "-DCOND_DATA_TYPE=short", "-DCOND_DATA_TYPE=int");
kernel_axis_name = "non_parallel_x";
}
else
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index aa20d1f40d..5f0a4dd371 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -41,7 +41,8 @@ namespace arm_compute
{
namespace
{
-uint32x4x4_t calculate_index(uint32_t idx, float32x4_t a, float32x4_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+template <typename T>
+uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
{
uint32x4_t mask{ 0 };
if(op == ReductionOperation::ARG_IDX_MIN)
@@ -107,8 +108,8 @@ uint32x4x4_t calculate_index(uint32_t idx, uint8x16_t a, uint8x16_t b, uint32x4x
return res;
}
-
-uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float32x4_t vec_res_value, ReductionOperation op)
+template <typename T>
+uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
{
uint32x4_t res_idx_mask{ 0 };
uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
@@ -124,7 +125,7 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float32x4_t vec_res_va
{
auto pmax = wrapper::vpmax(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
pmax = wrapper::vpmax(pmax, pmax);
- auto mask = vceqq_f32(vec_res_value, wrapper::vcombine(pmax, pmax));
+ auto mask = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
}
@@ -394,14 +395,14 @@ struct RedOpX
case ReductionOperation::ARG_IDX_MIN:
{
auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
- vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_idx = calculate_index<decltype(vec_res_value)>(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
vec_res_value = temp_vec_res_value;
break;
}
case ReductionOperation::ARG_IDX_MAX:
{
auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
- vec_res_idx = calculate_index(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+ vec_res_idx = calculate_index<decltype(vec_res_value)>(id.x(), temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
vec_res_value = temp_vec_res_value;
break;
}
@@ -446,7 +447,7 @@ struct RedOpX
case ReductionOperation::ARG_IDX_MIN:
case ReductionOperation::ARG_IDX_MAX:
{
- auto res = calculate_vector_index(vec_res_idx, vec_res_value, op);
+ auto res = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
*(reinterpret_cast<uint32_t *>(output.ptr())) = res;
break;
}
@@ -943,6 +944,8 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
+ case DataType::S32:
+ return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -957,6 +960,8 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
+ case DataType::S32:
+ return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output, RedOpYZW<int32_t, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -971,6 +976,8 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
+ case DataType::S32:
+ return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output, RedOpYZW<int32_t, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -985,6 +992,8 @@ void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsi
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
+ case DataType::S32:
+ return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output, RedOpYZW<int32_t, 4>(), op);
default:
ARM_COMPUTE_ERROR("Not supported");
}
@@ -1002,7 +1011,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
if(input->num_channels() == 1)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
}
else
{