author     Manuel Bottini <manuel.bottini@arm.com>    2018-12-10 17:40:23 +0000
committer  Manuel Bottini <manuel.bottini@arm.com>    2019-01-14 13:53:11 +0000
commit     b412fab0e3c8ec10e104f4d85760898a5b26179c (patch)
tree       e0cd062cdd32e78db3e2e67bcdb39e7efab6dff5 /src
parent     1c9efebf4344e8db97e6d9282b2bf48b52090b58 (diff)
download   ComputeLibrary-b412fab0e3c8ec10e104f4d85760898a5b26179c.tar.gz
COMPMID-1724: CL Implement Prod
Change-Id: I17e51f25064b53a8f7e13d6fcbecc14a192de103
Reviewed-on: https://review.mlplatform.org/387
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r--  src/core/CL/cl_kernels/reduction_operation.cl      |  68
-rw-r--r--  src/core/CL/kernels/CLFillBorderKernel.cpp         |  28
-rw-r--r--  src/core/CL/kernels/CLReductionOperationKernel.cpp |  72
-rw-r--r--  src/core/NEON/kernels/NEFillBorderKernel.cpp       |   2
-rw-r--r--  src/runtime/CL/functions/CLReductionOperation.cpp  | 100
-rw-r--r--  src/runtime/NEON/functions/NEIntegralImage.cpp     |   4
-rw-r--r--  src/runtime/NEON/functions/NEScale.cpp             |   4
7 files changed, 166 insertions(+), 112 deletions(-)
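
For readers of this change, here is a minimal host-side sketch of how the new product reduction could be driven through CLReductionOperation. The tensor shapes, data type and axis value are illustrative assumptions, not part of this commit; only the PROD operation and the configure(input, output, axis, op) signature come from the diff below.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

    using namespace arm_compute;

    int main()
    {
        // Initialise the CL scheduler/context before configuring CL functions.
        CLScheduler::get().default_init();

        // Hypothetical 1-D input of 256 floats, reduced along the X axis (axis 0).
        CLTensor input, output;
        input.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));

        // PROD is the reduction operation wired up by this change.
        CLReductionOperation reduction;
        reduction.configure(&input, &output, 0 /* axis */, ReductionOperation::PROD);

        input.allocator()->allocate();
        output.allocator()->allocate();

        // Fill 'input' (e.g. via map()/unmap()), then run the multi-stage reduction.
        reduction.run();
        return 0;
    }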
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index d1f47beda7..b4ede25296 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -60,12 +60,31 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input)
return (in.s0 + in.s1);
}
+
+/** Calculate product of a vector
+ *
+ * @param[in] input Pointer to the first pixel.
+ *
+ * @return product of vector.
+ */
+inline DATA_TYPE product(__global const DATA_TYPE *input)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, input);
+
+ in.s01234567 *= in.s89ABCDEF;
+ in.s0123 *= in.s4567;
+ in.s01 *= in.s23;
+
+ return (in.s0 * in.s1);
+}
#if defined(OPERATION)
/** This kernel performs parallel reduction given an operation on x-axis.
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
* @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
* @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
@@ -74,28 +93,28 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] partial_sum_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt
- * @param[in] partial_sum_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] partial_sum_step_x partial_sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] partial_sum_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] partial_sum_step_y partial_sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] local_sums Local buffer for storing the partial sum
+ * @param[in] partial_res_ptr The local buffer to hold partial result values. Supported data types: same as @p src_ptr
+ * @param[in] partial_res_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] partial_res_step_x partial_res_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] partial_res_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] partial_res_step_y partial_res_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] local_results Local buffer for storing the partial result
*/
__kernel void reduction_operation_x(
IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(partial_sum),
- __local DATA_TYPE *local_sums)
+ IMAGE_DECLARATION(partial_res),
+ __local DATA_TYPE *local_results)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image partial_sum = CONVERT_TO_IMAGE_STRUCT(partial_sum);
+ Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
unsigned int lsize = get_local_size(0);
unsigned int lid = get_local_id(0);
for(unsigned int y = 0; y < get_local_size(1); ++y)
{
- local_sums[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
+ local_results[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
barrier(CLK_LOCAL_MEM_FENCE);
// Perform parallel reduction
@@ -103,7 +122,11 @@ __kernel void reduction_operation_x(
{
if(lid < i)
{
- local_sums[lid] += local_sums[lid + i];
+#if defined(PROD)
+ local_results[lid] *= local_results[lid + i];
+#else //!defined(PROD)
+ local_results[lid] += local_results[lid + i];
+#endif //defined(PROD)
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -113,10 +136,10 @@ __kernel void reduction_operation_x(
#if defined(MEAN) && defined(WIDTH)
if(y == get_local_size(1) - 1)
{
- local_sums[0] /= WIDTH;
+ local_results[0] /= WIDTH;
}
#endif /* defined(MEAN) && defined(WIDTH) */
- ((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0];
+ ((__global DATA_TYPE *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
}
}
}
@@ -127,6 +150,7 @@ __kernel void reduction_operation_x(
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
* @note In case of ARG_MIN and ARG_MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 and QASYMM8 for operation MEAN
@@ -230,7 +254,11 @@ __kernel void reduction_operation_y(
#if defined(SUM_SQUARE)
in *= in;
#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+ res *= in;
+#else //!defined(PROD)
res += in;
+#endif //defined(PROD)
#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
@@ -304,7 +332,11 @@ __kernel void reduction_operation_z(
#if defined(SUM_SQUARE)
in *= in;
#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+ res *= in;
+#else //!defined(PROD)
res += in;
+#endif //defined(PROD)
#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
@@ -383,7 +415,11 @@ __kernel void reduction_operation_w(
#if defined(SUM_SQUARE)
in *= in;
#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+ res *= in;
+#else //!defined(PROD)
res += in;
+#endif //defined(PROD)
#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
@@ -397,4 +433,4 @@ __kernel void reduction_operation_w(
vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
-#endif /* defined(BATCH) && defined(DEPTH) */
\ No newline at end of file
+#endif /* defined(BATCH) && defined(DEPTH) */
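
The kernel changes above reuse the existing parallel x-axis reduction and swap += for *= when -DPROD is defined: product() folds a 16-wide vector with pairwise multiplies, and reduction_operation_x then combines the per-work-item partials with a stride-halving tree in local memory. A rough scalar C++ sketch of that pattern (an illustration of the logic only, not the OpenCL code):

    #include <array>
    #include <cstddef>

    // Pairwise fold of a 16-element chunk, mirroring product() above.
    float product16(std::array<float, 16> in)
    {
        for(std::size_t i = 0; i < 8; ++i) in[i] *= in[i + 8]; // s01234567 *= s89ABCDEF
        for(std::size_t i = 0; i < 4; ++i) in[i] *= in[i + 4]; // s0123     *= s4567
        for(std::size_t i = 0; i < 2; ++i) in[i] *= in[i + 2]; // s01       *= s23
        return in[0] * in[1];
    }

    // Stride-halving combine of per-work-item partial results, as done in
    // reduction_operation_x when -DPROD is defined (the sum path uses += instead).
    float tree_combine(std::array<float, 64> &local_results)
    {
        for(std::size_t stride = local_results.size() / 2; stride > 0; stride /= 2)
        {
            for(std::size_t lid = 0; lid < stride; ++lid)
            {
                local_results[lid] *= local_results[lid + stride];
            }
            // In the kernel a barrier(CLK_LOCAL_MEM_FENCE) separates these iterations.
        }
        return local_results[0];
    }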
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 69206678d0..5fdb826f8b 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -75,25 +75,18 @@ void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, Bo
// Select appropriate kernel
std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
- // Define select type required by replicate border > 1
- const DataType dt = tensor->info()->data_type();
- std::string select_type = get_underlying_cl_type_from_data_type(dt);
- if(is_data_type_float(dt))
- {
- select_type = (DataType::F32 == dt) ? "int" : "short";
- }
+ const DataType dt = tensor->info()->data_type();
// Define build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt)));
- build_opts.emplace(("-DSELECT_TYPE=" + select_type));
- build_opts.emplace(("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top)));
- build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom)));
- build_opts.emplace(("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left)));
- build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + support::cpp11::to_string(border_size.right)));
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt));
+ build_opts.add_option("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top));
+ build_opts.add_option("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom));
+ build_opts.add_option("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left));
+ build_opts.add_option("-DBORDER_SIZE_RIGHT=" + support::cpp11::to_string(border_size.right));
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
_tensor = tensor;
// Create static kernel arguments
@@ -141,8 +134,9 @@ void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, Bo
set_constant_border<float>(idx, constant_border_value);
break;
case DataType::F16:
+ static_assert(sizeof(cl_half) == sizeof(half), "Half must be same size as cl_half");
static_assert(sizeof(cl_half) == 2, "Half must be 16 bit");
- set_constant_border<cl_half>(idx, constant_border_value);
+ set_constant_border<half>(idx, constant_border_value);
break;
default:
ARM_COMPUTE_ERROR("Not handled");
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 959209edc0..45aa810517 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,13 +80,13 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
bool window_changed = false;
- const bool is_arg_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
+ const bool is_serial_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(input->data_type()));
switch(axis)
{
case 0:
{
- if(is_data_type_quantized(input->data_type()) || is_arg_op)
+ if(is_serial_op)
{
AccessWindowHorizontal input_access(input, 0, input->dimension(0));
AccessWindowHorizontal output_access(output, 0, 1);
@@ -153,10 +153,11 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
}
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
- build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE=");
+ build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX");
build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MIN, "-DARG_MIN");
+ build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
switch(op)
{
@@ -170,6 +171,9 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
case ReductionOperation::ARG_IDX_MAX:
case ReductionOperation::ARG_IDX_MIN:
break;
+ case ReductionOperation::PROD:
+ build_opts.add_option(("-DOPERATION=product"));
+ break;
default:
ARM_COMPUTE_ERROR("Unsupported reduction operation");
}
@@ -177,12 +181,18 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
// Create kernel
cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
std::string kernel_axis_name;
- const bool is_arg_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
+ const bool is_serial_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(input->info()->data_type()));
switch(axis)
{
case 0:
{
- if(!is_data_type_quantized(input->info()->data_type()) && !is_arg_op)
+ if(is_serial_op)
+ {
+ build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option_if_else(_input->info()->data_type() == DataType::F32, "-DCOND_DATA_TYPE=int", "-DCOND_DATA_TYPE=short");
+ kernel_axis_name = "non_parallel_x";
+ }
+ else
{
build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
const unsigned int width_leftover = input->info()->dimension(0) % border_val;
@@ -195,12 +205,6 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
lws_hint = cl::NDRange(std::min(8U, num_of_threads));
_border_size = BorderSize(0, border_width, 0, 0);
}
- else
- {
- build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option_if_else(_input->info()->data_type() == DataType::F32, "-DCOND_DATA_TYPE=int", "-DCOND_DATA_TYPE=short");
- kernel_axis_name = "non_parallel_x";
- }
}
break;
case 1:
@@ -242,13 +246,31 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- const bool is_arg_op = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN);
+ const bool is_serial_op = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(_input->info()->data_type()));
switch(_reduction_axis)
{
case 0:
{
// We use parallel reduction only in non quantized types
- if(!is_data_type_quantized(_input->info()->data_type()) && !is_arg_op)
+ if(is_serial_op)
+ {
+ // Get first input and output slices
+ Window window_in{ window };
+ window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+
+ Window in_slice = window.first_slice_window_1D();
+ Window out_slice = window.first_slice_window_1D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, in_slice);
+ add_1D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window_in.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
+ }
+ else
{
// Set out window
Window out_window(window);
@@ -263,8 +285,8 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
// Set local sums buffer
- unsigned int local_sum_size = lws_hint()[0] * _input->info()->element_size();
- _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr);
+ unsigned int local_res_size = lws_hint()[0] * _input->info()->element_size();
+ _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_res_size, nullptr);
do
{
@@ -275,24 +297,6 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
}
while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
}
- else
- {
- // Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
-
- Window in_slice = window.first_slice_window_1D();
- Window out_slice = window.first_slice_window_1D();
-
- do
- {
- unsigned int idx = 0;
- add_1D_tensor_argument(idx, _input, in_slice);
- add_1D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice);
- }
- while(window_in.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
- }
}
break;
case 1:
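
The kernel-side refactor above replaces the separate is_arg_op/quantized checks with a single notion of a serial reduction: argmin/argmax and quantized inputs run the non_parallel_x kernel over 1-D slices, while everything else keeps the parallel path with the local-memory buffer. Expressed as a stand-alone predicate (a hypothetical helper for illustration, not code from the library):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/Utils.h"

    using namespace arm_compute;

    // Serial (non-parallel) x-axis reduction is used for ARG_IDX_MIN/ARG_IDX_MAX and
    // for quantized inputs; all other cases take the parallel reduction_operation_x path.
    inline bool use_serial_reduction(ReductionOperation op, DataType dt)
    {
        return op == ReductionOperation::ARG_IDX_MAX
            || op == ReductionOperation::ARG_IDX_MIN
            || is_data_type_quantized(dt);
    }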
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 39bcd996f9..f4046e0851 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index c5447ffd6b..e2dec6b375 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,15 +56,19 @@ unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int a
} // namespace
CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_quantized()
+ : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
{
}
Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
-
- if(axis == 0 && !is_data_type_quantized(input->data_type()))
+ bool is_serial = is_data_type_quantized(input->data_type()) || axis != 0;
+ if(is_serial)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
+ }
+ else
{
// Create temporary tensor infos
auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
@@ -81,17 +85,25 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf
}
ReductionOperation first_kernel_op;
+ ReductionOperation intermediate_kernel_op;
ReductionOperation last_kernel_op;
switch(op)
{
case ReductionOperation::SUM:
case ReductionOperation::MEAN_SUM:
- first_kernel_op = ReductionOperation::SUM;
- last_kernel_op = op;
+ first_kernel_op = ReductionOperation::SUM;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = op;
break;
case ReductionOperation::SUM_SQUARE:
- first_kernel_op = ReductionOperation::SUM_SQUARE;
- last_kernel_op = ReductionOperation::SUM;
+ first_kernel_op = ReductionOperation::SUM_SQUARE;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = ReductionOperation::SUM;
+ break;
+ case ReductionOperation::PROD:
+ first_kernel_op = ReductionOperation::PROD;
+ intermediate_kernel_op = ReductionOperation::PROD;
+ last_kernel_op = ReductionOperation::PROD;
break;
default:
ARM_COMPUTE_ERROR("Not supported");
@@ -103,17 +115,13 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf
// Validate ReductionOperation on intermediate stages
for(unsigned int i = 1; i < num_of_stages - 1; ++i)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, ReductionOperation::SUM));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op));
}
// Validate ReductionOperation on the last stage
const unsigned int last_stage = num_of_stages - 1;
ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
}
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
- }
return Status{};
}
@@ -122,65 +130,77 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
{
_num_of_stages = calculate_number_of_stages(input->info(), axis);
_reduction_axis = axis;
- _is_quantized = is_data_type_quantized(input->info()->data_type());
+ _is_serial = is_data_type_quantized(input->info()->data_type()) || axis != 0;
// Configure reduction operation kernels
_reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
// Create temporary tensors
- if(axis == 0 && !_is_quantized)
+ if(_is_serial)
+ {
+ _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+ }
+ else
{
_border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
- _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+ _results_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
TensorShape shape{ input->info()->tensor_shape() };
for(unsigned int i = 0; i < _num_of_stages - 1; i++)
{
shape.set(0, ceil(shape.x() / 128.f));
- _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+ _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
}
// Apply ReductionOperation only on first kernel
- _memory_group.manage(_sums_vector.get());
+ _memory_group.manage(_results_vector.get());
ReductionOperation first_kernel_op;
+ ReductionOperation intermediate_kernel_op;
ReductionOperation last_kernel_op;
+ PixelValue pixelValue;
switch(op)
{
case ReductionOperation::SUM:
case ReductionOperation::MEAN_SUM:
- first_kernel_op = ReductionOperation::SUM;
- last_kernel_op = op;
+ first_kernel_op = ReductionOperation::SUM;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = op;
+ pixelValue = PixelValue(0);
break;
case ReductionOperation::SUM_SQUARE:
- first_kernel_op = ReductionOperation::SUM_SQUARE;
- last_kernel_op = ReductionOperation::SUM;
+ first_kernel_op = ReductionOperation::SUM_SQUARE;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = ReductionOperation::SUM;
+ pixelValue = PixelValue(0);
+ break;
+ case ReductionOperation::PROD:
+ first_kernel_op = ReductionOperation::PROD;
+ intermediate_kernel_op = ReductionOperation::PROD;
+ last_kernel_op = ReductionOperation::PROD;
+ pixelValue = PixelValue(1, input->info()->data_type());
break;
default:
ARM_COMPUTE_ERROR("Not supported");
}
- _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, first_kernel_op);
- _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op);
+ _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
// Apply ReductionOperation on intermediate stages
for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
{
- _memory_group.manage(_sums_vector.get() + i);
- _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
- _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
- _sums_vector[i - 1].allocator()->allocate();
+ _memory_group.manage(_results_vector.get() + i);
+ _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op);
+ _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
+ _results_vector[i - 1].allocator()->allocate();
}
// Apply ReductionOperation on the last stage
const unsigned int last_stage = _num_of_stages - 1;
const unsigned int input_width = input->info()->dimension(0);
- _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
- _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
- _sums_vector[last_stage - 1].allocator()->allocate();
- }
- else
- {
- _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+ _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
+ _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
+ _results_vector[last_stage - 1].allocator()->allocate();
}
}
@@ -188,7 +208,11 @@ void CLReductionOperation::run()
{
_memory_group.acquire();
- if(_reduction_axis == 0 && !_is_quantized)
+ if(_is_serial)
+ {
+ CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
+ }
+ else
{
for(unsigned int i = 0; i < _num_of_stages; ++i)
{
@@ -196,10 +220,6 @@ void CLReductionOperation::run()
CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
}
}
- else
- {
- CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
- }
_memory_group.release();
}
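
In the runtime function above, PROD is now threaded through all three stages (first, intermediate and last kernel), and the constant border used to pad each stage's input changes from 0 to the multiplicative identity 1 so that padded elements cannot zero out the product. A small sketch of that choice (hypothetical helper, not library code; PixelValue(1, data_type) is the form used in the diff):

    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/core/Types.h"

    using namespace arm_compute;

    // The border fill must be the identity of the reduction so padding does not
    // change the result: 0 for sums and squared sums, 1 for products.
    inline PixelValue border_identity(ReductionOperation op, DataType dt)
    {
        return (op == ReductionOperation::PROD) ? PixelValue(1, dt) : PixelValue(0);
    }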
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
index fa8aaeb5dd..43308fa169 100644
--- a/src/runtime/NEON/functions/NEIntegralImage.cpp
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,5 +36,5 @@ void NEIntegralImage::configure(const ITensor *input, ITensor *output)
auto k = arm_compute::support::cpp14::make_unique<NEIntegralImageKernel>();
k->configure(input, output);
_kernel = std::move(k);
- _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, static_cast<float>(0.f));
+ _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
}
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index a9c85bd726..169b9bbf6a 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -167,7 +167,7 @@ void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy pol
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
- _border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
}
Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,