aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorMichalis Spyrou <michalis.spyrou@arm.com>2018-10-11 17:33:32 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:55:45 +0000
commit8aaf93e8c12ce93d3d0082d4f4b70376f15536da (patch)
tree0922f3dde6fafae181e101df315ef36007801850 /src
parentc93691717a6e7ca67e32b4dedd233b8c63b6daf2 (diff)
downloadComputeLibrary-8aaf93e8c12ce93d3d0082d4f4b70376f15536da.tar.gz
COMPMID-1632 Add CLL2NormalizationLayer for NHWC and FP32
Change-Id: Iae22554d5fe893fd22a000eab5bfd8275ea06eb3 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/154102 Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Tested-by: bsgcomp <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r--src/core/CL/CLKernelLibrary.cpp3
-rw-r--r--src/core/CL/cl_kernels/l2_normalize.cl52
-rw-r--r--src/core/CL/cl_kernels/reduction_operation.cl21
-rw-r--r--src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp67
-rw-r--r--src/core/CL/kernels/CLReductionOperationKernel.cpp3
-rw-r--r--src/runtime/CL/functions/CLReductionOperation.cpp2
6 files changed, 121 insertions, 27 deletions
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index a2428ca99d..900cb04b1a 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -296,7 +296,8 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "IYUV_to_RGB888_bt709", "color_convert.cl" },
{ "IYUV_to_RGBA8888_bt709", "color_convert.cl" },
{ "IYUV_to_YUV444_bt709", "color_convert.cl" },
- { "l2_normalize", "l2_normalize.cl" },
+ { "l2_normalize_nchw", "l2_normalize.cl" },
+ { "l2_normalize_nhwc", "l2_normalize.cl" },
{ "lktracker_stage0", "optical_flow_pyramid_lk.cl" },
{ "lktracker_stage1", "optical_flow_pyramid_lk.cl" },
{ "magnitude_phase", "magnitude_phase.cl" },
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/l2_normalize.cl
index f58e98bace..d230487030 100644
--- a/src/core/CL/cl_kernels/l2_normalize.cl
+++ b/src/core/CL/cl_kernels/l2_normalize.cl
@@ -23,7 +23,7 @@
*/
#include "helpers.h"
-/** This kernel performs reduction given an operation.
+/** This kernel performs l2 normalization. (NCHW)
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
@@ -42,7 +42,7 @@
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
* @param[in] epsilon Epsilon value
*/
-__kernel void l2_normalize(
+__kernel void l2_normalize_nchw(
VECTOR_DECLARATION(src),
VECTOR_DECLARATION(sum),
VECTOR_DECLARATION(dst),
@@ -55,7 +55,53 @@ __kernel void l2_normalize(
VEC_DATA_TYPE(DATA_TYPE, 16)
in = vload16(0, (__global DATA_TYPE *)src.ptr);
VEC_DATA_TYPE(DATA_TYPE, 16)
- normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))native_rsqrt(fmax(((__global DATA_TYPE *)sum.ptr)[0], epsilon));
+ normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(((__global DATA_TYPE *)sum.ptr)[0], epsilon));
+
+ vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+/** This kernel performs l2 normalization. (NHWC)
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] epsilon Epsilon value
+ */
+__kernel void l2_normalize_nhwc(
+ IMAGE_DECLARATION(src),
+ IMAGE_DECLARATION(sum),
+ IMAGE_DECLARATION(dst),
+ DATA_TYPE epsilon)
+{
+ Image src = CONVERT_TO_IMAGE_STRUCT(src);
+ Image sum = CONVERT_TO_IMAGE_STRUCT(sum);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, (__global DATA_TYPE *)src.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ sums = vload16(0, (__global DATA_TYPE *)sum.ptr);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(sums, epsilon));
vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr);
} \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index c1be4472a7..d76e12ac04 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -189,7 +189,12 @@ __kernel void reduction_operation_y(
for(unsigned int y = 0; y < HEIGHT; ++y)
{
- res += CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+ in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(SUM_SQUARE)
+ in *= in;
+#endif // SQRSUM
+ res += in;
}
#if defined(MEAN)
@@ -236,7 +241,12 @@ __kernel void reduction_operation_z(
for(unsigned int z = 0; z < DEPTH; ++z)
{
- res += CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+ in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(SUM_SQUARE)
+ in *= in;
+#endif // SQRSUM
+ res += in;
}
#if defined(MEAN)
@@ -288,7 +298,12 @@ __kernel void reduction_operation_w(
for(unsigned int w = 0; w < BATCH; ++w)
{
- res += CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+ VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
+ in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
+#if defined(SUM_SQUARE)
+ in *= in;
+#endif // SQRSUM
+ res += in;
}
#if defined(MEAN)
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 54ed51eda2..cfd04ef392 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -49,9 +49,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported reduction axis, Supported axis is 0");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
// Reduce shape on axis
@@ -62,9 +61,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons
if(output->total_size() != 0)
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
- ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW);
}
return Status{};
@@ -110,11 +109,19 @@ void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor
build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("l2_normalize", build_opts));
+ const DataLayout data_layout = input->info()->data_layout();
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("l2_normalize_" + lower_string(string_from_data_layout(data_layout)), build_opts));
// Set epsilon argument
- unsigned int idx = num_arguments_per_1D_tensor() * 3;
- _kernel.setArg<cl_uint>(idx, _epsilon);
+ unsigned int idx = data_layout == DataLayout::NCHW ? num_arguments_per_1D_tensor() * 3 : num_arguments_per_2D_tensor() * 3;
+ if(input->info()->data_type() == DataType::F32)
+ {
+ _kernel.setArg<cl_uint>(idx, _epsilon);
+ }
+ else
+ {
+ _kernel.setArg<cl_ushort>(idx, _epsilon);
+ }
// Configure kernel window
auto win_config = validate_and_configure_window(_input->info(), _output->info());
@@ -137,18 +144,42 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
Window window_sum(window);
- window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
-
- Window in_slice = window.first_slice_window_1D();
- Window sum_slice = window_sum.first_slice_window_1D();
- do
+ switch(_input->info()->data_layout())
{
- unsigned int idx = 0;
- add_1D_tensor_argument(idx, _input, in_slice);
- add_1D_tensor_argument(idx, _sum, sum_slice);
- add_1D_tensor_argument(idx, _output, in_slice);
- enqueue(queue, *this, in_slice);
+ case DataLayout::NCHW:
+ {
+ window_sum.set(Window::DimX, Window::Dimension(0, 0, 0));
+ Window in_slice = window.first_slice_window_1D();
+ Window sum_slice = window_sum.first_slice_window_1D();
+ do
+ {
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, in_slice);
+ add_1D_tensor_argument(idx, _sum, sum_slice);
+ add_1D_tensor_argument(idx, _output, in_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
+ }
+ break;
+ case DataLayout::NHWC:
+ {
+ window_sum.set(Window::DimY, Window::Dimension(0, 0, 0));
+ Window in_slice = window.first_slice_window_2D();
+ Window sum_slice = window_sum.first_slice_window_2D();
+ do
+ {
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _sum, sum_slice);
+ add_2D_tensor_argument(idx, _output, in_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Not supported");
}
- while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice));
}
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index d4165ccd4e..ef46325e4d 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -46,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && axis != 0, "Not supported reduction operation for this axis");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
ARM_COMPUTE_RETURN_ERROR_ON(op == ReductionOperation::MEAN_SUM && axis == 0 && width == 0 && input->data_type() != DataType::QASYMM8);
@@ -142,6 +142,7 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
}
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
+ build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE=");
build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
switch(op)
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 4b65c47392..52a5d91cb8 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -119,7 +119,7 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
for(unsigned int i = 0; i < _num_of_stages - 1; i++)
{
shape.set(0, ceil(shape.x() / 128.f));
- _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type()));
+ _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
}
// Apply ReductionOperation only on first kernel