From 8aaf93e8c12ce93d3d0082d4f4b70376f15536da Mon Sep 17 00:00:00 2001
From: Michalis Spyrou
Date: Thu, 11 Oct 2018 17:33:32 +0100
Subject: COMPMID-1632 Add CLL2NormalizationLayer for NHWC and FP32

Change-Id: Iae22554d5fe893fd22a000eab5bfd8275ea06eb3
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/154102
Reviewed-by: Georgios Pinitas
Tested-by: bsgcomp
---
 .../core/CL/kernels/CLReductionOperationKernel.h   |   4 +-
 .../runtime/CL/functions/CLL2NormalizeLayer.h      |   8 +-
 .../runtime/CL/functions/CLReductionOperation.h    |   8 +-
 src/core/CL/CLKernelLibrary.cpp                    |   3 +-
 src/core/CL/cl_kernels/l2_normalize.cl             |  52 +++++-
 src/core/CL/cl_kernels/reduction_operation.cl      |  21 ++-
 src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp   |  67 ++++++--
 src/core/CL/kernels/CLReductionOperationKernel.cpp |   3 +-
 src/runtime/CL/functions/CLReductionOperation.cpp  |   2 +-
 tests/validation/CL/L2NormalizeLayer.cpp           |  34 +++-
 tests/validation/CL/ReductionOperation.cpp         |   6 +-
 tests/validation/NEON/L2NormalizeLayer.cpp         |   8 +-
 .../validation/fixtures/L2NormalizeLayerFixture.h  |  38 ++++-
 tests/validation/reference/L2NormalizeLayer.cpp    |   5 +-
 tests/validation/reference/ReductionOperation.cpp  | 186 ++++++---------------
 15 files changed, 250 insertions(+), 195 deletions(-)

diff --git a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
index ef24fd5166..b5b90a15ce 100644
--- a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
+++ b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
@@ -50,7 +50,7 @@ public:
 
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32. Data layouts supported: NCHW.
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32.
      * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
      *                    Output will have the same number of dimensions as input.
      * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0,1,2,3
@@ -61,7 +61,7 @@ public:
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
      *
-     * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32. Data layouts supported: NCHW.
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32.
      * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
      *                   Output will have the same number of dimensions as input.
      * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0,1,2,3
diff --git a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
index d3d34f877b..86ccd3440f 100644
--- a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
@@ -53,18 +53,18 @@ public:
 
     /** Set the input and output tensors.
      *
-     * @param[in]  input   Source tensor. Data types supported: F32. Data layouts supported: NCHW.
+     * @param[in]  input   Source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
      * @param[out] output  Destination tensor. Data types and data layouts supported: Same as @p input.
-     * @param[in]  axis    Axis along which to reduce. Supported reduction axis : 0
+     * @param[in]  axis    Axis along which to reduce. Supported reduction axis : 0, 2
      * @param[in]  epsilon (Optional) Lower bound value for the normalization.
      */
     void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, float epsilon = 1e-12);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayer.
      *
-     * @param[in] input   Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
+     * @param[in] input   Source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
      * @param[in] output  Destination tensor info. Data types and data layouts supported: Same as @p input.
-     * @param[in] axis    Axis along which to reduce. Supported reduction axis : 0
+     * @param[in] axis    Axis along which to reduce. Supported reduction axis : 0, 2
      * @param[in] epsilon (Optional) Lower bound value for the normalization.
      *
      * @return a status
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index 42081786bc..ac35b7bfc6 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -53,18 +53,18 @@ public:
 
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32. Data layouts supported: NCHW.
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32.
      * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
-     * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0
+     * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
      * @param[in]  op     Reduction operation to perform.
      */
     void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperation.
      *
-     * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32. Data layouts supported: NCHW.
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32.
      * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
-     * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0
+     * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
      * @param[in] op     Reduction operation to perform.
      *
      * @return a status
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index a2428ca99d..900cb04b1a 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -296,7 +296,8 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "IYUV_to_RGB888_bt709", "color_convert.cl" },
     { "IYUV_to_RGBA8888_bt709", "color_convert.cl" },
     { "IYUV_to_YUV444_bt709", "color_convert.cl" },
-    { "l2_normalize", "l2_normalize.cl" },
+    { "l2_normalize_nchw", "l2_normalize.cl" },
+    { "l2_normalize_nhwc", "l2_normalize.cl" },
     { "lktracker_stage0", "optical_flow_pyramid_lk.cl" },
     { "lktracker_stage1", "optical_flow_pyramid_lk.cl" },
     { "magnitude_phase", "magnitude_phase.cl" },
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/l2_normalize.cl
index f58e98bace..d230487030 100644
--- a/src/core/CL/cl_kernels/l2_normalize.cl
+++ b/src/core/CL/cl_kernels/l2_normalize.cl
@@ -23,7 +23,7 @@
  */
 #include "helpers.h"
 
-/** This kernel performs reduction given an operation.
+/** This kernel performs l2 normalization. (NCHW)
  *
  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g.
-DDATA_TYPE=float * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32 @@ -42,7 +42,7 @@ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor * @param[in] epsilon Epsilon value */ -__kernel void l2_normalize( +__kernel void l2_normalize_nchw( VECTOR_DECLARATION(src), VECTOR_DECLARATION(sum), VECTOR_DECLARATION(dst), @@ -55,7 +55,53 @@ __kernel void l2_normalize( VEC_DATA_TYPE(DATA_TYPE, 16) in = vload16(0, (__global DATA_TYPE *)src.ptr); VEC_DATA_TYPE(DATA_TYPE, 16) - normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))native_rsqrt(fmax(((__global DATA_TYPE *)sum.ptr)[0], epsilon)); + normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(((__global DATA_TYPE *)sum.ptr)[0], epsilon)); + + vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr); +} + +/** This kernel performs l2 normalization. (NHWC) + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along X processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] epsilon Epsilon value + */ +__kernel void l2_normalize_nhwc( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(sum), + IMAGE_DECLARATION(dst), + DATA_TYPE epsilon) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image sum = CONVERT_TO_IMAGE_STRUCT(sum); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + VEC_DATA_TYPE(DATA_TYPE, 16) + in = vload16(0, (__global DATA_TYPE *)src.ptr); + VEC_DATA_TYPE(DATA_TYPE, 16) + sums = vload16(0, (__global DATA_TYPE *)sum.ptr); + + VEC_DATA_TYPE(DATA_TYPE, 16) + normalize_value = (VEC_DATA_TYPE(DATA_TYPE, 16))rsqrt(fmax(sums, epsilon)); vstore16(in * normalize_value, 0, (__global DATA_TYPE *)dst.ptr); } \ No newline at end of file diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl index c1be4472a7..d76e12ac04 100644 --- a/src/core/CL/cl_kernels/reduction_operation.cl +++ b/src/core/CL/cl_kernels/reduction_operation.cl @@ -189,7 +189,12 @@ __kernel void reduction_operation_y( for(unsigned int y = 0; y < HEIGHT; ++y) { - res += CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); + VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) + in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); +#if defined(SUM_SQUARE) + in *= in; +#endif // SQRSUM + res += in; } #if defined(MEAN) @@ -236,7 +241,12 @@ __kernel void reduction_operation_z( for(unsigned int z = 0; z < DEPTH; ++z) { - res += CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); + VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) + in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); +#if defined(SUM_SQUARE) + in *= in; +#endif // SQRSUM + res += in; } #if defined(MEAN) @@ -288,7 +298,12 @@ __kernel void reduction_operation_w( for(unsigned int w = 0; w < BATCH; ++w) { - res += CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); + VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) + in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); +#if defined(SUM_SQUARE) + in *= in; +#endif // SQRSUM + res += in; } #if defined(MEAN) diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp index 54ed51eda2..cfd04ef392 100644 --- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp +++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp @@ -49,9 +49,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, sum, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 0, "Unsupported 
reduction axis, Supported axis is 0"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); // Reduce shape on axis @@ -62,9 +61,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, cons if(output->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape()); - ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != DataLayout::NCHW); } return Status{}; @@ -110,11 +109,19 @@ void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration))); // Create kernel - _kernel = static_cast(CLKernelLibrary::get().create_kernel("l2_normalize", build_opts)); + const DataLayout data_layout = input->info()->data_layout(); + _kernel = static_cast(CLKernelLibrary::get().create_kernel("l2_normalize_" + lower_string(string_from_data_layout(data_layout)), build_opts)); // Set epsilon argument - unsigned int idx = num_arguments_per_1D_tensor() * 3; - _kernel.setArg(idx, _epsilon); + unsigned int idx = data_layout == DataLayout::NCHW ? num_arguments_per_1D_tensor() * 3 : num_arguments_per_2D_tensor() * 3; + if(input->info()->data_type() == DataType::F32) + { + _kernel.setArg(idx, _epsilon); + } + else + { + _kernel.setArg(idx, _epsilon); + } // Configure kernel window auto win_config = validate_and_configure_window(_input->info(), _output->info()); @@ -137,18 +144,42 @@ void CLL2NormalizeLayerKernel::run(const Window &window, cl::CommandQueue &queue ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); Window window_sum(window); - window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); - - Window in_slice = window.first_slice_window_1D(); - Window sum_slice = window_sum.first_slice_window_1D(); - do + switch(_input->info()->data_layout()) { - unsigned int idx = 0; - add_1D_tensor_argument(idx, _input, in_slice); - add_1D_tensor_argument(idx, _sum, sum_slice); - add_1D_tensor_argument(idx, _output, in_slice); - enqueue(queue, *this, in_slice); + case DataLayout::NCHW: + { + window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); + Window in_slice = window.first_slice_window_1D(); + Window sum_slice = window_sum.first_slice_window_1D(); + do + { + unsigned int idx = 0; + add_1D_tensor_argument(idx, _input, in_slice); + add_1D_tensor_argument(idx, _sum, sum_slice); + add_1D_tensor_argument(idx, _output, in_slice); + enqueue(queue, *this, in_slice); + } + while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); + } + break; + case DataLayout::NHWC: + { + window_sum.set(Window::DimY, Window::Dimension(0, 0, 0)); + Window in_slice = window.first_slice_window_2D(); + Window sum_slice = window_sum.first_slice_window_2D(); + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, in_slice); + add_2D_tensor_argument(idx, _sum, sum_slice); + add_2D_tensor_argument(idx, _output, in_slice); + enqueue(queue, *this, in_slice); + } + while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice)); + } + 
break; + default: + ARM_COMPUTE_ERROR("Not supported"); } - while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); } diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp index d4165ccd4e..ef46325e4d 100644 --- a/src/core/CL/kernels/CLReductionOperationKernel.cpp +++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp @@ -46,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, u { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && axis != 0, "Not supported reduction operation for this axis"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis"); ARM_COMPUTE_RETURN_ERROR_ON(op == ReductionOperation::MEAN_SUM && axis == 0 && width == 0 && input->data_type() != DataType::QASYMM8); @@ -142,6 +142,7 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou } build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted); + build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE="); build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN"); switch(op) diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp index 4b65c47392..52a5d91cb8 100644 --- a/src/runtime/CL/functions/CLReductionOperation.cpp +++ b/src/runtime/CL/functions/CLReductionOperation.cpp @@ -119,7 +119,7 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign for(unsigned int i = 0; i < _num_of_stages - 1; i++) { shape.set(0, ceil(shape.x() / 128.f)); - _sums_vector[i].allocator()->init(TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())); + _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape)); } // Apply ReductionOperation only on first kernel diff --git a/tests/validation/CL/L2NormalizeLayer.cpp b/tests/validation/CL/L2NormalizeLayer.cpp index 3d121b079d..517ba84069 100644 --- a/tests/validation/CL/L2NormalizeLayer.cpp +++ b/tests/validation/CL/L2NormalizeLayer.cpp @@ -44,6 +44,10 @@ namespace { /** Tolerance for float operations */ constexpr AbsoluteTolerance tolerance_f32(0.00001f); +constexpr AbsoluteTolerance tolerance_f16(0.01f); + +auto data = concat(combine(framework::dataset::make("DataLayout", { DataLayout::NCHW }), framework::dataset::make("Axis", { 0 })), combine(framework::dataset::make("DataLayout", { DataLayout::NHWC }), + framework::dataset::make("Axis", { 1 }))); } // namespace @@ -58,7 +62,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(128U, 64U), 2, DataType::F32), // Number of Input channels != 1 TensorInfo(TensorShape(128U, 64U), 1, DataType::S16), // DataType != F32 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis >= num_max_dimensions - TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 0 + 
TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 3 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) }), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(128U, 64U), 1, DataType::F16), @@ -69,7 +73,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) })), - framework::dataset::make("Axis", { 0U, 0U, 0U, 0U, static_cast(TensorShape::num_max_dimensions), 1U, 0U })), + framework::dataset::make("Axis", { 0U, 0U, 0U, 0U, static_cast(TensorShape::num_max_dimensions), 4U, 0U })), framework::dataset::make("Expected", { false, false, false, false, false, false, true })), input_info, output_info, axis, expected) { @@ -87,22 +91,36 @@ using CLL2NormalizeLayerFixture = L2NormalizeLayerValidationFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0 })), framework::dataset::make("Epsilon", { 1e-12 }))) + combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), data), framework::dataset::make("Epsilon", { 1e-12 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, CLL2NormalizeLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0 })), framework::dataset::make("Epsilon", { 1e-12 }))) + combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), data), framework::dataset::make("Epsilon", { 1e-12 }))) { // Validate output validate(CLAccessor(_target), _reference, tolerance_f32); } -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // FP32 +TEST_SUITE(FP16) +FIXTURE_DATA_TEST_CASE(RunSmall, CLL2NormalizeLayerFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), data), framework::dataset::make("Epsilon", { 1e-6 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f16); +} +FIXTURE_DATA_TEST_CASE(RunLarge, CLL2NormalizeLayerFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), data), framework::dataset::make("Epsilon", { 1e-6 }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_f16); +} +TEST_SUITE_END() // FP16 +TEST_SUITE_END() // Float -TEST_SUITE_END() -TEST_SUITE_END() +TEST_SUITE_END() // L2NormalizeLayer +TEST_SUITE_END() // CL } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/CL/ReductionOperation.cpp b/tests/validation/CL/ReductionOperation.cpp index 516a1341cc..2adb4e90d6 100644 --- a/tests/validation/CL/ReductionOperation.cpp +++ b/tests/validation/CL/ReductionOperation.cpp @@ -58,16 +58,16 @@ TEST_SUITE(ReductionOperation) DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( framework::dataset::make("InputInfo", { TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Mismatching data type input/output TensorInfo(TensorShape(128U, 64U), 2, DataType::F32), // Number of Input channels != 1 - TensorInfo(TensorShape(128U, 64U), 1, DataType::S16), // DataType != F16/F32 + TensorInfo(TensorShape(128U, 64U), 1, 
DataType::S16), // DataType != QASYMM8/F16/F32 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis >= num_max_dimensions - TensorInfo(TensorShape(128U, 64U), 1, DataType::F32), // Axis > 0 and SUM_SQUARE + TensorInfo(TensorShape(128U, 64U), 1, DataType::QASYMM8), // Axis == 0 and SUM_SQUARE and QASYMM8 TensorInfo(TensorShape(128U, 64U), 1, DataType::F32) }), framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(1U, 64U), 1, DataType::F16), TensorInfo(TensorShape(1U, 64U), 1, DataType::F32), TensorInfo(TensorShape(1U, 64U), 1, DataType::S16), TensorInfo(TensorShape(1U, 64U), 1, DataType::F32), - TensorInfo(TensorShape(1U, 64U), 1, DataType::F32), + TensorInfo(TensorShape(1U, 64U), 1, DataType::QASYMM8), TensorInfo(TensorShape(1U, 64U), 1, DataType::F32) })), framework::dataset::make("Axis", { 0U, 0U, 0U, static_cast(TensorShape::num_max_dimensions), 1U, 0U })), diff --git a/tests/validation/NEON/L2NormalizeLayer.cpp b/tests/validation/NEON/L2NormalizeLayer.cpp index f868adea3b..0a1ddba77c 100644 --- a/tests/validation/NEON/L2NormalizeLayer.cpp +++ b/tests/validation/NEON/L2NormalizeLayer.cpp @@ -85,14 +85,18 @@ using NEL2NormalizeLayerFixture = L2NormalizeLayerValidationFixture, framework::DatasetMode::PRECOMMIT, - combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0 })), framework::dataset::make("Epsilon", { 1e-12 }))) + combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW })), + framework::dataset::make("Axis", { 0 })), + framework::dataset::make("Epsilon", { 1e-12 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); } FIXTURE_DATA_TEST_CASE(RunLarge, NEL2NormalizeLayerFixture, framework::DatasetMode::NIGHTLY, - combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0 })), framework::dataset::make("Epsilon", { 1e-12 }))) + combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("DataLayout", { DataLayout::NCHW })), + framework::dataset::make("Axis", { 0 })), + framework::dataset::make("Epsilon", { 1e-12 }))) { // Validate output validate(Accessor(_target), _reference, tolerance_f32); diff --git a/tests/validation/fixtures/L2NormalizeLayerFixture.h b/tests/validation/fixtures/L2NormalizeLayerFixture.h index 6f11dcb658..097d1c4ec2 100644 --- a/tests/validation/fixtures/L2NormalizeLayerFixture.h +++ b/tests/validation/fixtures/L2NormalizeLayerFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -45,10 +45,10 @@ class L2NormalizeLayerValidationFixture : public framework::Fixture { public: template - void setup(TensorShape shape, DataType data_type, unsigned int axis, float epsilon) + void setup(TensorShape shape, DataType data_type, DataLayout data_layout, unsigned int axis, float epsilon) { - _target = compute_target(shape, data_type, axis, epsilon); - _reference = compute_reference(shape, data_type, axis, epsilon); + _target = compute_target(shape, data_type, data_layout, axis, epsilon); + _reference = compute_reference(shape, data_type, data_layout, axis, epsilon); } protected: @@ -58,11 +58,16 @@ protected: library->fill_tensor_uniform(tensor, 0); } - TensorType compute_target(const TensorShape &shape, DataType data_type, unsigned int axis, float epsilon) + TensorType compute_target(TensorShape shape, DataType data_type, DataLayout data_layout, unsigned int axis, float epsilon) { + if(data_layout == DataLayout::NHWC) + { + permute(shape, PermutationVector(2U, 0U, 1U)); + } + // Create tensors - TensorType src = create_tensor(shape, data_type); - TensorType dst = create_tensor(shape, data_type); + TensorType src = create_tensor(shape, data_type, 1, QuantizationInfo(), data_layout); + TensorType dst = create_tensor(shape, data_type, 1, QuantizationInfo(), data_layout); // Create and configure function FunctionType l2_norm_func; @@ -87,8 +92,25 @@ protected: return dst; } - SimpleTensor compute_reference(const TensorShape &shape, DataType data_type, unsigned int axis, float epsilon) + SimpleTensor compute_reference(const TensorShape &shape, DataType data_type, DataLayout data_layout, unsigned int axis, float epsilon) { + if(data_layout == DataLayout::NHWC) + { + switch(axis) + { + case 0: + axis = 2; + break; + case 1: + axis = 0; + break; + case 2: + axis = 1; + break; + default: + break; + } + } // Create reference SimpleTensor src{ shape, data_type }; diff --git a/tests/validation/reference/L2NormalizeLayer.cpp b/tests/validation/reference/L2NormalizeLayer.cpp index 99f4e8a6e6..26677511e4 100644 --- a/tests/validation/reference/L2NormalizeLayer.cpp +++ b/tests/validation/reference/L2NormalizeLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -66,7 +66,7 @@ SimpleTensor l2_normalize(const SimpleTensor &src, unsigned int axis, floa { const T *src_row_ptr = src.data() + du * elems; T *dst_row_ptr = dst.data() + du * elems; - const T normalization_value = std::sqrt(std::max(sum[du], epsilon)); + const T normalization_value = sqrt(std::max(sum[du], static_cast(epsilon))); std::transform(src_row_ptr, src_row_ptr + elems, dst_row_ptr, [normalization_value](T val) { return val / normalization_value; @@ -82,6 +82,7 @@ SimpleTensor l2_normalize(const SimpleTensor &src, unsigned int axis, floa } template SimpleTensor l2_normalize(const SimpleTensor &src, unsigned int axis, float epsilon); +template SimpleTensor l2_normalize(const SimpleTensor &src, unsigned int axis, float epsilon); } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp index 499263f11e..2f103a6f65 100644 --- a/tests/validation/reference/ReductionOperation.cpp +++ b/tests/validation/reference/ReductionOperation.cpp @@ -39,36 +39,39 @@ namespace reference namespace { template -struct square +T reduce_operation(T *ptr, int reduce_elements, ReductionOperation op, int stride) { - T operator()(const T &lhs, const T &rhs) const - { - return (lhs + rhs * rhs); - } -}; + using type = typename std::remove_cv::type; + auto res = type(0); -template -struct sum -{ - T operator()(const T &lhs, const T &rhs) const + if(std::is_integral::value) { - return (lhs + rhs); + uint32_t int_res = 0; + for(int i = 0; i < reduce_elements; ++i) + { + auto elem = static_cast(*(ptr + stride * i)); + int_res += (op == ReductionOperation::SUM_SQUARE) ? elem * elem : elem; + } + if(op == ReductionOperation::MEAN_SUM && reduce_elements > 0) + { + int_res /= reduce_elements; + } + res = saturate_cast(int_res); } -}; - -template -T reduce_operation(T *ptr, int reduce_elements, ReductionOperation op) -{ - switch(op) + else { - case ReductionOperation::SUM_SQUARE: - return std::accumulate(ptr, ptr + reduce_elements, static_cast(0), square()); - case ReductionOperation::SUM: - case ReductionOperation::MEAN_SUM: - return std::accumulate(ptr, ptr + reduce_elements, static_cast(0), sum()); - default: - ARM_COMPUTE_ERROR("Unsupported reduction operation"); + for(int i = 0; i < reduce_elements; ++i) + { + auto elem = *(ptr + stride * i); + res += (op == ReductionOperation::SUM_SQUARE) ? 
elem * elem : elem; + } + if(op == ReductionOperation::MEAN_SUM && reduce_elements > 0) + { + res /= reduce_elements; + } } + + return res; } } // namespace @@ -77,44 +80,22 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap { // Create reference SimpleTensor dst{ dst_shape, src.data_type(), 1, src.quantization_info() }; - const unsigned int src_width = src.shape().x(); - const unsigned int src_height = src.shape().y(); - const unsigned int src_depth = src.shape().z(); - const unsigned int src_batch = src.shape()[3]; - const bool mean = op == ReductionOperation::MEAN_SUM; + const unsigned int src_width = src.shape().x(); + const unsigned int src_height = src.shape().y(); + const unsigned int src_depth = src.shape().z(); + const unsigned int src_batch = src.shape()[3]; + const int reduce_elems = src.shape()[axis]; switch(axis) { case 0: { - const int reduce_elems = src.shape()[axis]; - const unsigned int upper_dims = src.shape().total_size_upper(1); + const unsigned int upper_dims = src.shape().total_size_upper(1); for(unsigned int du = 0; du < upper_dims; ++du) { - if(std::is_integral::value) - { - uint32_t res = 0; - for(unsigned int x = 0; x < src_width; ++x) - { - res += static_cast(src[du * src_width + x]); - } - if(mean && src_width > 0) - { - res /= src_width; - } - dst[du] = saturate_cast(res); - } - else - { - const T *src_row_ptr = src.data() + du * reduce_elems; - - auto res = reduce_operation(src_row_ptr, reduce_elems, op); - if(mean && src_width > 0) - { - res /= src_width; - } - dst[du] = res; - } + const T *src_row_ptr = src.data() + du * reduce_elems; + auto res = reduce_operation(src_row_ptr, reduce_elems, op, 1); + dst[du] = res; } } break; @@ -125,32 +106,11 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap { for(unsigned int x = 0; x < src_width; ++x) { - if(std::is_integral::value) - { - uint32_t res = 0; - for(unsigned int y = 0; y < src_height; ++y) - { - res += static_cast(src[du * src_height * src_width + y * src_width + x]); - } - if(mean && src_height > 0) - { - res /= src_height; - } - dst[du * src_width + x] = saturate_cast(res); - } - else - { - auto res = T(0); - for(unsigned int y = 0; y < src_height; ++y) - { - res += src[du * src_height * src_width + y * src_width + x]; - } - if(mean && src_height > 0) - { - res /= src_height; - } - dst[du * src_width + x] = res; - } + const int in_offset = du * src_height * src_width + x; + const int out_offset = du * src_width + x; + const T *src_row_ptr = src.data() + in_offset; + auto res = reduce_operation(src_row_ptr, reduce_elems, op, src_width); + dst[out_offset] = res; } } } @@ -164,32 +124,11 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap { for(unsigned int y = 0; y < src_height; ++y) { - if(std::is_integral::value) - { - uint32_t res = T(0); - for(unsigned int z = 0; z < src_depth; ++z) - { - res += static_cast(src[du * src_depth * src_height * src_width + z * src_height * src_width + y * src_width + x]); - } - if(mean && src_depth > 0) - { - res /= src_depth; - } - dst[du * src_width * src_height + y * src_width + x] = saturate_cast(res); - } - else - { - auto res = T(0); - for(unsigned int z = 0; z < src_depth; ++z) - { - res += src[du * src_depth * src_height * src_width + z * src_height * src_width + y * src_width + x]; - } - if(mean && src_depth > 0) - { - res /= src_depth; - } - dst[du * src_width * src_height + y * src_width + x] = res; - } + const int in_offset = du * src_depth * src_height * src_width + y * src_width + 
x; + const int out_offset = du * src_width * src_height + y * src_width + x; + const T *src_row_ptr = src.data() + in_offset; + auto res = reduce_operation(src_row_ptr, reduce_elems, op, src_height * src_width); + dst[out_offset] = res; } } } @@ -206,34 +145,11 @@ SimpleTensor reduction_operation(const SimpleTensor &src, const TensorShap { for(unsigned int x = 0; x < src_width; ++x) { - if(std::is_integral::value) - { - uint32_t res = 0; - for(unsigned int w = 0; w < src_batch; ++w) - { - res += static_cast(src[du * src_batch * src_depth * src_height * src_width + w * src_width * src_height * src_depth + z * src_width * src_height + y * src_width + x]); - } - if(mean && src_batch > 0) - { - res /= src_batch; - } - - dst[du * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x] = saturate_cast(res); - } - else - { - auto res = T(0); - for(unsigned int w = 0; w < src_batch; ++w) - { - res += src[du * src_batch * src_depth * src_height * src_width + w * src_width * src_height * src_depth + z * src_width * src_height + y * src_width + x]; - } - if(mean && src_batch > 0) - { - res /= src_batch; - } - - dst[du * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x] = res; - } + const int in_offset = du * src_batch * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x; + const int out_offset = du * src_depth * src_height * src_width + z * src_width * src_height + y * src_width + x; + const T *src_row_ptr = src.data() + in_offset; + auto res = reduce_operation(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth); + dst[out_offset] = res; } } } -- cgit v1.2.1