From 3e570dbdb0cbcbc3314e8f3e4daf2cf385caf325 Mon Sep 17 00:00:00 2001
From: Michele Di Giorgio
Date: Fri, 24 Aug 2018 18:28:48 +0100
Subject: COMPMID-1304: NEDepthConvert : Add support for FP32 -> FP16 and FP16 -> FP32 + validate() function

Change-Id: I12e4696a454744f6d493ab3a53520d3acf3a1a26
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/145719
Tested-by: Jenkins
Reviewed-by: Georgios Pinitas
---
 .../NEON/kernels/NEDepthConvertLayerKernel.cpp     | 180 +++++++++++++++------
 src/runtime/NEON/functions/NEDepthConvertLayer.cpp |   9 +-
 2 files changed, 139 insertions(+), 50 deletions(-)

(limited to 'src')

diff --git a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
index 8280b52fcb..158f401084 100644
--- a/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthConvertLayerKernel.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
 
+#include "arm_compute/core/CPP/Validate.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -34,68 +35,90 @@
 
 using namespace arm_compute;
 
-namespace arm_compute
+namespace
 {
-class Coordinates;
-} // namespace arm_compute
-
-NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
-    : _input(nullptr), _output(nullptr), _policy(), _shift(0)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
 {
-}
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
+    ARM_COMPUTE_UNUSED(policy);
+    ARM_COMPUTE_RETURN_ERROR_ON(input == output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);
 
-void NEDepthConvertLayerKernel::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::S16 && output->data_type() != DataType::U16
+                                                                           && output->data_type() != DataType::S32),
+                                    "Only data_types supported [in] U8 -> [out] U16, S16, S32");
 
-    _input  = input;
-    _output = input;
-    _policy = policy;
-    _shift  = shift;
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32),
+                                    "Only data_types supported [in] U16 -> [out] U8, U32");
 
-    if(output != nullptr)
-    {
-        // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
-        set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::S32),
+                                    "Only data_types supported [in] S16 -> [out] U8, S32");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && output->data_type() != DataType::F32,
+                                    "Only data_types supported [in] F16 -> [out] F32");
 
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32);
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && output->data_type() != DataType::F16,
+                                    "Only data_types supported [in] F32 -> [out] F16");
 
-        // Set output
-        _output = output;
+    // Validate in case of configured output
+    if(output->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
     }
 
-    ARM_COMPUTE_ERROR_ON(shift >= 8);
-    ARM_COMPUTE_ERROR_ON(input == output && (data_size_from_type(input->info()->data_type()) != data_size_from_type(output->info()->data_type())));
+    return Status{};
+}
 
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16
-                             && output->info()->data_type() != DataType::S32),
-                             "Only data_types supported [in] U8 -> [out] U16, S16, S32");
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
 
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32),
-                             "Only data_types supported [in] U16 -> [out] U8, U32");
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
 
-    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32),
-                             "Only data_types supported [in] S16 -> [out] U8, S32");
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    output_access.set_valid_region(win, output->valid_region());
 
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
+    : _input(nullptr), _output(nullptr), _policy(), _shift(0)
+{
+}
+
+void NEDepthConvertLayerKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
+    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+    _input  = input;
+    _output = output;
+    _policy = policy;
+    _shift  = shift;
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, shift));
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICPPKernel::configure(win_config.second);
+}
 
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    if(output != nullptr)
-    {
-        AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-        update_window_and_padding(win, input_access, output_access);
-        output_access.set_valid_region(win, input->info()->valid_region());
-    }
-    else
-    {
-        // In-place computation
-        update_window_and_padding(win, input_access);
-    }
-    ICPPKernel::configure(win);
+Status NEDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, shift));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+
+    return Status{};
 }
 
 void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info)
@@ -103,8 +126,7 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(nullptr == _input);
-    ARM_COMPUTE_ERROR_ON(nullptr == _output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
     ARM_COMPUTE_ERROR_ON(_input == _output);
 
     Iterator input(_input, window);
@@ -341,6 +363,68 @@ void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info
             }
             break;
         }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            switch(_output->info()->data_type())
+            {
+                case DataType::F32:
+                {
+                    const float32x4_t scale = vdupq_n_f32(1 << _shift);
+
+                    /* Up-conversion F16 -> F32 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const float16x8x2_t texels =
+                        {
+                            {
+                                vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())),
+                                vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr()) + 8)
+                            }
+                        };
+
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale));
+                        vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale));
+                    },
+                    input, output);
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+        case DataType::F32:
+            switch(_output->info()->data_type())
+            {
+                case DataType::F16:
+                {
+                    const float32x4_t scale = vdupq_n_f32(1.f / (1 << _shift));
+
+                    /* Down-conversion F32 -> F16 */
+                    execute_window_loop(window, [&](const Coordinates & id)
+                    {
+                        const float32x4x4_t texels =
+                        {
+                            {
+                                vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 4), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 8), scale),
+                                vmulq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr()) + 12), scale)
+                            }
+                        };
+
+                        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
+                        vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()) + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
+                    },
+                    input, output);
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Output data type not supported");
+            }
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
             ARM_COMPUTE_ERROR("Not supported");
     }
diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index 9a75404fcd..0041c1f62e 100644
--- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,9 +30,14 @@
 
 using namespace arm_compute;
 
-void NEDepthConvertLayer::configure(ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
+void NEDepthConvertLayer::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEDepthConvertLayerKernel>();
     k->configure(input, output, policy, shift);
     _kernel = std::move(k);
 }
+
+Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+{
+    return NEDepthConvertLayerKernel::validate(input, output, policy, shift);
+}
-- 
cgit v1.2.1
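
Usage sketch (illustrative, not part of the patch): one way a caller might exercise the validate()/configure() entry points added above, assuming a build and CPU with FP16 support. The tensor names, shapes and the SATURATE policy below are assumptions made for the example; only the NEDepthConvertLayer API itself comes from the change.

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void convert_f16_tensor_to_f32()
    {
        // Hypothetical source/destination tensors; the 16x16 shape is arbitrary for the example.
        Tensor src{};
        Tensor dst{};
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F16));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

        // New in this patch: check the requested conversion up front, without configuring the kernel.
        const Status status = NEDepthConvertLayer::validate(src.info(), dst.info(), ConvertPolicy::SATURATE, 0 /* shift */);
        ARM_COMPUTE_ERROR_THROW_ON(status);

        // Configure, allocate backing memory, then run the F16 -> F32 conversion.
        NEDepthConvertLayer convert{};
        convert.configure(&src, &dst, ConvertPolicy::SATURATE, 0 /* shift */);

        src.allocator()->allocate();
        dst.allocator()->allocate();

        convert.run();
    }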