24 files changed, 767 insertions, 78 deletions
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h index 01980d9793..eeb3e7699d 100644 --- a/arm_compute/core/CL/CLHelpers.h +++ b/arm_compute/core/CL/CLHelpers.h @@ -53,6 +53,14 @@ static constexpr const unsigned int max_cl_vector_width = 16; */ std::string get_cl_type_from_data_type(const DataType &dt); +/** Translates fixed point tensor data type to the underlying OpenCL type. + * + * @param[in] dt @ref DataType to be translated to OpenCL type. + * + * @return The string specifying the underlying OpenCL type to be used. + */ +std::string get_underlying_cl_type_from_data_type(const DataType &dt); + /** Translates a given gpu device target to string. * * @param[in] target Given gpu target. diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h b/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h index eda4c66883..e85e0ec232 100644 --- a/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h +++ b/arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h @@ -52,9 +52,9 @@ public: ~CLDepthConcatenateKernel() = default; /** Initialise the kernel's inputs and output * - * @param[in] input Input tensor. Data types supported: F32. + * @param[in] input Input tensor. Data types supported: QS8/QS16/F16/F32. * @param[in] depth_offset The offset on the Z axis. - * @param[in,out] output Output tensor. Data types supported: F32. + * @param[in,out] output Output tensor. Data types supported: Same as @p input. * * @note: The output tensor's low two dimensions can't be smaller than the input one's. * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2. diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h index 7384cd1f02..b22d37bfe6 100644 --- a/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h @@ -51,9 +51,9 @@ public: ~NEDepthConcatenateKernel() = default; /** Initialise the kernel's inputs and output * - * @param[in] input Input tensor. Data types supported: F32. + * @param[in] input Input tensor. Data types supported: QS8/QS16/F16/F32. * @param[in] depth_offset The offset on the Z axis. - * @param[in,out] output Output tensor. Data types supported: F32. + * @param[in,out] output Output tensor. Data types supported: Same as @p input. * * @note: The output tensor's low two dimensions can't be smaller than the input one's. * @note: The gaps between the two lowest dimensions of input and output need to be divisible by 2. 
@@ -66,11 +66,15 @@ public: BorderSize border_size() const override; private: - const ITensor *_input; - ITensor *_output; - int _top_bottom; - int _left_right; - unsigned int _depth_offset; + using DepthConcatFunction = void(const ITensor *in, ITensor *out, std::pair<int, int> start_xy, int depth_offset, const Window &window); + +private: + DepthConcatFunction *_func; + const ITensor *_input; + ITensor *_output; + int _top_bottom; + int _left_right; + unsigned int _depth_offset; }; } #endif /* __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__ */ diff --git a/arm_compute/runtime/CL/functions/CLDepthConcatenate.h b/arm_compute/runtime/CL/functions/CLDepthConcatenate.h index 3199936b82..77997f6bd1 100644 --- a/arm_compute/runtime/CL/functions/CLDepthConcatenate.h +++ b/arm_compute/runtime/CL/functions/CLDepthConcatenate.h @@ -29,14 +29,15 @@ #include "arm_compute/core/Window.h" #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h" +#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" + #include <memory> #include <vector> namespace arm_compute { class ICLTensor; -class CLDepthConcatenateKernel; -class CLFillBorderKernel; /** Basic function to execute concatenate tensors along z axis. This function calls the following kernels: * @@ -51,8 +52,8 @@ public: CLDepthConcatenate(); /** Initialise the kernel's inputs vector and output. * - * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: F32. - * @param[out] output Output tensor. Data types supported: F32. + * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QS8/QS16/F16/F32. + * @param[out] output Output tensor. Data types supported: Same as @p input. */ void configure(std::vector<ICLTensor *> inputs_vector, ICLTensor *output); diff --git a/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h b/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h index 02ff1227c7..cc65099575 100644 --- a/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h +++ b/arm_compute/runtime/NEON/functions/NEDepthConcatenate.h @@ -26,14 +26,15 @@ #include "arm_compute/runtime/IFunction.h" +#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h" +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" + #include <memory> #include <vector> namespace arm_compute { class ITensor; -class NEDepthConcatenateKernel; -class NEFillBorderKernel; /** Basic function to execute concatenate tensors along z axis. This function calls the following kernels: * @@ -48,8 +49,8 @@ public: NEDepthConcatenate(); /** Initialise the kernel's inputs vector and output. * - * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: F32. - * @param[out] output Output tensor. Data types supported: F32. + * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: QS8/QS16/F16/F32. + * @param[out] output Output tensor. Data types supported: Same as @p inputs_vector. 
*/ void configure(std::vector<ITensor *> inputs_vector, ITensor *output); diff --git a/scripts/check_clang-tidy.py b/scripts/check_clang-tidy.py index e80b460011..237ed541f4 100755 --- a/scripts/check_clang-tidy.py +++ b/scripts/check_clang-tidy.py @@ -39,7 +39,9 @@ if __name__ == "__main__": ("Validation.cpp" in line and "parameter 'classified_labels' is unused" in line) or ("Validation.cpp" in line and "parameter 'expected_labels' is unused" in line) or ("Reference.cpp" in line and "parameter 'rois' is unused" in line) or + ("Reference.cpp" in line and "parameter 'shapes' is unused" in line) or ("ReferenceCPP.cpp" in line and "parameter 'rois' is unused" in line) or + ("ReferenceCPP.cpp" in line and "parameter 'srcs' is unused" in line) or ("NEGEMMMatrixMultiplyKernel.cpp" in line and "do not use C-style cast to convert between unrelated types" in line) or "3rdparty" in line): continue diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index dd87e778d7..1073b39ca7 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -100,6 +100,19 @@ std::string get_cl_type_from_data_type(const DataType &dt) } } +std::string get_underlying_cl_type_from_data_type(const DataType &dt) +{ + switch(dt) + { + case DataType::QS8: + return "char"; + case DataType::QS16: + return "short"; + default: + return get_cl_type_from_data_type(dt); + } +} + const std::string &string_from_target(GPUTarget target) { static std::map<GPUTarget, const std::string> gpu_target_map = diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl index 00f5189508..a92ab5bdad 100644 --- a/src/core/CL/cl_kernels/concatenate.cl +++ b/src/core/CL/cl_kernels/concatenate.cl @@ -25,29 +25,35 @@ /** This kernel concatenates the input tensor into the output tensor along the third dimension * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8, QS16, F16, F32 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F32 + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] offset The offset to the first valid element of the output tensor in bytes + * @param[in] offsets The offsets to the first valid element of the output tensor in bytes */ __kernel void concatenate_depth( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst), - unsigned int offset) + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + int3 offsets) { - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - float4 source_values = vload4(0, (__global float *)src.ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, -offsets.x, -offsets.y, 0)); - vstore4(source_values, 0, (__global float *)(dst.ptr + offset)); + VSTORE(VEC_SIZE) + (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offsets.z)); } diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h index 29a43f769b..0b6d92dfd0 100644 --- a/src/core/CL/cl_kernels/helpers.h +++ b/src/core/CL/cl_kernels/helpers.h @@ -30,6 +30,12 @@ #define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) +#define VLOAD_STR(size) vload##size +#define VLOAD(size) VLOAD_STR(size) + +#define VSTORE_STR(size) vstore##size +#define VSTORE(size) VSTORE_STR(size) + #define VEC_DATA_TYPE_STR(type, size) type##size #define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) diff --git a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp index 73f1ba15df..6a699ae710 100644 --- a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp +++ b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp @@ -35,6 +35,10 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "support/ToolchainSupport.h" + +#include <map> + using namespace arm_compute; CLDepthConcatenateKernel::CLDepthConcatenateKernel() @@ -49,12 +53,22 @@ BorderSize CLDepthConcatenateKernel::border_size() const void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + static std::map<int, std::pair<std::string, int>> configs_map = + { + { 1, { "uchar", 16 } }, + { 2, { "ushort", 8 } }, + { 4, { "uint", 4 } }, + { 8, { "ulong", 2 } }, + }; + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2)); 
ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0)); ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1)); ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output); + ARM_COMPUTE_ERROR_ON(configs_map.find(input->info()->element_size()) == configs_map.end()); // The gaps between the two lowest dimensions of input and output need to be divisible by 2 // Otherwise it is not clear how the padding should be added onto the input tensor @@ -64,33 +78,44 @@ void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int de _input = input; _output = output; + // Add build options + auto config = configs_map.find(static_cast<int>(input->info()->element_size())); + std::set<std::string> build_opts; + build_opts.emplace(("-DDATA_TYPE=" + config->second.first)); + build_opts.emplace(("-DVEC_SIZE=" + support::cpp11::to_string(config->second.second))); + // Create kernel - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth")); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth", build_opts)); // Configure kernel window _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2; _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2; - const unsigned int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2] + _left_right * output->info()->strides_in_bytes()[0] + _top_bottom * - output->info()->strides_in_bytes()[1]; + const int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2]; - const unsigned int num_elems_processed_per_iteration = 4; - const unsigned int num_elems_read_per_iteration = 4; + const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + const unsigned int num_elems_read_per_iteration = 16 / input->info()->element_size(); const unsigned int num_rows_read_per_iteration = 1; // The window needs to be based on input as we copy all the depths of input - Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size()); + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1)); + AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - AccessWindowRectangle(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration), - output_access); - + update_window_and_padding(win, input_access, output_access); output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); - unsigned int idx = 2 * num_arguments_per_2D_tensor(); // Skip the input and output parameters - _kernel.setArg<unsigned int>(idx, offset_to_first_elements_in_bytes); + unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output parameters + const cl_int3 offsets = + { + { + static_cast<cl_int>(_left_right), + static_cast<cl_int>(_top_bottom), + static_cast<cl_int>(offset_to_first_elements_in_bytes), + } + }; + _kernel.setArg<cl_int3>(idx, offsets); ICLKernel::configure(win); } @@ -100,14 +125,14 @@ void CLDepthConcatenateKernel::run(const Window &window, 
cl::CommandQueue &queue ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - Window slice = window.first_slice_window_2D(); + Window slice = window.first_slice_window_3D(); do { unsigned int idx = 0; - add_2D_tensor_argument(idx, _input, slice); - add_2D_tensor_argument(idx, _output, slice); + add_3D_tensor_argument(idx, _input, slice); + add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice); } - while(window.slide_window_slice_2D(slice)); + while(window.slide_window_slice_3D(slice)); } diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp index 2c751a489c..7667491710 100644 --- a/src/core/CL/kernels/CLFillBorderKernel.cpp +++ b/src/core/CL/kernels/CLFillBorderKernel.cpp @@ -76,7 +76,7 @@ void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, Bo // Define select type required by replicate border > 1 const DataType dt = tensor->info()->data_type(); - std::string select_type = get_cl_type_from_data_type(dt); + std::string select_type = get_underlying_cl_type_from_data_type(dt); if(is_data_type_float(dt)) { select_type = (DataType::F32 == dt) ? "int" : "short"; @@ -84,7 +84,7 @@ void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, Bo // Define build options std::set<std::string> build_opts; - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt))); + build_opts.emplace(("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt))); build_opts.emplace(("-DSELECT_TYPE=" + select_type)); build_opts.emplace(("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top))); build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom))); @@ -119,9 +119,14 @@ void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, Bo case DataType::U8: set_constant_border<uint8_t>(idx, constant_border_value); break; + case DataType::QS8: + case DataType::S8: + set_constant_border<int8_t>(idx, constant_border_value); + break; case DataType::U16: set_constant_border<uint16_t>(idx, constant_border_value); break; + case DataType::QS16: case DataType::S16: set_constant_border<int16_t>(idx, constant_border_value); break; diff --git a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp index 902490ec38..d58e4e0aa5 100644 --- a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp +++ b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp @@ -27,17 +27,76 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include <arm_neon.h> +#include <cstdint> using namespace arm_compute; +namespace +{ +// Overloads of 128-bit vector loads +uint8x16_t loadq(const uint8_t *ptr) +{ + return vld1q_u8(ptr); +} +uint16x8_t loadq(const uint16_t *ptr) +{ + return vld1q_u16(ptr); +} +uint32x4_t loadq(const uint32_t *ptr) +{ + return vld1q_u32(ptr); +} +// Overloads of 128-bit vector stores +void storeq(uint8_t *ptr, uint8x16_t val) +{ + return vst1q_u8(ptr, val); +} +void storeq(uint16_t *ptr, uint16x8_t val) +{ + return vst1q_u16(ptr, val); +} +void storeq(uint32_t *ptr, uint32x4_t val) +{ + return vst1q_u32(ptr, val); +} + +template <typename T> +void depth_concat(const ITensor *in, 
ITensor *out, std::pair<int, int> start_xy, int depth_offset, const Window &window) +{ + const int start_x = start_xy.first; + const int start_y = start_xy.second; + + // Offset input + const int input_offset_to_first_elements_in_bytes = in->info()->offset_first_element_in_bytes() - start_x * in->info()->strides_in_bytes()[0] - start_y * in->info()->strides_in_bytes()[1]; + uint8_t *input_ptr = in->buffer() + input_offset_to_first_elements_in_bytes; + + // Offset output + const unsigned int output_offset_to_first_elements_in_bytes = out->info()->offset_first_element_in_bytes() + depth_offset * out->info()->strides_in_bytes()[2]; + uint8_t *output_ptr = out->buffer() + output_offset_to_first_elements_in_bytes; + + Iterator input(in, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast<const T *>(input_ptr + input.offset()); + const auto out_ptr = reinterpret_cast<T *>(output_ptr + output.offset()); + + storeq(out_ptr, loadq(in_ptr)); + }, + input, output); +} +} // namespace + NEDepthConcatenateKernel::NEDepthConcatenateKernel() - : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0) + : _func(nullptr), _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0) { } @@ -48,8 +107,9 @@ BorderSize NEDepthConcatenateKernel::border_size() const void NEDepthConcatenateKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2)); ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0)); ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1)); @@ -60,18 +120,36 @@ void NEDepthConcatenateKernel::configure(const ITensor *input, unsigned int dept ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2); ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2); + _func = nullptr; _input = input; _output = output; _depth_offset = depth_offset; _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2; _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2; - const unsigned int num_elems_processed_per_iteration = 4; - const unsigned int num_elems_read_per_iteration = 4; + switch(input->info()->data_type()) + { + case DataType::QS8: + _func = &depth_concat<uint8_t>; + break; + case DataType::QS16: + case DataType::F16: + _func = &depth_concat<uint16_t>; + break; + case DataType::F32: + _func = &depth_concat<uint32_t>; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + + const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + const unsigned int num_elems_read_per_iteration = 16 / input->info()->element_size(); const unsigned int num_rows_read_per_iteration = 1; // The window needs to be based on input as we copy all the depths of input - Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), 
border_size()); + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1)); AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); @@ -85,21 +163,7 @@ void NEDepthConcatenateKernel::run(const Window &window) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); - // Offset output - const unsigned int offset_to_first_elements_in_bytes = _output->info()->offset_first_element_in_bytes() + _left_right * _output->info()->strides_in_bytes()[0] + _top_bottom * - _output->info()->strides_in_bytes()[1] + _depth_offset * _output->info()->strides_in_bytes()[2]; - uint8_t *output_ptr = _output->buffer() + offset_to_first_elements_in_bytes; - - Iterator input(_input, window); - Iterator output(_output, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast<const float *>(input.ptr()); - const auto out_ptr = reinterpret_cast<float *>(output_ptr + output.offset()); - - vst1q_f32(out_ptr, vld1q_f32(in_ptr)); - }, - input, output); + (*_func)(_input, _output, std::make_pair(_left_right, _top_bottom), _depth_offset, window); } diff --git a/src/runtime/CL/functions/CLDepthConcatenate.cpp b/src/runtime/CL/functions/CLDepthConcatenate.cpp index 34778c6980..d82f96c431 100644 --- a/src/runtime/CL/functions/CLDepthConcatenate.cpp +++ b/src/runtime/CL/functions/CLDepthConcatenate.cpp @@ -24,8 +24,6 @@ #include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h" -#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" diff --git a/src/runtime/NEON/functions/NEDepthConcatenate.cpp b/src/runtime/NEON/functions/NEDepthConcatenate.cpp index 8661fe5fda..c1025d203e 100644 --- a/src/runtime/NEON/functions/NEDepthConcatenate.cpp +++ b/src/runtime/NEON/functions/NEDepthConcatenate.cpp @@ -25,8 +25,6 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensor.h" -#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h" -#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/NEScheduler.h" diff --git a/tests/dataset/DataTypeDatasets.h b/tests/dataset/DataTypeDatasets.h index 7198f79007..5f313711c0 100644 --- a/tests/dataset/DataTypeDatasets.h +++ b/tests/dataset/DataTypeDatasets.h @@ -166,11 +166,11 @@ public: }; /** Supported CNN fixed point types. */ -class CNNFixedPointDataTypes final : public DataTypes<1> +class CNNFixedPointDataTypes final : public DataTypes<2> { public: CNNFixedPointDataTypes() - : DataTypes{ DataType::QS8 } + : DataTypes{ DataType::QS8, DataType::QS16 } { } diff --git a/tests/validation/CL/DepthConcatenateLayer.cpp b/tests/validation/CL/DepthConcatenateLayer.cpp new file mode 100644 index 0000000000..a9b727b769 --- /dev/null +++ b/tests/validation/CL/DepthConcatenateLayer.cpp @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "CL/CLAccessor.h" +#include "Globals.h" +#include "PaddingCalculator.h" +#include "TensorLibrary.h" +#include "TypePrinter.h" +#include "Utils.h" +#include "validation/Datasets.h" +#include "validation/Helpers.h" +#include "validation/Reference.h" +#include "validation/Validation.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/CLTensor.h" +#include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "arm_compute/runtime/CL/functions/CLDepthConcatenate.h" + +#include "support/ToolchainSupport.h" + +#include "boost_wrapper.h" + +#include <algorithm> +#include <memory> +#include <random> +#include <string> +#include <vector> + +using namespace arm_compute; +using namespace arm_compute::test; +using namespace arm_compute::test::cl; +using namespace arm_compute::test::validation; + +namespace +{ +/** Compute OpenCL depth concatenate layer function. + * + * @param[in] shapes List of shapes to concatenate + * @param[in] dt Datatype of tensors + * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of fixed point numbers. + * + * @return Computed output tensor. 
+ */ +CLTensor compute_depth_concatenate_layer(const std::vector<TensorShape> &shapes, DataType dt, int fixed_point_position = 0) +{ + std::vector<std::unique_ptr<CLTensor>> srcs{}; + TensorShape dst_shape = calculate_depth_concatenate_shape(shapes); + + // Create tensors + for(unsigned int i = 0; i < shapes.size(); ++i) + { + srcs.push_back(support::cpp14::make_unique<CLTensor>()); + srcs[i]->allocator()->init(TensorInfo(shapes[i], 1, dt, fixed_point_position)); + } + CLTensor dst = create_tensor<CLTensor>(dst_shape, dt, 1, fixed_point_position); + + // Create a vector of raw pointer + std::vector<ICLTensor *> srcs_raw{}; + srcs_raw.resize(srcs.size()); + std::transform(srcs.begin(), srcs.end(), srcs_raw.begin(), [](std::unique_ptr<CLTensor> const & t) + { + return t.get(); + }); + + // Create and configure function + CLDepthConcatenate depth_concat; + depth_concat.configure(srcs_raw, &dst); + + // Allocate tensors + for(auto &t : srcs) + { + t->allocator()->allocate(); + } + dst.allocator()->allocate(); + + for(const auto &t : srcs) + { + BOOST_TEST(!t->info()->is_resizable()); + } + BOOST_TEST(!dst.info()->is_resizable()); + + // Fill tensors + for(unsigned int i = 0; i < srcs.size(); ++i) + { + library->fill_tensor_uniform(CLAccessor(*srcs[i]), i); + } + + // Compute function + depth_concat.run(); + return dst; +} +} // namespace + +#ifndef DOXYGEN_SKIP_THIS +BOOST_AUTO_TEST_SUITE(CL) +BOOST_AUTO_TEST_SUITE(DepthConcatenateLayer) + +BOOST_AUTO_TEST_SUITE(Float) +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * CNNFloatDataTypes(), shape, dt) +{ + // Create input shapes + std::vector<unsigned int> depths = { 4, 6, 11, 13 }; + std::vector<TensorShape> shapes(depths.size(), shape); + for(unsigned int i = 0; i < shapes.size(); ++i) + { + shapes[i].set(2, depths[i]); + } + + // Compute function + CLTensor dst = compute_depth_concatenate_layer(shapes, dt); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_depth_concatenate_layer(shapes, dt); + + // Validate output + validate(CLAccessor(dst), ref_dst); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmallPad, CNNFloatDataTypes(), dt) +{ + // Create input shapes + std::vector<TensorShape> shapes{ TensorShape(12u, 12u, 14u, 8u), TensorShape(14u, 14u, 12u, 8u), TensorShape(16u, 16u, 11u, 8u) }; + + // Compute function + CLTensor dst = compute_depth_concatenate_layer(shapes, dt); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_depth_concatenate_layer(shapes, dt); + + // Validate output + validate(CLAccessor(dst), ref_dst); +} +BOOST_AUTO_TEST_SUITE_END() + +BOOST_AUTO_TEST_SUITE(Quantized) +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * CNNFixedPointDataTypes() * boost::unit_test::data::xrange(3, 6, 1), shape, dt, fixed_point_position) +{ + // Create input shapes + std::vector<unsigned int> depths = { 4, 6, 11, 13 }; + std::vector<TensorShape> shapes(depths.size(), shape); + for(unsigned int i = 0; i < shapes.size(); ++i) + { + shapes[i].set(2, depths[i]); + } + + // Compute function + CLTensor dst = compute_depth_concatenate_layer(shapes, dt, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_depth_concatenate_layer(shapes, dt, fixed_point_position); + + // Validate output + validate(CLAccessor(dst), ref_dst); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) 
+BOOST_DATA_TEST_CASE(RunSmallPad, CNNFixedPointDataTypes() * boost::unit_test::data::xrange(3, 5, 1), dt, fixed_point_position) +{ + // Create input shapes + std::vector<TensorShape> shapes{ TensorShape(12u, 12u, 14u, 8u), TensorShape(14u, 14u, 12u, 8u), TensorShape(16u, 16u, 11u, 8u) }; + + // Compute function + CLTensor dst = compute_depth_concatenate_layer(shapes, dt, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_depth_concatenate_layer(shapes, dt, fixed_point_position); + + // Validate output + validate(CLAccessor(dst), ref_dst); +} +BOOST_AUTO_TEST_SUITE_END() + +BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() +#endif /* DOXYGEN_SKIP_THIS */ diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h index a551da731e..cae1976bd6 100644 --- a/tests/validation/Helpers.h +++ b/tests/validation/Helpers.h @@ -172,6 +172,34 @@ inline void fill_mask_from_pattern(uint8_t *mask, int cols, int rows, MatrixPatt } } +/** Calculate output tensor shape give a vector of input tensor to concatenate + * + * @param[in] input_shapes Shapes of the tensors to concatenate across depth. + * + * @return The shape of output concatenated tensor. + */ +inline TensorShape calculate_depth_concatenate_shape(std::vector<TensorShape> input_shapes) +{ + TensorShape out_shape = input_shapes.at(0); + + unsigned int max_x = 0; + unsigned int max_y = 0; + unsigned int depth = 0; + + for(auto const &shape : input_shapes) + { + max_x = std::max<unsigned int>(shape.x(), max_x); + max_y = std::max<unsigned int>(shape.y(), max_y); + depth += shape.z(); + } + + out_shape.set(0, max_x); + out_shape.set(1, max_y); + out_shape.set(2, depth); + + return out_shape; +} + /** Create a vector of random ROIs. * * @param[in] shape The shape of the input tensor. diff --git a/tests/validation/NEON/DepthConcatenateLayer.cpp b/tests/validation/NEON/DepthConcatenateLayer.cpp new file mode 100644 index 0000000000..6ab5885868 --- /dev/null +++ b/tests/validation/NEON/DepthConcatenateLayer.cpp @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "Globals.h" +#include "NEON/NEAccessor.h" +#include "PaddingCalculator.h" +#include "TensorLibrary.h" +#include "TypePrinter.h" +#include "Utils.h" +#include "validation/Datasets.h" +#include "validation/Helpers.h" +#include "validation/Reference.h" +#include "validation/Validation.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/NEON/functions/NEDepthConcatenate.h" +#include "arm_compute/runtime/Tensor.h" +#include "arm_compute/runtime/TensorAllocator.h" + +#include "support/ToolchainSupport.h" + +#include "boost_wrapper.h" + +#include <algorithm> +#include <memory> +#include <random> +#include <string> +#include <vector> + +using namespace arm_compute; +using namespace arm_compute::test; +using namespace arm_compute::test::neon; +using namespace arm_compute::test::validation; + +namespace +{ +/** Compute NEON depth concatenate layer function. + * + * @param[in] shapes List of shapes to concatenate + * @param[in] dt Datatype of tensors + * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of fixed point numbers. + * + * @return Computed output tensor. + */ +Tensor compute_depth_concatenate_layer(const std::vector<TensorShape> &shapes, DataType dt, int fixed_point_position = 0) +{ + std::vector<std::unique_ptr<Tensor>> srcs{}; + TensorShape dst_shape = calculate_depth_concatenate_shape(shapes); + + // Create tensors + for(unsigned int i = 0; i < shapes.size(); ++i) + { + srcs.push_back(support::cpp14::make_unique<Tensor>()); + srcs[i]->allocator()->init(TensorInfo(shapes[i], 1, dt, fixed_point_position)); + } + Tensor dst = create_tensor<Tensor>(dst_shape, dt, 1, fixed_point_position); + + // Create a vector of raw pointer + std::vector<ITensor *> srcs_raw{}; + srcs_raw.resize(srcs.size()); + std::transform(srcs.begin(), srcs.end(), srcs_raw.begin(), [](std::unique_ptr<Tensor> const & t) + { + return t.get(); + }); + + // Create and configure function + NEDepthConcatenate depth_concat; + depth_concat.configure(srcs_raw, &dst); + + // Allocate tensors + for(auto &t : srcs) + { + t->allocator()->allocate(); + } + dst.allocator()->allocate(); + + for(const auto &t : srcs) + { + BOOST_TEST(!t->info()->is_resizable()); + } + BOOST_TEST(!dst.info()->is_resizable()); + + // Fill tensors + for(unsigned int i = 0; i < srcs.size(); ++i) + { + library->fill_tensor_uniform(NEAccessor(*srcs[i]), i); + } + + // Compute function + depth_concat.run(); + + return dst; +} +} // namespace + +#ifndef DOXYGEN_SKIP_THIS +BOOST_AUTO_TEST_SUITE(NEON) +BOOST_AUTO_TEST_SUITE(DepthConcatenateLayer) + +BOOST_AUTO_TEST_SUITE(Float) +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * CNNFloatDataTypes(), shape, dt) +{ + // Create input shapes + std::vector<unsigned int> depths = { 4, 6, 11, 13 }; + std::vector<TensorShape> shapes(depths.size(), shape); + for(unsigned int i = 0; i < shapes.size(); ++i) + { + shapes[i].set(2, depths[i]); + } + + // Compute function + Tensor dst = compute_depth_concatenate_layer(shapes, dt); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_depth_concatenate_layer(shapes, dt); + + // Validate output + validate(NEAccessor(dst), ref_dst); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmallPad, CNNFloatDataTypes(), dt) +{ + // Create input shapes + std::vector<TensorShape> shapes{ TensorShape(12u, 12u, 14u, 8u), TensorShape(14u, 14u, 12u, 8u), 
TensorShape(16u, 16u, 11u, 8u) }; + + // Compute function + Tensor dst = compute_depth_concatenate_layer(shapes, dt); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_depth_concatenate_layer(shapes, dt); + + // Validate output + validate(NEAccessor(dst), ref_dst); +} +BOOST_AUTO_TEST_SUITE_END() + +BOOST_AUTO_TEST_SUITE(Quantized) +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * CNNFixedPointDataTypes() * boost::unit_test::data::xrange(3, 6, 1), shape, dt, fixed_point_position) +{ + // Create input shapes + std::vector<unsigned int> depths = { 4, 6, 11, 13 }; + std::vector<TensorShape> shapes(depths.size(), shape); + for(unsigned int i = 0; i < shapes.size(); ++i) + { + shapes[i].set(2, depths[i]); + } + + // Compute function + Tensor dst = compute_depth_concatenate_layer(shapes, dt, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_depth_concatenate_layer(shapes, dt, fixed_point_position); + + // Validate output + validate(NEAccessor(dst), ref_dst); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmallPad, CNNFixedPointDataTypes() * boost::unit_test::data::xrange(3, 5, 1), dt, fixed_point_position) +{ + // Create input shapes + std::vector<TensorShape> shapes{ TensorShape(12u, 12u, 14u, 8u), TensorShape(14u, 14u, 12u, 8u), TensorShape(16u, 16u, 11u, 8u) }; + + // Compute function + Tensor dst = compute_depth_concatenate_layer(shapes, dt, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_depth_concatenate_layer(shapes, dt, fixed_point_position); + + // Validate output + validate(NEAccessor(dst), ref_dst); +} +BOOST_AUTO_TEST_SUITE_END() + +BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() +#endif /* DOXYGEN_SKIP_THIS */ diff --git a/tests/validation/Reference.cpp b/tests/validation/Reference.cpp index 04362f0dc1..857dd7c741 100644 --- a/tests/validation/Reference.cpp +++ b/tests/validation/Reference.cpp @@ -538,6 +538,30 @@ RawTensor Reference::compute_reference_convolution_layer(const TensorShape &inpu return ref_dst; } +RawTensor Reference::compute_reference_depth_concatenate_layer(const std::vector<TensorShape> &shapes, DataType dt, int fixed_point_position) +{ + std::vector<std::unique_ptr<RawTensor>> ref_srcs{}; + TensorShape dst_shape = calculate_depth_concatenate_shape(shapes); + + // Create tensors + for(unsigned int i = 0; i < shapes.size(); ++i) + { + ref_srcs.push_back(support::cpp14::make_unique<RawTensor>(RawTensor(shapes[i], dt, 1, fixed_point_position))); + } + RawTensor ref_dst = library->get(dst_shape, dt, 1, fixed_point_position); + + // Fill references + for(unsigned int i = 0; i < ref_srcs.size(); ++i) + { + library->fill_tensor_uniform(*ref_srcs[i], i); + } + + // Compute reference + ReferenceCPP::depth_concatenate_layer(ref_srcs, ref_dst); + + return ref_dst; +} + RawTensor Reference::compute_reference_fully_connected_layer(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, DataType dt, bool transpose_weights, int fixed_point_position) { diff --git a/tests/validation/Reference.h b/tests/validation/Reference.h index cbdeb012ce..37a072b60a 100644 --- a/tests/validation/Reference.h +++ b/tests/validation/Reference.h @@ -26,7 +26,6 @@ #include "RawTensor.h" #include "Types.h" -#include <vector> #include <vector> @@ -285,7 +284,7 @@ public: * @return Computed raw 
tensor. */ static RawTensor compute_reference_batch_normalization_layer(const TensorShape &shape0, const TensorShape &shape1, DataType dt, float epsilon, int fixed_point_position = 0); - /** Compute reference pixel-wise multiplication + /** Compute reference convolution layer * * @param[in] input_shape Shape for the input tensor * @param[in] weights_shape Shape for the weights tensor @@ -299,6 +298,15 @@ public: */ static RawTensor compute_reference_convolution_layer(const TensorShape &input_shape, const TensorShape &weights_shape, const TensorShape &bias_shape, const TensorShape &output_shape, DataType dt, const PadStrideInfo &conv_info, int fixed_point_position); + /** Compute reference depth concatenation layer + * + * @param[in] shapes Input tensor shapes (All dimensions should match apart from DimZ) + * @param[in] dt Data type to use + * @param[in] fixed_point_position (Optional) Number of bits for the fractional part of the fixed point numbers + * + * @return Computed raw tensor. + */ + static RawTensor compute_reference_depth_concatenate_layer(const std::vector<TensorShape> &shapes, DataType dt, int fixed_point_position = 0); /** Compute reference for fully connected layer function * * @param[in] input_shape Shape for the input tensor diff --git a/tests/validation/ReferenceCPP.cpp b/tests/validation/ReferenceCPP.cpp index c89b737598..105bfc4b6c 100644 --- a/tests/validation/ReferenceCPP.cpp +++ b/tests/validation/ReferenceCPP.cpp @@ -36,7 +36,9 @@ #include "boost_wrapper.h" +#include <algorithm> #include <functional> +#include <memory> #include <numeric> #include <vector> @@ -292,6 +294,19 @@ void ReferenceCPP::convolution_layer(const RawTensor &src, const RawTensor &weig boost::apply_visitor(tensor_visitors::convolution_layer_visitor(s, w, b, conv_info), d); } +// Depth concatenate layer +void ReferenceCPP::depth_concatenate_layer(const std::vector<std::unique_ptr<RawTensor>> &srcs, RawTensor &dst) +{ + std::vector<TensorVariant> ss; + ss.resize(srcs.size()); + std::transform(srcs.begin(), srcs.end(), ss.begin(), [](std::unique_ptr<RawTensor> const & t) + { + return TensorFactory::get_tensor(*t); + }); + TensorVariant d = TensorFactory::get_tensor(dst); + boost::apply_visitor(tensor_visitors::depth_concatenate_layer_visitor(ss), d); +} + // Fully connected layer void ReferenceCPP::fully_connected_layer(const RawTensor &src, const RawTensor &weights, const RawTensor &bias, RawTensor &dst) { diff --git a/tests/validation/ReferenceCPP.h b/tests/validation/ReferenceCPP.h index 10e5ab6bc6..d3c77a2243 100644 --- a/tests/validation/ReferenceCPP.h +++ b/tests/validation/ReferenceCPP.h @@ -28,6 +28,7 @@ #include "RawTensor.h" +#include <memory> #include <ostream> #include <vector> @@ -262,6 +263,12 @@ public: * @param[in] conv_info Pads and strides information for the convolution layer. */ static void convolution_layer(const RawTensor &src, const RawTensor &weights, const RawTensor &bias, RawTensor &dst, const PadStrideInfo &conv_info); + /** Depth concatenate layer from @p srcs to @p dst + * + * @param[in] srcs Input tensors. + * @param[out] dst Result tensor. 
+ */ + static void depth_concatenate_layer(const std::vector<std::unique_ptr<RawTensor>> &srcs, RawTensor &dst); /** Fully connected layer function * * @param[in] src Input tensor diff --git a/tests/validation/TensorOperations.h b/tests/validation/TensorOperations.h index adac70901d..bf9bceff0a 100644 --- a/tests/validation/TensorOperations.h +++ b/tests/validation/TensorOperations.h @@ -41,6 +41,7 @@ #include <array> #include <cmath> #include <random> +#include <string> #include <vector> namespace arm_compute @@ -1003,6 +1004,53 @@ void batch_normalization_layer(const Tensor<T> &in, Tensor<T> &out, const Tensor } } +// Depth Concatenate layer +template <typename T> +void depth_concatenate_layer(const std::vector<const Tensor<T> *> &srcs, Tensor<T> &out) +{ + unsigned depth_offset = 0; + const int width_out = out.shape().x(); + const int height_out = out.shape().y(); + const int depth_out = out.shape().z(); + const int out_stride_z = width_out * height_out; + const int batches = out.shape().total_size_upper(3); + + // Set output tensor to 0 + memset(out.data(), 0, out.num_elements() * element_size_from_data_type(out.data_type())); + + for(unsigned int i = 0; i < srcs.size(); ++i) + { + ARM_COMPUTE_ERROR_ON(srcs[i] == nullptr); + ARM_COMPUTE_ERROR_ON(srcs[i]->data_type() != out.data_type()); + ARM_COMPUTE_ERROR_ON(depth_offset >= out.shape().z()); + ARM_COMPUTE_ERROR_ON(batches != static_cast<int>(srcs[i]->shape().total_size_upper(3))); + + const Tensor<T> *src = srcs[i]; + const int width = src->shape().x(); + const int height = src->shape().y(); + const int depth = src->shape().z(); + const unsigned int x_diff = (width_out - width) / 2; + const unsigned int y_diff = (height_out - height) / 2; + + const T *src_ptr = src->data(); + for(int b = 0; b < batches; ++b) + { + const unsigned int offset_to_first_element = b * out_stride_z * depth_out + depth_offset * out_stride_z + + y_diff * width_out + x_diff; + for(int d = 0; d < depth; ++d) + { + for(int r = 0; r < height; ++r) + { + std::copy(src_ptr, src_ptr + width, out.data() + offset_to_first_element + d * out_stride_z + r * width_out); + src_ptr += width; + } + } + } + + depth_offset += depth; + } +} + // Convolution layer template <typename T> void convolution_layer(const Tensor<T> &in, const Tensor<T> &weights, const Tensor<T> &bias, Tensor<T> &out, const PadStrideInfo &conv_info) diff --git a/tests/validation/TensorVisitors.h b/tests/validation/TensorVisitors.h index 723302c973..fcc584dd46 100644 --- a/tests/validation/TensorVisitors.h +++ b/tests/validation/TensorVisitors.h @@ -30,6 +30,8 @@ #include "boost_wrapper.h" +#include <algorithm> +#include <memory> #include <ostream> #include <vector> @@ -253,7 +255,31 @@ private: const TensorVariant &_bias; PadStrideInfo _conv_info; }; +// Depth Concatenate Layer visitor +struct depth_concatenate_layer_visitor : public boost::static_visitor<> +{ +public: + explicit depth_concatenate_layer_visitor(const std::vector<TensorVariant> &srcs) + : _srcs(srcs) + { + } + template <typename T> + void operator()(Tensor<T> &out) const + { + std::vector<const Tensor<T> *> srcs; + srcs.resize(_srcs.size()); + std::transform(_srcs.begin(), _srcs.end(), srcs.begin(), [](const TensorVariant & t) + { + return &(boost::get<Tensor<T>>(t)); + }); + tensor_operations::depth_concatenate_layer(srcs, out); + } + +private: + const std::vector<TensorVariant> &_srcs; +}; +// Fully Connected Layer visitor struct fully_connected_layer_visitor : public boost::static_visitor<> { public: |
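As a worked example of the shape rule introduced in tests/validation/Helpers.h (calculate_depth_concatenate_shape): the concatenated output takes the maximum X and Y across the inputs and sums the Z dimensions, so the RunSmallPad test shapes (12,12,14), (14,14,12) and (16,16,11) concatenate into a (16,16,37) output while the batch dimension (8) is unchanged. The sketch below is standalone illustration code, not library code; the Shape3D type and concat_shape name are assumptions.

#include <algorithm>
#include <vector>

// Standalone sketch of the depth-concatenate output shape rule:
// out.x = max(x_i), out.y = max(y_i), out.z = sum(z_i).
struct Shape3D
{
    unsigned int x, y, z;
};

Shape3D concat_shape(const std::vector<Shape3D> &inputs)
{
    Shape3D out{ 0, 0, 0 };
    for(const auto &s : inputs)
    {
        out.x = std::max(out.x, s.x);
        out.y = std::max(out.y, s.y);
        out.z += s.z;
    }
    return out;
}

// concat_shape({{12, 12, 14}, {14, 14, 12}, {16, 16, 11}}) yields {16, 16, 37},
// matching the padded RunSmallPad shapes used by the CL and NEON tests above.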