Diffstat (limited to 'src/gpu/cl/operators')
66 files changed, 2038 insertions, 1167 deletions
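Every hunk below is a mechanical restyle with no functional change: include directives are merged into a single block and sorted case-insensitively, long configure()/validate() signatures are rewrapped to one parameter per line with the pointer/reference tokens aligned, control-flow keywords gain a space before the parenthesis (if (, for (, switch (), and braced initializers drop their inner padding ({ 0 } becomes {0}). A minimal sketch of those conventions follows; CompileCtx, TensorInfo and the configure/example names are hypothetical stand-ins, and the exact clang-format options driving the sweep (a roughly 120-column limit is inferred from the wrapped lines) are an assumption, not stated anywhere in the diff.

    struct CompileCtx;
    struct TensorInfo;
    enum class ConvertPolicy { SATURATE, WRAP };

    // Pre-sweep style, as removed by the patch:
    //   void configure(const CompileCtx &ctx, const TensorInfo *src, TensorInfo *dst, ConvertPolicy policy);
    //   if(dst == nullptr) { ... }
    //   unsigned int _axis{ 0 };

    // Post-sweep style, as added by the patch: one parameter per line once the
    // column limit is exceeded, with '*' and '&' aligned on a common column.
    void configure(const CompileCtx &ctx,
                   const TensorInfo *src,
                   TensorInfo       *dst,
                   ConvertPolicy     policy);

    inline void example(TensorInfo *dst)
    {
        unsigned int axis{0}; // braced init without inner padding
        if (dst == nullptr)   // space after the control keyword
        {
            static_cast<void>(axis);
        }
    }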
diff --git a/src/gpu/cl/operators/ClActivation.cpp b/src/gpu/cl/operators/ClActivation.cpp index 74a818d738..66877ebcec 100644 --- a/src/gpu/cl/operators/ClActivation.cpp +++ b/src/gpu/cl/operators/ClActivation.cpp @@ -23,19 +23,21 @@ */ #include "src/gpu/cl/operators/ClActivation.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClActivationKernel.h" - #include "src/common/IOperator.h" #include "src/common/utils/LegacySupport.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/ClContext.h" +#include "src/gpu/cl/kernels/ClActivationKernel.h" namespace arm_compute { namespace opencl { -void ClActivation::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClActivation::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src, dst, act_info); auto k = std::make_unique<kernels::ClActivationKernel>(); @@ -53,13 +55,17 @@ namespace gpu { namespace opencl { -std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) +std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor &src, + const AclTensorDescriptor &dst, + const AclActivationDescriptor &act, + bool is_validate) { TensorInfo src_info = detail::convert_to_legacy_tensor_info(src); TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst); auto info = detail::convert_to_activation_info(act); - if(is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) + if (is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), + &dst_info.set_is_resizable(false), info))) { return std::make_tuple(nullptr, StatusCode::UnsupportedConfig); } @@ -68,7 +74,7 @@ std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensor act_op->configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, info); auto op = new arm_compute::IOperator(static_cast<IContext *>(this)); - if(op == nullptr) + if (op == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); return std::make_tuple(nullptr, StatusCode::OutOfMemory); diff --git a/src/gpu/cl/operators/ClActivation.h b/src/gpu/cl/operators/ClActivation.h index 348dc27929..4f25bb5f24 100644 --- a/src/gpu/cl/operators/ClActivation.h +++ b/src/gpu/cl/operators/ClActivation.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ACTIVATION_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -43,7 +44,10 @@ public: * @param[out] dst Destination tensor info. Data type supported: same as @p src * @param[in] activation_info Activation layer parameters. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &activation_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ActivationLayerInfo &activation_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClActivation::configure() diff --git a/src/gpu/cl/operators/ClAdd.cpp b/src/gpu/cl/operators/ClAdd.cpp index b9bf505bba..b58d0df58d 100644 --- a/src/gpu/cl/operators/ClAdd.cpp +++ b/src/gpu/cl/operators/ClAdd.cpp @@ -23,17 +23,20 @@ */ #include "src/gpu/cl/operators/ClAdd.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void ClAdd::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info); auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>(); @@ -41,8 +44,11 @@ void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1 _kernel = std::move(k); } -Status ClAdd::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status ClAdd::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info); } diff --git a/src/gpu/cl/operators/ClAdd.h b/src/gpu/cl/operators/ClAdd.h index a17ce7b5d6..7aed902f5d 100644 --- a/src/gpu/cl/operators/ClAdd.h +++ b/src/gpu/cl/operators/ClAdd.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ADD_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -65,7 +66,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -73,7 +78,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy, + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp index 05ea21b734..8f26ef003d 100644 --- a/src/gpu/cl/operators/ClCast.cpp +++ b/src/gpu/cl/operators/ClCast.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClCast.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCastKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClCast::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) +void ClCast::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + ConvertPolicy policy) { ARM_COMPUTE_LOG_PARAMS(src, dst, policy); auto k = std::make_unique<kernels::ClCastKernel>(); diff --git a/src/gpu/cl/operators/ClCast.h b/src/gpu/cl/operators/ClCast.h index 1b67ff7c8e..25d2293673 100644 --- a/src/gpu/cl/operators/ClCast.h +++ b/src/gpu/cl/operators/ClCast.h @@ -58,7 +58,8 @@ public: * @param[out] dst The destinatio tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. * @param[in] policy Conversion policy. 
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + void + configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCast::configure() diff --git a/src/gpu/cl/operators/ClConcatenate.cpp b/src/gpu/cl/operators/ClConcatenate.cpp index a27fc37cc4..31018b9768 100644 --- a/src/gpu/cl/operators/ClConcatenate.cpp +++ b/src/gpu/cl/operators/ClConcatenate.cpp @@ -23,9 +23,14 @@ */ #include "src/gpu/cl/operators/ClConcatenate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h" #include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h" #include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h" @@ -33,42 +38,39 @@ #include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h" #include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" - -#include "src/common/utils/Log.h" -#include "src/core/helpers/AutoConfiguration.h" - namespace arm_compute { namespace opencl { -void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis) +void ClConcatenate::configure(const CLCompileContext &compile_context, + const std::vector<ITensorInfo *> &src_vector, + ITensorInfo *dst, + size_t axis) { ARM_COMPUTE_ERROR_ON(dst == nullptr); ARM_COMPUTE_LOG_PARAMS(src_vector, dst, axis); _axis = axis; _num_inputs = src_vector.size(); - TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis); + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis); std::vector<const ITensorInfo *> const_src_vector(src_vector.size()); - std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), [](ITensorInfo * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t; - }); + std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), + [](ITensorInfo *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t; + }); // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type()); ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis)); unsigned int offset = 0; - switch(_axis) + switch (_axis) { case Window::DimX: { - switch(_num_inputs) + switch (_num_inputs) { case 2: { @@ -82,14 +84,15 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std { // Configure WidthConcatenate4Tensors kernel auto kernel = std::make_unique<kernels::ClWidthConcatenate4TensorsKernel>(); - kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), src_vector.at(3), dst); + kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), + src_vector.at(3), dst); _concat_kernels.emplace_back(std::move(kernel)); break; } default: { // Configure generic case WidthConcatenate kernels - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int 
i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique<kernels::ClWidthConcatenateKernel>(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -103,7 +106,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case Window::DimY: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique<kernels::ClHeightConcatenateKernel>(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -114,7 +117,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case Window::DimZ: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique<kernels::ClDepthConcatenateKernel>(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -125,7 +128,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case 3: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique<kernels::ClBatchConcatenateKernel>(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -148,25 +151,27 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2); unsigned int offset = 0; - switch(axis) + switch (axis) { case Window::DimX: { - switch(num_inputs) + switch (num_inputs) { case 2: // Validate WidthConcatenate2Tensors kernels if there are 2 inputs ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst)); break; case 4: // Validate WidthConcatenate4Tensors kernels if there are 4 inputs ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate( + src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst)); break; default: // Validate generic case of WidthConcatenate kernel - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst)); @@ -178,7 +183,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto } case Window::DimY: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -187,7 +192,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto } case Window::DimZ: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -196,7 +201,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto } case 3: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { 
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -207,7 +212,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto ARM_COMPUTE_ERROR("Axis not supported"); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis); ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size()); @@ -218,17 +223,17 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto void ClConcatenate::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } - if(static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs)) + if (static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs)) { ARM_COMPUTE_ERROR("Configured with different number of inputs"); } - if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) + if (_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) { ARM_COMPUTE_ERROR_ON(_concat_kernels.empty()); CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true); @@ -236,7 +241,7 @@ void ClConcatenate::run(ITensorPack &tensors) else { int i = 0; - for(auto &k : _concat_kernels) + for (auto &k : _concat_kernels) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); diff --git a/src/gpu/cl/operators/ClConcatenate.h b/src/gpu/cl/operators/ClConcatenate.h index de0cf84d2c..d8ce9d2a5c 100644 --- a/src/gpu/cl/operators/ClConcatenate.h +++ b/src/gpu/cl/operators/ClConcatenate.h @@ -57,7 +57,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src_vector. * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. 
*/ - void configure(const ClCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis); + void configure(const ClCompileContext &compile_context, + const std::vector<ITensorInfo *> &src_vector, + ITensorInfo *dst, + size_t axis); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClConcatenate::configure() @@ -71,8 +74,8 @@ public: private: std::vector<std::unique_ptr<IClKernel>> _concat_kernels{}; - unsigned int _num_inputs{ 0 }; - unsigned int _axis{ 0 }; + unsigned int _num_inputs{0}; + unsigned int _axis{0}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp index eb9475ccaa..2c3b0214fa 100644 --- a/src/gpu/cl/operators/ClConv2d.cpp +++ b/src/gpu/cl/operators/ClConv2d.cpp @@ -23,17 +23,17 @@ */ #include "src/gpu/cl/operators/ClConv2d.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" + +#include "src/common/utils/Log.h" #include "src/gpu/cl/operators/ClDirectConv2d.h" #include "src/gpu/cl/operators/ClGemmConv2d.h" #include "src/gpu/cl/operators/ClIndirectConv2d.h" #include "src/gpu/cl/operators/ClWinogradConv2d.h" -#include "src/common/utils/Log.h" - #include <memory> namespace @@ -48,7 +48,7 @@ namespace */ size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target) { - switch(gpu_target) + switch (gpu_target) { case arm_compute::GPUTarget::G76: case arm_compute::GPUTarget::G77: @@ -71,27 +71,33 @@ namespace opencl { using namespace arm_compute::misc::shape_calculator; -ClConv2d::ClConv2d() - : _operator() +ClConv2d::ClConv2d() : _operator() { } ClConv2d::~ClConv2d() = default; -void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info) +void ClConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info)); + ARM_COMPUTE_ERROR_THROW_ON( + ClConv2d::validate(src, weights, ((biases != nullptr) ? 
biases : nullptr), dst, conv2d_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info); - switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target())) + switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target())) { case ConvolutionMethod::WINOGRAD: { ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); auto f = std::make_unique<ClWinogradConv2d>(); - f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math); + f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, + conv2d_info.enable_fast_math); _operator = std::move(f); break; } @@ -125,35 +131,46 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s _aux_mem = _operator->workspace(); } -Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, +Status ClConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); const GPUTarget gpu_target = CLScheduler::get().target(); - switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target)) + switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target)) { case ConvolutionMethod::WINOGRAD: { //Validate Winograd - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClWinogradConv2d is not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClWinogradConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, + conv2d_info.act_info, conv2d_info.enable_fast_math)); break; } case ConvolutionMethod::DIRECT: { // Validate direct convolution layer - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClDirectConv2d is not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClDirectConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR( + ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); break; } case ConvolutionMethod::INDIRECT: { // Validate indirect convolution layer - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClIndirectConv2d is not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(ClIndirectConv2d::validate(src, weights, biases, dst, 
conv2d_info.conv_info, conv2d_info.act_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClIndirectConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR( + ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); break; } case ConvolutionMethod::GEMM: @@ -170,8 +187,12 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co return Status{}; } -ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info, const GPUTarget gpu_target) +ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info, + const GPUTarget gpu_target) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_ERROR_ON_NULLPTR(dst); @@ -191,20 +212,35 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>; using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>; - const std::vector<ConfigurationMethod> known_configs = - { + const std::vector<ConfigurationMethod> known_configs = { // Alexnet - ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), + PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), + ConvolutionMethod::DIRECT), // VGG16 / VGG19 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), + ConvolutionMethod::DIRECT), // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), + ConvolutionMethod::GEMM), // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), + ConvolutionMethod::GEMM), // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), + ConvolutionMethod::GEMM), // Mobilenet 160 - 
ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), + ConvolutionMethod::GEMM), }; const auto find_config = [&](ConfigurationMethod c) @@ -213,76 +249,89 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const const PadStrideInfo info = std::get<3>(config); const DataLayout data_layout = std::get<4>(config); - return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == src->data_layout()); + return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && + std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride() && (data_layout == src->data_layout()); }; std::vector<ConfigurationMethod>::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) { return (*found).second; } - if(dilation != Size2D(1U, 1U)) + if (dilation != Size2D(1U, 1U)) { return ConvolutionMethod::GEMM; } else { - if(src->data_layout() == DataLayout::NCHW) + if (src->data_layout() == DataLayout::NCHW) { // SRGAN - if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) - && (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info))) + if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && + (conv_info.pad_top() < 3) && + (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info))) { return ConvolutionMethod::DIRECT; } - if((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))) + if ((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && + (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))) { return ConvolutionMethod::FFT; } - if(src->dimension(idx_c) < 16) + if (src->dimension(idx_c) < 16) { return ConvolutionMethod::GEMM; } - return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; + return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) + ? 
ConvolutionMethod::WINOGRAD + : ConvolutionMethod::GEMM; } else { - const bool is_direct_valid = bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); - const bool is_wino_valid = bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)); + const bool is_direct_valid = + bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); + const bool is_wino_valid = + bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)); const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target); // SRGAN case - if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) - && is_direct_valid) + if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && + (conv_info.pad_top() < 3) && is_direct_valid) { return ConvolutionMethod::DIRECT; } // Floating-point case: GeMM/Direct/Winograd - if(is_data_type_float(src->data_type())) + if (is_data_type_float(src->data_type())) { // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); - const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8; - const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; - const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; - const bool is_ofm_lt_64 = weights->dimension(3U) < 64; - const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; - const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); - const bool is_m_one = output_shape[1] * output_shape[2] == 1; - const bool is_unit_stride = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1); - const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h); + TensorShape output_shape = + misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && + (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); + const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8; + const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; + const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; + const bool is_ofm_lt_64 = weights->dimension(3U) < 64; + const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; + const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); + const bool is_m_one = output_shape[1] * output_shape[2] == 1; + const bool is_unit_stride = + (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1); + const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h); // Run Winograd if valid and IFM >= 8 - if(is_wino_valid && is_ifm_ge_8) + if (is_wino_valid && is_ifm_ge_8) { - if(is_ofm_lte_8) + if (is_ofm_lte_8) { - if(gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD) + if (gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || + get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD) { return 
ConvolutionMethod::WINOGRAD; } @@ -294,18 +343,19 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const } // Direct convolution case - if(is_direct_valid) + if (is_direct_valid) { - if((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)) + if ((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || + get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)) { - if(is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm) + if (is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm) { return ConvolutionMethod::DIRECT; } } - else if(gpu_target == arm_compute::GPUTarget::G76) + else if (gpu_target == arm_compute::GPUTarget::G76) { - if((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16)) + if ((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16)) { return ConvolutionMethod::DIRECT; } @@ -314,21 +364,24 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const { ConvolutionMethod preferred_conv_method = ConvolutionMethod::DIRECT; - const bool is_indirect_valid = bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); + const bool is_indirect_valid = + bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); // indirect conv2d should be called when: // 1- When the kernel size is greater than 1x1 and less than or equal to 9x9 (81) // 2- When the kernel size is odd // 3- When the Gpu target is Arm Mali-G77 - if(is_indirect_valid) + if (is_indirect_valid) { const bool is_kernel_sz_odd = kernel_sz % 2; const bool is_g77 = gpu_target == GPUTarget::G77; - preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 ? ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT; + preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 + ? ConvolutionMethod::INDIRECT + : ConvolutionMethod::DIRECT; } // Direct/indirect convolution used for the first layer of the network - if(workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64) + if (workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64) { // In general, the question we should ask for the first convolution layer of a model is: // when the execution time of im2col + gemm < direct?. Since im2col does not depend on the OFM, it means that @@ -337,13 +390,13 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const return preferred_conv_method; } - if((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16) + if ((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16) { return preferred_conv_method; } // Direct convolution used for the last layer of the network - if(is_ofm_lte_8) + if (is_ofm_lte_8) { return preferred_conv_method; } diff --git a/src/gpu/cl/operators/ClConv2d.h b/src/gpu/cl/operators/ClConv2d.h index c6c366a762..0cf3cbc1ce 100644 --- a/src/gpu/cl/operators/ClConv2d.h +++ b/src/gpu/cl/operators/ClConv2d.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/IClOperator.h" @@ -112,15 +113,24 @@ public: * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo. 
* @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p src. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info = WeightsInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref ClConv2d * * Similar to ClConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will return the convolution called by @ref ClConv2d * @@ -137,11 +147,15 @@ public: * * @return the Convolution Method Hint */ - static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info, const GPUTarget gpu_target); + static ConvolutionMethod get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info, + const GPUTarget gpu_target); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp index 08122b6852..cf24c68d21 100644 --- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp +++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp @@ -23,16 +23,19 @@ */ #include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout); auto k = std::make_unique<kernels::ClConvertFullyConnectedWeightsKernel>(); @@ -40,9 +43,12 @@ void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_c _kernel = std::move(k); } -Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, + const ITensorInfo 
*dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { return kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h index 2794eb17b0..c46152081c 100644 --- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h +++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h @@ -43,14 +43,21 @@ public: * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClConvertFullyConnectedWeights::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCopy.cpp b/src/gpu/cl/operators/ClCopy.cpp index d3b83040d0..e2be7cebd4 100644 --- a/src/gpu/cl/operators/ClCopy.cpp +++ b/src/gpu/cl/operators/ClCopy.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClCopy.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCopyKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClCopy::validate(const ITensorInfo *src, const ITensorInfo *dst, Window * return kernels::ClCopyKernel::validate(src, dst, dst_window); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCopy.h b/src/gpu/cl/operators/ClCopy.h index 9b427f9675..fe9b58c607 100644 --- a/src/gpu/cl/operators/ClCopy.h +++ b/src/gpu/cl/operators/ClCopy.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_COPY_H #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -44,7 +45,10 @@ public: * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCopy::configure() diff --git a/src/gpu/cl/operators/ClCrop.cpp b/src/gpu/cl/operators/ClCrop.cpp index cef9f14c7d..6313e4fbb5 100644 --- a/src/gpu/cl/operators/ClCrop.cpp +++ b/src/gpu/cl/operators/ClCrop.cpp @@ -23,17 +23,22 @@ */ #include "src/gpu/cl/operators/ClCrop.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCropKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void ClCrop::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window); auto k = std::make_unique<kernels::ClCropKernel>(); @@ -41,9 +46,15 @@ void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInf _kernel = std::move(k); } -Status ClCrop::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +Status ClCrop::validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { return kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCrop.h b/src/gpu/cl/operators/ClCrop.h index 1cf1c9bff4..e845cf372c 100644 --- a/src/gpu/cl/operators/ClCrop.h +++ b/src/gpu/cl/operators/ClCrop.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_CROP_H #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -49,16 +50,27 @@ public: * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCrop::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDequantize.cpp b/src/gpu/cl/operators/ClDequantize.cpp index 0fccab63e0..eb6f9e7abb 100644 --- a/src/gpu/cl/operators/ClDequantize.cpp +++ b/src/gpu/cl/operators/ClDequantize.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClDequantizeKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClDequantizeKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp index 0215dba422..17a196ce6b 100644 --- a/src/gpu/cl/operators/ClDirectConv2d.cpp +++ b/src/gpu/cl/operators/ClDirectConv2d.cpp @@ -26,6 +26,8 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/kernels/ClActivationKernel.h" @@ -35,8 +37,6 @@ #include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h" #include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" -#include "src/common/utils/Log.h" - using namespace arm_compute::cl_direct_conv; namespace arm_compute @@ -53,7 +53,8 @@ ITensorPack select_activation_src_dst(ITensorPack &tensors) return pack; } -DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo +config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) { // Get GPU target GPUTarget 
gpu_target = CLScheduler::get().target(); @@ -65,8 +66,13 @@ DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *sr } // namespace -void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void ClDirectConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info); @@ -75,15 +81,17 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info); // Configure direct convolution kernel - const ActivationLayerInfo conv2d_act_info = (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info : ActivationLayerInfo(); - auto k = std::make_unique<kernels::ClDirectConv2dKernel>(); + const ActivationLayerInfo conv2d_act_info = + (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info + : ActivationLayerInfo(); + auto k = std::make_unique<kernels::ClDirectConv2dKernel>(); k->set_target(CLScheduler::get().target()); k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info, desc); _direct_conv_kernel = std::move(k); // Configure border handler PixelValue zero_value(0.f); - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { zero_value = PixelValue(0, src->data_type(), src->quantization_info()); } @@ -92,7 +100,7 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI _src_border_handler = std::move(b); // Fused activation is currently supported for NHWC and floating point types - if(act_info.enabled() && !conv2d_act_info.enabled()) + if (act_info.enabled() && !conv2d_act_info.enabled()) { auto a = std::make_unique<kernels::ClActivationKernel>(); a->configure(compile_context, dst, dst, act_info); @@ -103,14 +111,19 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI CLScheduler::get().tune_kernel_static(*_direct_conv_kernel); } -Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +Status ClDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { // Initialize the direct convolution descriptor const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc)); - if(act_info.enabled()) + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc)); + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info)); } @@ -124,7 +137,7 @@ void ClDirectConv2d::run(ITensorPack &tensors) // Run direct convolution 
CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false); // Run activation kernel - if(_activation_kernel) + if (_activation_kernel) { auto act_pack = select_activation_src_dst(tensors); CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false); diff --git a/src/gpu/cl/operators/ClDirectConv2d.h b/src/gpu/cl/operators/ClDirectConv2d.h index fedb9e971e..0f18490814 100644 --- a/src/gpu/cl/operators/ClDirectConv2d.h +++ b/src/gpu/cl/operators/ClDirectConv2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_DIRECT_CONV2D_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/IClOperator.h" @@ -59,7 +60,12 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -67,16 +73,20 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr<IClKernel> _direct_conv_kernel{ nullptr }; - std::unique_ptr<IClKernel> _src_border_handler{ nullptr }; - std::unique_ptr<IClKernel> _activation_kernel{ nullptr }; + std::unique_ptr<IClKernel> _direct_conv_kernel{nullptr}; + std::unique_ptr<IClKernel> _src_border_handler{nullptr}; + std::unique_ptr<IClKernel> _activation_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDirectConv3d.cpp b/src/gpu/cl/operators/ClDirectConv3d.cpp index 5d37f07f31..b08347936b 100644 --- a/src/gpu/cl/operators/ClDirectConv3d.cpp +++ b/src/gpu/cl/operators/ClDirectConv3d.cpp @@ -24,13 +24,19 @@ #include "src/gpu/cl/operators/ClDirectConv3d.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/gpu/cl/kernels/ClDirectConv3dKernel.h" namespace arm_compute { namespace opencl { -void ClDirectConv3d::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info) +void ClDirectConv3d::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0); @@ -40,7 +46,11 @@ void ClDirectConv3d::configure(const CLCompileContext &compile_context, const IT _direct_conv3d_kernel = std::move(k); } -Status ClDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status ClDirectConv3d::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const 
ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv3dKernel::validate(src0, src1, src2, dst, conv3d_info)); return Status{}; diff --git a/src/gpu/cl/operators/ClDirectConv3d.h b/src/gpu/cl/operators/ClDirectConv3d.h index fa58b5aedd..5fb32460e2 100644 --- a/src/gpu/cl/operators/ClDirectConv3d.h +++ b/src/gpu/cl/operators/ClDirectConv3d.h @@ -67,7 +67,12 @@ public: * @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused. * */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info); /** Static function to check if given info will lead to a valid configuration * @@ -75,14 +80,18 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr<IClKernel> _direct_conv3d_kernel{ nullptr }; + std::unique_ptr<IClKernel> _direct_conv3d_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */ diff --git a/src/gpu/cl/operators/ClElementwiseOperations.cpp b/src/gpu/cl/operators/ClElementwiseOperations.cpp index 32d2b88798..1325371d19 100644 --- a/src/gpu/cl/operators/ClElementwiseOperations.cpp +++ b/src/gpu/cl/operators/ClElementwiseOperations.cpp @@ -23,15 +23,18 @@ */ #include "src/gpu/cl/operators/ClElementwiseOperations.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" namespace arm_compute { namespace opencl { -void ClElementwiseDivision::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseDivision::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -39,12 +42,19 @@ void ClElementwiseDivision::configure(const ClCompileContext &compile_context, I _kernel = std::move(k); } -Status ClElementwiseDivision::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseDivision::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info); } -void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseMax::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -52,12 +62,19 @@ void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITenso _kernel = std::move(k); } -Status ClElementwiseMax::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseMax::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info); } -void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseMin::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -65,12 +82,19 @@ void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITenso _kernel = std::move(k); } -Status ClElementwiseMin::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseMin::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, 
act_info); } -void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -78,12 +102,19 @@ void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context _kernel = std::move(k); } -Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info); } -void ClElementwisePower::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwisePower::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -91,7 +122,10 @@ void ClElementwisePower::configure(const ClCompileContext &compile_context, ITen _kernel = std::move(k); } -Status ClElementwisePower::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwisePower::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info); } diff --git a/src/gpu/cl/operators/ClElementwiseOperations.h b/src/gpu/cl/operators/ClElementwiseOperations.h index 120049cb7f..de7c018d75 100644 --- a/src/gpu/cl/operators/ClElementwiseOperations.h +++ b/src/gpu/cl/operators/ClElementwiseOperations.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -48,14 +49,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseDivision::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max @@ -74,14 +82,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseMax::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min @@ -100,14 +115,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseMin::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference @@ -126,14 +148,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseSquaredDiff::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power @@ -152,14 +181,21 @@ public: * @param[out] dst Destination tensor info. Data types supported:F16/F32. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwisePower::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClElementwiseUnary.cpp b/src/gpu/cl/operators/ClElementwiseUnary.cpp index f94d402c05..914621183e 100644 --- a/src/gpu/cl/operators/ClElementwiseUnary.cpp +++ b/src/gpu/cl/operators/ClElementwiseUnary.cpp @@ -23,9 +23,8 @@ */ #include "src/gpu/cl/operators/ClElementwiseUnary.h" -#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClFill.cpp b/src/gpu/cl/operators/ClFill.cpp index ad22b15cff..817b15ab20 100644 --- a/src/gpu/cl/operators/ClFill.cpp +++ b/src/gpu/cl/operators/ClFill.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClFill.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClFillKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClFill::configure(const ClCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window) +void ClFill::configure(const ClCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *dst_window) { ARM_COMPUTE_LOG_PARAMS(tensor, constant_value, dst_window); auto k = std::make_unique<kernels::ClFillKernel>(); @@ -45,4 +47,4 @@ Status ClFill::validate(const ITensorInfo *tensor, const PixelValue &constant_va return kernels::ClFillKernel::validate(tensor, constant_value, dst_window); } } // 
namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClFill.h b/src/gpu/cl/operators/ClFill.h index 3bbe27ef71..e13862aa6b 100644 --- a/src/gpu/cl/operators/ClFill.h +++ b/src/gpu/cl/operators/ClFill.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -44,7 +45,10 @@ public: * @param[in] constant_value The value used to fill the planes of the tensor * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); + void configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to ClFill::configure() diff --git a/src/gpu/cl/operators/ClFlatten.cpp b/src/gpu/cl/operators/ClFlatten.cpp index e277c0d7e4..7532532c94 100644 --- a/src/gpu/cl/operators/ClFlatten.cpp +++ b/src/gpu/cl/operators/ClFlatten.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClFlatten.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClReshapeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClFloor.cpp b/src/gpu/cl/operators/ClFloor.cpp index 84f685e381..6790160172 100644 --- a/src/gpu/cl/operators/ClFloor.cpp +++ b/src/gpu/cl/operators/ClFloor.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClFloor.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClFloorKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp index 5845bbc69e..6969ac8ab3 100644 --- a/src/gpu/cl/operators/ClFullyConnected.cpp +++ b/src/gpu/cl/operators/ClFullyConnected.cpp @@ -24,12 +24,13 @@ #include "src/gpu/cl/operators/ClFullyConnected.h" #include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" #include "src/gpu/cl/operators/ClFlatten.h" @@ -38,11 +39,8 @@ #include "src/gpu/cl/operators/ClMatMul.h" #include "src/gpu/cl/operators/ClTranspose.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" - #include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h" #include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" #include <algorithm> @@ -62,8 +60,11 @@ inline TensorShape get_reshaped_matmul_tensor(const TensorShape &src) return TensorShape(src.x(), 1, src.y(), src.collapsed_from(2).z()); // Return value optimisation } -Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo &dst, - 
GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info) +Status construct_gemmlowp_output_stage(const ITensorInfo &src, + const ITensorInfo &weights, + const ITensorInfo &dst, + GEMMLowpOutputStageInfo &gemmlowp_output_stage, + ActivationLayerInfo activation_info) { gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; gemmlowp_output_stage.gemmlowp_offset = 0; @@ -73,7 +74,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo const auto data_type = src.data_type(); // Configure output stage for quantized case - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { const QuantizationInfo oq_info = dst.quantization_info(); const UniformQuantizationInfo iq_unif = src.quantization_info().uniform(); @@ -85,15 +86,17 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); PixelValue type_min{}; PixelValue type_max{}; std::tie(type_min, type_max) = get_min_max(data_type); - if(activation_info.enabled()) + if (activation_info.enabled()) { - std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info); + std::tie(type_min, type_max) = + get_quantized_activation_min_max(activation_info, data_type, output_quant_info); } // Set the GEMMLowp output stage info @@ -109,31 +112,41 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo return Status{}; } -Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &dst, const FullyConnectedLayerInfo &fc_info, bool use_matmul) +Status validate_mm(const ITensorInfo &src, + const ITensorInfo &weights, + const ITensorInfo *bias, + const ITensorInfo &dst, + const FullyConnectedLayerInfo &fc_info, + bool use_matmul) { // Note : If input is dynamic and data is not batched, use matmul, else use gemm const bool transpose_weights = fc_info.transpose_weights ? 
!fc_info.are_weights_reshaped : false; - const bool use_dynamic_gemm = !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul - const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type()); + const bool use_dynamic_gemm = + !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul + const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type()); - if(use_matmul) + if (use_matmul) { const MatMulInfo m_info = MatMulInfo().adj_rhs(transpose_weights); // Note: LHS is reshaped here to match ClMatMul expectations of batch index - From [M, B0, B1] to [M, 1, B0, B1] TensorInfo lhs_to_use = src.clone()->set_tensor_shape(get_reshaped_matmul_tensor(src.tensor_shape())); - const GPUTarget gpu_target = CLScheduler::get().target(); - std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> t = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); - const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info); + const GPUTarget gpu_target = CLScheduler::get().target(); + std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> t = + cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info); - return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, fc_info.activation_info) : - kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, fc_info.activation_info); + return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, + kernel_info, fc_info.activation_info) + : kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, + fc_info.activation_info); } else { GEMMLowpOutputStageInfo gemmlowp_output_stage; - ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -147,7 +160,7 @@ Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITe true, // broadcast_bias ActivationLayerInfo()); // activation_info - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo iq_info = src.quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); @@ -158,11 +171,9 @@ Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITe const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset); // Validate gemmlowp function - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(&src.clone()->set_quantization_info(src_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), - bias, - &dst, - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate( + &src.clone()->set_quantization_info(src_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &dst, gemm_info)); } else { @@ -188,11 +199,15 @@ ClFullyConnected::ClFullyConnected() ClFullyConnected::~ClFullyConnected() = default; -void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo 
*weights, ITensorInfo *bias, ITensorInfo *dst, +void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // If weights are dynamic and matmul is supported use matmul, else use gemm - if(_use_matmul) + if (_use_matmul) { // Specify whether transpose weights is necessary in matmul info const MatMulInfo mat_info = MatMulInfo().adj_rhs(_transpose_weights); @@ -202,22 +217,25 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe _lhs_to_use = src->clone()->set_tensor_shape(get_reshaped_matmul_tensor(_lhs_to_use.tensor_shape())); // 2. Use heuristics to get kernel info object - const GPUTarget gpu_target = CLScheduler::get().target(); - std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> kernel_config = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); - MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info); + const GPUTarget gpu_target = CLScheduler::get().target(); + std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> kernel_config = + cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info); // 3. Configure relevant matmul kernel - if(_is_quantized) + if (_is_quantized) { _matmul_lowp_native_kernel = std::make_unique<kernels::ClMatMulLowpNativeKernel>(); _matmul_lowp_native_kernel->set_target(gpu_target); - _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, fc_info.activation_info); + _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, + fc_info.activation_info); } else { _matmul_native_kernel = std::make_unique<kernels::ClMatMulNativeKernel>(); _matmul_native_kernel->set_target(gpu_target); - _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, fc_info.activation_info); + _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, + fc_info.activation_info); } } else @@ -238,7 +256,7 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe true, // broadcast_bias fc_info.activation_info); // activation_info - if(_is_quantized) + if (_is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -248,8 +266,10 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); - src_info.set_quantization_info(QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset)); - weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + src_info.set_quantization_info( + QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset)); + weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, + -weights_quantization_info.uniform().offset)); // Configure gemmlowp function _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>(); @@ -264,16 +284,25 @@ void ClFullyConnected::configure_mm(const CLCompileContext 
&compile_context, ITe } } -void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, +void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate. - ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) != + (src->dimension(0) * src->dimension(1) * src->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be linearized // Initialize output tensor for flatten - _flattened_src = src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW); + _flattened_src = src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(src)) + .set_data_layout(DataLayout::NCHW); // Configure flatten kernel _flatten = std::make_unique<ClFlatten>(); @@ -284,7 +313,11 @@ void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info); } -void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, +void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate. @@ -294,7 +327,11 @@ void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, configure_mm(compile_context, src, weights, bias, dst, fc_info); } -void ClFullyConnected::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, +void ClFullyConnected::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, FullyConnectedLayerInfo fc_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -317,8 +354,9 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // 2. MatMul does not support broadcasting batch dimension, and therefore is disabled if fc is batched. // 3. 
When FC is after convolution and src tensor data layout does not match weights trained data layout (weights conversion kernel is required) const bool is_batched_fc_layer = dst->dimension(1) > 1; - _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); - _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul; + _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && + !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); + _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul; // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -327,11 +365,11 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // 4) Fully Connected layer -> Fully Connected layer with batches // Check if we have a fully connected layer with batches - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, - src->tensor_shape().cend(), - dst->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { @@ -341,7 +379,7 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso ITensorInfo *weights_used = weights; // Reshape weights if needed - Not needed when matmul is in use as matmul fuses transpose op. 
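
For readers auditing the reflowed _use_matmul assignment in the hunk above: the same predicate can be restated with named guards. This is a reading aid only, not part of the patch; the function and parameter names are hypothetical stand-ins for values ClFullyConnected::configure() derives itself.

    #include "arm_compute/core/GPUTarget.h"

    // Illustrative restatement of the _use_matmul condition shown above.
    static bool should_use_matmul(arm_compute::GPUTarget target,
                                  bool weights_constant,
                                  bool batched_dst,
                                  bool layout_mismatch_after_conv)
    {
        return target != arm_compute::GPUTarget::MIDGARD // MatMul path is not selected on Midgard
               && !weights_constant                      // dynamic weights are the MatMul use case
               && !batched_dst                           // MatMul cannot broadcast the batch dimension
               && !layout_mismatch_after_conv;           // would require the weights-conversion kernel
    }
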
- if(_transpose_weights && !_use_matmul) + if (_transpose_weights && !_use_matmul) { // Reshape the weights _reshape_weights = std::make_unique<ClTranspose>(); @@ -351,14 +389,11 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso } // Convert weights if needed - if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Convert weights _convert_weights = std::make_unique<ClConvertFullyConnectedWeights>(); - _convert_weights->configure(compile_context, - weights_used, - &_converted_weights, - src->tensor_shape(), + _convert_weights->configure(compile_context, weights_used, &_converted_weights, src->tensor_shape(), fc_info.weights_trained_layout); weights_used = &_converted_weights; @@ -366,7 +401,7 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso _run_convert_weights = true; } - if(_is_fc_after_conv) + if (_is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info); @@ -379,60 +414,69 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // Update TensorInfo of final weights used (Need to be done in the end due to padding expansion) _weights_to_use = *weights_used; - if(_use_matmul) + if (_use_matmul) { // Note : MatMul does not use transpose and does not need auxillary memory, so only converted weights are added to aux_mem - _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size()); + _aux_mem[ConvertedWeights] = + MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size()); } else { // Set auxiliary memory requirements for gemm operators auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); - for(unsigned int i = 0; i < gemm_mem_req.size(); ++i) + for (unsigned int i = 0; i < gemm_mem_req.size(); ++i) { _aux_mem[i] = gemm_mem_req[i]; } - if(_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs + if (_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs { // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time _aux_mem[TransposedWeights] = MemoryInfo( - offset_int_vec(TransposedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, - _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, - _converted_weights.total_size()); + offset_int_vec(TransposedWeights), _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _converted_weights.total_size()); } else { // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch - const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ? 
MemoryLifetime::Persistent : MemoryLifetime::Prepare; - const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare; - - _aux_mem[TransposedWeights] = MemoryInfo( - offset_int_vec(TransposedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft, - _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft, - _converted_weights.total_size()); + const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) + ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare; + const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) + ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare; + + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft, + _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft, + _converted_weights.total_size()); } } - _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); + _aux_mem[FlattenedSrc] = + MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); } -Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, +Status ClFullyConnected::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, FullyConnectedLayerInfo fc_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON( + fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); const GPUTarget gpu_target = get_arch_from_target(CLScheduler::get().target()); const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false; @@ -441,11 +485,20 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei // When using dynamic weights - use matmul kernels. // Note: MatMul does not support broadcasting so fallback with batched cases. 
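
The construct_gemmlowp_output_stage() hunks earlier in this file turn the real-valued requantization scale (iq.scale * wq.scale) / oq.scale into an integer multiplier plus shift via quantization::calculate_quantized_multiplier(). As a self-contained sketch of the standard frexp-based decomposition that this kind of helper performs (illustrative only; the library's implementation may differ in corner-case handling):

    #include <cmath>
    #include <cstdint>

    // Split a real requantization scale into a Q31 fixed-point multiplier
    // and a right shift, the form the GEMMLowp output stage consumes.
    inline void decompose_multiplier(double multiplier, std::int32_t &quant_mult, std::int32_t &right_shift)
    {
        int exponent = 0;
        const double significand = std::frexp(multiplier, &exponent); // significand in [0.5, 1)
        auto fixed = static_cast<std::int64_t>(std::llround(significand * (1ll << 31)));
        if (fixed == (1ll << 31)) // rounding pushed the significand up to 1.0
        {
            fixed >>= 1;
            ++exponent;
        }
        quant_mult  = static_cast<std::int32_t>(fixed);
        right_shift = -exponent; // result ~= (x * quant_mult) >> (31 + right_shift)
    }
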
const bool is_batched_fc_layer = dst->dimension(1) > 1; - const bool use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); - - const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW)); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = (transpose_weights && !use_matmul) ? TensorInfo(*reshaped_weights.clone()) : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()); + const bool use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && + !is_batched_fc_layer && + !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); + + const ITensorInfo &flatten_src = TensorInfo(src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(src)) + .set_data_layout(DataLayout::NCHW)); + const ITensorInfo &reshaped_weights = TensorInfo( + weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = (transpose_weights && !use_matmul) + ? TensorInfo(*reshaped_weights.clone()) + : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()); // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -456,10 +509,10 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei const ITensorInfo *src_to_use = src; const ITensorInfo *weights_to_use = weights; - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -470,11 +523,11 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei } // Check if FC is after conv (flatten kernel is run in case where FC is after conv.) - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, - src->tensor_shape().cend(), - dst->tensor_shape().cbegin() + 1)); + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { @@ -482,29 +535,28 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei } // Transpose kernel does not run when matmul is supported as matmul fuses transpose op. 
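
Both configure() and validate() decide the batched "FC after convolution" case by comparing trailing shape dimensions with std::equal, as in the reflowed expressions above and below. A simplified standalone restatement of that test, with plain arrays standing in for TensorShape (assumption-level code, not the library's API):

    #include <algorithm>
    #include <cstddef>

    // The batched layer is treated as conv->FC when the src dimensions from
    // index 3 upwards match the dst dimensions from index 1 upwards, i.e. the
    // batch dims line up once the leading [W, H, C] activations are flattened.
    bool fc_follows_conv(const std::size_t *src_shape, std::size_t src_num_dims, const std::size_t *dst_shape)
    {
        return src_num_dims >= 4 && std::equal(src_shape + 3, src_shape + src_num_dims, dst_shape + 1);
    }
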
- if(transpose_weights && !use_matmul) + if (transpose_weights && !use_matmul) { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } - if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - src->tensor_shape(), - fc_info.weights_trained_layout)); + ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } - if(is_fc_after_conv) + if (is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches // K Index of matrix multiplication. MatMul performs transpose in kernel, so index is 0 when matmul and transpose enabled const int weight_idx = (use_matmul && transpose_weights) ? 0 : 1; - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src)); @@ -539,24 +591,24 @@ void ClFullyConnected::run(ITensorPack &tensors) CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false); // Linearize input if it comes from a convolutional layer - if(_is_fc_after_conv) + if (_is_fc_after_conv) { - ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } }; + ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}}; _flatten->run(flatten_pack); } ITensorPack gemm_pack = tensors; gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src); - if(_weights_to_use_idx != ACL_SRC_1) + if (_weights_to_use_idx != ACL_SRC_1) { gemm_pack.add_const_tensor(ACL_SRC_1, weights.get()); } // Run MatMul Op - if(_use_matmul) + if (_use_matmul) { // Run matmul kernels for matrix multiplication - if(_is_quantized) + if (_is_quantized) { CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, gemm_pack, true); } @@ -568,7 +620,7 @@ void ClFullyConnected::run(ITensorPack &tensors) else { // Run matrix multiply - if(_is_quantized) + if (_is_quantized) { _mm_gemmlowp->run(gemm_pack); } @@ -582,7 +634,7 @@ void ClFullyConnected::run(ITensorPack &tensors) void ClFullyConnected::prepare(ITensorPack &tensors) { // Note : Running prepare() each run when _use_matmul is true is unnecessary unless weights conversion is needed. - if(!_is_prepared || _dynamic_gemm) + if (!_is_prepared || _dynamic_gemm) { #ifdef ARM_COMPUTE_ASSERTS_ENABLED ++_asrt_prepare_count; @@ -598,10 +650,10 @@ void ClFullyConnected::prepare(ITensorPack &tensors) const ITensor *cur_weights = weights; // Reshape weights if needed. Disabled when matmul kernels are enabled as matmul fuses transpose. 
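
The run()/prepare() hunks below also reflow the brace-initialised ITensorPack literals (see transpose_pack and convert_pack). For reference, the initializer-list form used there and the explicit add_* calls are equivalent; a minimal sketch with stand-in tensor names:

    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/core/ITensorPack.h"

    // `weights` and `reshaped` stand for valid arm_compute::ITensor pointers.
    arm_compute::ITensorPack make_pack(const arm_compute::ITensor *weights, arm_compute::ITensor *reshaped)
    {
        arm_compute::ITensorPack pack{{arm_compute::ACL_SRC, weights}, {arm_compute::ACL_DST, reshaped}};
        // Identical in effect:
        //   pack.add_const_tensor(arm_compute::ACL_SRC, weights);
        //   pack.add_tensor(arm_compute::ACL_DST, reshaped);
        return pack;
    }
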
- if(_transpose_weights && !_use_matmul) + if (_transpose_weights && !_use_matmul) { // Run reshape weights kernel and mark weights as unused - ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } }; + ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}}; _reshape_weights->run(transpose_pack); cur_weights->mark_as_unused(); @@ -609,9 +661,9 @@ void ClFullyConnected::prepare(ITensorPack &tensors) } // Convert weights if needed - if(_run_convert_weights) + if (_run_convert_weights) { - ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } }; + ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}}; _convert_weights->run(convert_pack); cur_weights->mark_as_unused(); @@ -622,9 +674,9 @@ void ClFullyConnected::prepare(ITensorPack &tensors) gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights); // Prepare GEMM prepare and release unused weights - if(_dynamic_gemm || !_use_matmul) + if (_dynamic_gemm || !_use_matmul) { - if(!_is_quantized) + if (!_is_quantized) { _mm_gemm->prepare(gemm_pack); } diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h index d975859d87..0621238ab5 100644 --- a/src/gpu/cl/operators/ClFullyConnected.h +++ b/src/gpu/cl/operators/ClFullyConnected.h @@ -47,7 +47,7 @@ namespace kernels { class ClMatMulNativeKernel; class ClMatMulLowpNativeKernel; -} +} // namespace kernels /** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels: * * -# @ref opencl::kernels::ClIm2ColKernel (called when the input comes from a convolutional layer) @@ -88,7 +88,11 @@ public: * Data type supported: Same as @p src. * @param[in] fc_info (Optional) Fully connected layer additional info */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -96,18 +100,36 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); // Inherited methods overriden - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: - void configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); - void configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); - void configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); + void configure_fc_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo 
*weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); + void configure_conv_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); + void configure_mm(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); private: enum AuxTensorIdx @@ -134,19 +156,19 @@ private: TensorInfo _reshaped_weights{}; TensorInfo _lhs_to_use{}; TensorInfo _weights_to_use{}; - int _weights_to_use_idx{ ACL_SRC_1 }; + int _weights_to_use_idx{ACL_SRC_1}; - bool _run_convert_weights{ false }; - bool _transpose_weights{ false }; - bool _dynamic_gemm{ false }; - bool _use_matmul{ false }; + bool _run_convert_weights{false}; + bool _transpose_weights{false}; + bool _dynamic_gemm{false}; + bool _use_matmul{false}; - bool _is_fc_after_conv{ true }; - bool _is_quantized{ false }; - bool _is_prepared{ false }; + bool _is_fc_after_conv{true}; + bool _is_quantized{false}; + bool _is_prepared{false}; #ifdef ARM_COMPUTE_ASSERTS_ENABLED - int _asrt_run_count {}; + int _asrt_run_count{}; int _asrt_prepare_count{}; #endif // ARM_COMPUTE_ASSERTS_ENABLED }; diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp index 7e331a86f3..815c254c69 100644 --- a/src/gpu/cl/operators/ClGemm.cpp +++ b/src/gpu/cl/operators/ClGemm.cpp @@ -33,11 +33,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/utils/helpers/float_ops.h" @@ -45,8 +46,6 @@ #include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" #include "utils/TypePrinter.h" @@ -67,35 +66,43 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) return kernel_type == CLGEMMKernelType::NATIVE ? 
false : true; } //Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type -inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights) +inline CLGEMMKernelType +auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights) { - if(!constant_weights) + if (!constant_weights) { return CLGEMMKernelType::NATIVE; } auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); - if(bool(gemm_kernel)) + if (bool(gemm_kernel)) { - if(validate_gemm_kernel(gemm_kernel.gemm_type)) + if (validate_gemm_kernel(gemm_kernel.gemm_type)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } } gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } // Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info) +inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + GEMMKernelInfo gemm_kernel_info) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -103,12 +110,14 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs gemm_kernel_info.lhs_info = lhs_info; gemm_kernel_info.rhs_info = rhs_info; gemm_kernel_info.has_pad_y = false; - if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } gemm_kernel_info.has_pad_y = true; - if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } @@ -116,49 +125,65 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs } //Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> 
auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, - const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output) +inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, + GEMMKernelInfo kernel_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info)) + if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Validate lhs_info and rhs_info for reshaped kernel -inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d) +inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + GEMMKernelInfo gemm_kernel_info, + bool reinterpret_input_as_3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel TensorInfo tmp_a_info{}; TensorInfo tmp_b_info{}; // Validate reshape LHS kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d))); - if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d))) + auto_init_if_empty(tmp_a_info, + a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d))); + if (!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d))) { return false; } // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } // Validate mm kernel gemm_kernel_info.lhs_info = lhs_info; gemm_kernel_info.rhs_info = rhs_info; - 
if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } @@ -166,21 +191,32 @@ inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, co } //Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs -inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d) +inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, + GEMMKernelInfo kernel_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + bool reinterpret_input_as_3d) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d)) + if (validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, + reinterpret_input_as_3d)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), + to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } // namespace @@ -200,18 +236,24 @@ ClGemm::ClGemm() { } -void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_native(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
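
Every configure_*/validate_* path repeats the same dimension bookkeeping: with reinterpret_input_as_3d, M spans dimensions 1 and 2 of the LHS and the batch count moves up to dimension 3. Restated as a self-contained sketch (hypothetical Shape type standing in for ITensorInfo):

#include <array>

struct Shape
{
    std::array<unsigned int, 4> dim;  // dimension(0)..dimension(3)
};

struct GemmDims
{
    unsigned int m, n, k, batch;
};

// LHS 'a' is M x K, RHS 'b' is K x N. Reinterpreting the LHS as 3D collapses
// dimensions 1 and 2 into M and shifts the batch count to dimension 3.
GemmDims derive_dims(const Shape &a, const Shape &b, bool reinterpret_input_as_3d)
{
    GemmDims d{};
    d.m     = reinterpret_input_as_3d ? a.dim[1] * a.dim[2] : a.dim[1];
    d.n     = b.dim[0];
    d.k     = a.dim[0];
    d.batch = reinterpret_input_as_3d ? a.dim[3] : a.dim[2];
    return d;
}
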
a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -225,24 +267,32 @@ void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorIn // Set the target for the kernels _mm_native_kernel->set_target(gpu_target); - auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); // Configure and tune matrix multiply kernel - _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info); + _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, + kernel_info); } -void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -261,32 +311,42 @@ void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensor GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, - c, output, gemm_info.reinterpret_input_as_3d()); + std::tie(lhs_info, rhs_info) = + auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, + kernel_info, a, b, c, output, gemm_info.reinterpret_input_as_3d()); _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); // Configure and tune matrix multiply kernel - _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, + kernel_info); // Request memory for LHS and RHS reshape matrix _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size()); - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
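
Note the asymmetry in the workspace requests: the reshaped LHS is always Temporary because it is recomputed from the input on every run, while the reshaped RHS becomes Persistent when B is reshaped only on the first run, so the transformed weights survive across run() calls. The lifetime decision, sketched:

enum class Lifetime  // stand-in for MemoryLifetime
{
    Temporary,  // scratch, valid for one run() only
    Persistent  // kept alive between run() calls
};

Lifetime rhs_reshape_lifetime(bool reshape_b_only_on_first_run)
{
    // Constant weights are transformed once in prepare() and then reused, so
    // their reshaped copy must outlive the individual run() invocations.
    return reshape_b_only_on_first_run ? Lifetime::Persistent : Lifetime::Temporary;
}
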
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -304,7 +364,8 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, kernel_info, a, b, c, output); // Transpose matrix _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); @@ -315,24 +376,33 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context // Configure matrix multiply kernel with no y padding support kernel_info.has_pad_y = false; - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, + kernel_info); // Request memory for RHS reshape matrix - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -350,9 +420,10 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; // Force H0 to 4 in order to use the MMUL extension rhs_info.h0 = 4; @@ -361,13 +432,22 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co // Configure matrix multiply kernel with no y padding support kernel_info.has_pad_y = false; - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, + rhs_info, kernel_info); // Request memory for RHS reshape matrix - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_native(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -376,12 +456,12 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const const GPUTarget gpu_target = CLScheduler::get().target(); DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -392,15 +472,23 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate(a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate( + a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -412,12 +500,12 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con const GPUTarget gpu_target = CLScheduler::get().target(); DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -433,23 +521,33 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = + select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape( + compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, + beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -460,12 +558,12 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf const GPUTarget gpu_target = CLScheduler::get().target(); const DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -481,24 +579,33 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); // Validate matrix multiply kernel_info.has_pad_y = false; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); kernel_info.has_pad_y = true; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -508,12 +615,12 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens const GPUTarget gpu_target = CLScheduler::get().target(); const DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
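
validate_reshaped_only_rhs checks the matrix-multiply kernel twice, once per has_pad_y state, because the destination's y padding is only known when run() inspects the actual tensors; a configuration is accepted only if it works either way. The shape of that check:

#include <functional>

// A candidate config must validate with and without y padding, since run()
// picks has_pad_y from the tensors it is handed, not at configure time.
bool valid_for_any_pad_y(const std::function<bool(bool /*has_pad_y*/)> &validate_mm)
{
    return validate_mm(false) && validate_mm(true);
}
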
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -529,9 +636,10 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; // Force H0 to 4 in order to use the MMUL extension rhs_info.h0 = 4; @@ -540,12 +648,20 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens // Validate matrix multiply kernel_info.has_pad_y = false; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +void ClGemm::configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); @@ -558,20 +674,21 @@ void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, _is_prepared = gemm_info.retain_internal_weights(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); // Select GEMMType - _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run, - b->are_values_constant()); + _gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{CLScheduler::get().target(), a->data_type(), m, n, k, batch_size}, + _reshape_b_only_on_first_run, b->are_values_constant()); const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); ITensorInfo *c_to_use = fuse_add_c ? 
c : nullptr; - switch(_gemm_kernel_type) + switch (_gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -600,35 +717,41 @@ void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, } } -Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { // Get the GPU target bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); // Check data type early because the auto_select_gemm_kernel has assertions on supported data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16); // Select GEMMType - CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery - { - CLScheduler::get().target(), - a->data_type(), - m, - n, - k, - batch_size, - }, - gemm_info.reshape_b_only_on_first_run(), b->are_values_constant()); + CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{ + CLScheduler::get().target(), + a->data_type(), + m, + n, + k, + batch_size, + }, + gemm_info.reshape_b_only_on_first_run(), b->are_values_constant()); const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); const ITensorInfo *c_to_use = fuse_add_c ? 
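
Both configure() and validate() drop the C input outright when beta is zero (fuse_add_c), so the kernels never carry a bias term that multiplies out to nothing. Sketched with a hypothetical is_zero in place of helpers::float_ops::is_zero:

#include <cmath>
#include <limits>

struct TensorInfo {};  // hypothetical stand-in for ITensorInfo

inline bool is_zero(float v)  // stand-in; the real helper lives in helpers::float_ops
{
    return std::fabs(v) <= std::numeric_limits<float>::epsilon();
}

// C contributes beta * C to the output; with beta == 0 (or no C provided) the
// input is replaced by nullptr and the fused addition disappears entirely.
const TensorInfo *select_c(const TensorInfo *c, float beta)
{
    return (!is_zero(beta) && c != nullptr) ? c : nullptr;
}
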
c : nullptr; - switch(gemm_kernel_type) + switch (gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -647,7 +770,8 @@ Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso } case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: { - ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info)); break; } default: @@ -674,7 +798,7 @@ void ClGemm::run(ITensorPack &tensors) prepare(tensors); // Run matrix multiply kernel - switch(_gemm_kernel_type) + switch (_gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -684,13 +808,13 @@ void ClGemm::run(ITensorPack &tensors) case CLGEMMKernelType::RESHAPED: { // Run interleave kernel - ITensorPack reshape_lhs_pack{ { ACL_SRC, lhs }, { ACL_DST, lhs_reshaped.get() } }; + ITensorPack reshape_lhs_pack{{ACL_SRC, lhs}, {ACL_DST, lhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // Copy original tensor pack and overwrite lhs and rhs with reshaped counterparts @@ -698,7 +822,7 @@ void ClGemm::run(ITensorPack &tensors) gemm_reshaped_pack.add_const_tensor(ACL_SRC_0, lhs_reshaped.get()); gemm_reshaped_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED) { CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true); } @@ -706,10 +830,10 @@ void ClGemm::run(ITensorPack &tensors) } case CLGEMMKernelType::RESHAPED_ONLY_RHS: { - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement @@ -722,7 +846,7 @@ void ClGemm::run(ITensorPack &tensors) ITensorPack gemm_reshaped_onlyrhs_pack(tensors); gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(has_pad_y) + if (has_pad_y) { ARM_COMPUTE_ERROR_ON(has_pad_y); } @@ -734,10 +858,10 @@ void ClGemm::run(ITensorPack &tensors) } case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: { - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement @@ -750,7 +874,7 @@ void ClGemm::run(ITensorPack &tensors) ITensorPack gemm_reshaped_onlyrhs_pack(tensors); gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(has_pad_y) + if (has_pad_y) { ARM_COMPUTE_ERROR_ON(has_pad_y); } @@ -769,20 +893,22 @@ void ClGemm::run(ITensorPack &tensors) void ClGemm::prepare(ITensorPack &constants) { - 
if(!_is_prepared) + if (!_is_prepared) { - const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1); - ICLTensor *rhs_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape))); + const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1); + ICLTensor *rhs_aux = + utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape))); // If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed - if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr && rhs_aux != nullptr) && rhs_aux) + if ((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && + (src1 != nullptr && rhs_aux != nullptr) && rhs_aux) { ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!"); CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux); ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr); - ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, src1}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true); } _is_prepared = true; diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h index 11f9f2b3d8..85dc1d6c8f 100644 --- a/src/gpu/cl/operators/ClGemm.h +++ b/src/gpu/cl/operators/ClGemm.h @@ -90,30 +90,95 @@ public: * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping * in case matrix A and matrix B have been already transformed. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + void configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemm::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: - void configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, 
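
prepare() is the one-shot counterpart to the per-run reshapes in run(): when the reshaped RHS lives in persistent memory and the caller still provides the original weights, they are transformed once and _is_prepared latches so later calls skip the work. The control flow, reduced to a sketch:

#include <functional>

// One-shot weight preparation: transform the RHS into its persistent buffer
// the first time through, then latch so subsequent calls are no-ops.
void prepare_once(bool &is_prepared,
                  bool  rhs_is_persistent_and_present,
                  const std::function<void()> &reshape_rhs)
{
    if (!is_prepared)
    {
        if (rhs_is_persistent_and_present)
        {
            reshape_rhs();  // the real code enqueues the reshape kernel with flush = true
        }
        is_prepared = true;  // latched even when nothing needed transforming
    }
}
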
const GEMMInfo &gemm_info); + void configure_native(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped_only_rhs(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); - static Status validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + static Status validate_native(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped_only_rhs(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); private: enum AuxTensorIdx diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp index 5620471ff9..55d815a1ef 100644 --- a/src/gpu/cl/operators/ClGemmConv2d.cpp +++ b/src/gpu/cl/operators/ClGemmConv2d.cpp @@ -28,10 +28,12 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClActivationKernel.h" @@ -41,8 +43,6 @@ #include "src/gpu/cl/operators/ClGemm.h" #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" namespace arm_compute @@ -53,18 +53,38 @@ using namespace utils::cast; namespace opencl { ClGemmConv2d::ClGemmConv2d() - : 
_weights_reshape_kernel(nullptr), _im2col_kernel(nullptr), _mm_gemm(nullptr), _mm_gemmlowp(nullptr), _col2im_kernel(nullptr), _activation_kernel(nullptr), _im2col_output(), _weights_reshaped(), - _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count) + : _weights_reshape_kernel(nullptr), + _im2col_kernel(nullptr), + _mm_gemm(nullptr), + _mm_gemmlowp(nullptr), + _col2im_kernel(nullptr), + _activation_kernel(nullptr), + _im2col_output(), + _weights_reshaped(), + _gemm_output(), + _skip_im2col(false), + _skip_col2im(false), + _is_quantized(false), + _fuse_activation(true), + _append_bias(false), + _is_prepared(false), + _aux_mem(AuxTensorIdx::Count) { } ClGemmConv2d::~ClGemmConv2d() = default; -void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, +void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info) + int gemm_3d_depth, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -77,18 +97,20 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I false, // fp_mixed_precision true, // broadcast_bias act_info // activation_info - ); + ); - TensorInfo tmp_src{ *src }; - if(_is_quantized) + TensorInfo tmp_src{*src}; + if (_is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset const QuantizationInfo input_quantization_info = src->quantization_info(); const QuantizationInfo weights_quantization_info = weights->quantization_info(); - tmp_src.set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + tmp_src.set_quantization_info( + QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + weights->set_quantization_info( + QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>(); _mm_gemmlowp->configure(compile_context, &tmp_src, weights, biases, dst, gemm_info); @@ -97,7 +119,7 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I weights->set_quantization_info(weights_quantization_info); auto mm_mem_req = _mm_gemmlowp->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } @@ -108,15 +130,21 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I _mm_gemm = std::make_unique<ClGemm>(); 
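
The quantized path re-registers src and weights with negated offsets before configuring ClGemmLowpMatrixMultiplyCore, then restores the weights' original info. The reason is the affine mapping real = scale * (q - offset): the low-precision core consumes the offsets in that subtracted form. A minimal sketch of the mapping and the sign flip:

#include <cstdint>

// Uniform affine quantization: real = scale * (quantized - offset).
struct UniformQuantization
{
    float   scale;
    int32_t offset;

    float dequantize(int32_t q) const
    {
        return scale * static_cast<float>(q - offset);
    }
};

// The GEMMLowp core wants the negated form, so the wrapper flips the offset
// sign for src and weights for the duration of configure/validate.
UniformQuantization negated(const UniformQuantization &q)
{
    return {q.scale, -q.offset};
}
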
_mm_gemm->configure(compile_context, &tmp_src, weights, biases, dst, 1.0f, 1.0f, gemm_info); auto mm_mem_req = _mm_gemm->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } } } -Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info) +Status ClGemmConv2d::validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, + int gemm_3d_depth, + bool skip_im2col, + const ActivationLayerInfo &act_info) { const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type()); @@ -131,9 +159,9 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig false, // fp_mixed_precision true, // broadcast_bias act_info // activation_info - ); + ); - if(is_quantized) + if (is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -142,8 +170,10 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig std::unique_ptr<ITensorInfo> src_qa = src->clone(); std::unique_ptr<ITensorInfo> weights_qa = weights->clone(); - src_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + src_qa->set_quantization_info( + QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + weights_qa->set_quantization_info( + QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Perform validation step on GEMMLowp return ClGemmLowpMatrixMultiplyCore::validate(src_qa.get(), weights_qa.get(), biases, dst, gemm_info); @@ -155,14 +185,17 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig } } -void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) +void ClGemmConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, - conv2d_info, - weights_info)); + ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info); const DataType data_type = src->data_type(); @@ -180,7 +213,8 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf _is_prepared = weights_info.retain_internal_weights(); _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 && 
conv2d_info.conv_info.stride().second == 1); + _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1); _skip_col2im = data_layout == DataLayout::NHWC; // Only for quantize there are few cases where we cannot fuse the activation function in GEMM @@ -197,12 +231,8 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf // Get convolved dimensions unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv2d_info.conv_info, - conv2d_info.dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv2d_info.conv_info, conv2d_info.dilation); unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups; @@ -210,28 +240,31 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf _append_bias = false; _weights_reshape_kernel = std::make_unique<kernels::ClWeightsReshapeKernel>(); - if(conv2d_info.num_groups != 1 && biases != nullptr) + if (conv2d_info.num_groups != 1 && biases != nullptr) { // num_groups != 1 can only be for NCHW // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor biases_to_use = nullptr; _append_bias = true; - _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, conv2d_info.num_groups); + _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, + conv2d_info.num_groups); } else { - _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped, conv2d_info.num_groups); + _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped, + conv2d_info.num_groups); } // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) + if (!_skip_im2col) { // Configure and tune im2col. im2col output shape is auto-initialized _im2col_kernel = std::make_unique<opencl::kernels::ClIm2ColKernel>(); // Set the GPU target for im2col _im2col_kernel->set_target(CLScheduler::get().target()); - _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups); + _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), + conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups); // Set quantization info _im2col_output.set_quantization_info(src->quantization_info()); @@ -242,7 +275,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf } // Create GEMM output tensor - if(!_skip_col2im) + if (!_skip_col2im) { TensorShape shape_gemm; @@ -263,7 +296,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf gemmlowp_output_stage.gemmlowp_offset = 0; // Configure output stage for quantized case - if(_is_quantized) + if (_is_quantized) { const auto output_quant_info = (dst->total_size() == 0) ? 
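
_skip_im2col encodes a genuine identity: in NHWC with a 1x1 kernel and unit strides, every output position reads exactly one contiguous channel vector of the input, so the im2col matrix would equal the source and the GEMM can consume it directly (_skip_col2im holds for NHWC in general, where GEMM3D avoids the output reshape instead). The predicate, restated:

// im2col is the identity for NHWC, 1x1 kernels and unit strides, so both the
// kernel launch and the scratch tensor can be skipped.
bool can_skip_im2col(bool is_nhwc,
                     unsigned int kernel_w, unsigned int kernel_h,
                     unsigned int stride_x, unsigned int stride_y)
{
    return is_nhwc && kernel_w == 1 && kernel_h == 1 && stride_x == 1 && stride_y == 1;
}
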
iq_info : oq_info; const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); @@ -286,16 +319,16 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf auto min_activation = min_val.get<int32_t>(); auto max_activation = max_val.get<int32_t>(); - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { - if(supported_acts.count(conv2d_info.act_info.activation()) != 0) + if (supported_acts.count(conv2d_info.act_info.activation()) != 0) { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); } else { @@ -313,48 +346,60 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; - configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info); + configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, + gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info); - if(!_skip_col2im) + if (!_skip_col2im) { // Set the GPU target for col2im _col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>(); _col2im_kernel->set_target(CLScheduler::get().target()); // Configure and tune Col2Im - _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups); + _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), + conv2d_info.num_groups); CLScheduler::get().tune_kernel_static(*_col2im_kernel.get()); } ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), "Output shape does not match the expected one"); - if(!_fuse_activation) + if (!_fuse_activation) { _activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>(); _activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info); } - _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); - _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size()); - _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); + _aux_mem[Im2ColOutput] = + MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); + _aux_mem[WeightsReshaped] = + MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size()); + _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, 
_gemm_output.total_size()); } -Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, +Status ClGemmConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); - if(!is_quantized_per_channel) + if (!is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && (src->data_layout() == DataLayout::NCHW)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), + "Grouping (num_groups != 1) is not supported with QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && + (src->data_layout() == DataLayout::NCHW)); const DataLayout data_layout = src->data_layout(); const DataType data_type = src->data_type(); @@ -374,18 +419,19 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights const ITensorInfo *gemm_output_to_use = dst; const ITensorInfo *weights_to_use = weights; const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 - && conv2d_info.conv_info.stride().second == 1); - const bool skip_col2im = data_layout == DataLayout::NHWC; - bool fuse_activation = true; + const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1); + const bool skip_col2im = data_layout == DataLayout::NHWC; + bool fuse_activation = true; - ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != src->dimension(idx_channel)); + ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != + src->dimension(idx_channel)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { - if(is_quantized) + if (is_quantized) { 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -397,7 +443,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); } - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { ARM_COMPUTE_ERROR_ON(conv2d_info.act_info.b() > conv2d_info.act_info.a()); } @@ -406,48 +452,50 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv2d_info.conv_info, - conv2d_info.dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv2d_info.conv_info, conv2d_info.dilation); unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups; const ITensorInfo *biases_to_use = biases; bool append_bias = false; - if(conv2d_info.num_groups != 1 && biases != nullptr) + if (conv2d_info.num_groups != 1 && biases != nullptr) { // num_groups != 1 can only be for NCHW // Since a utility function to reshape the biases is missing, we append the biases to the weights tensor - biases_to_use = nullptr; - append_bias = true; - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type); + biases_to_use = nullptr; + append_bias = true; + weights_reshaped_info = + TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type); } else { - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type); + weights_reshaped_info = + TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type); } weights_to_use = &weights_reshaped_info; - if(!skip_im2col) + if (!skip_im2col) { const Size2D kernel_dims(kernel_width, kernel_height); // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups == 1, conv2d_info.num_groups); + TensorShape expected_output_shape = + compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, + conv2d_info.num_groups == 1, conv2d_info.num_groups); auto_init_if_empty(im2col_reshaped_info, src->clone()->set_tensor_shape(expected_output_shape)); - ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, + append_bias, conv2d_info.dilation, conv2d_info.num_groups)); gemm_input_to_use = &im2col_reshaped_info; } // Create GEMM output tensor - if(!skip_col2im) + if (!skip_col2im) { TensorShape shape_gemm; @@ -465,7 +513,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights gemmlowp_output_stage.gemmlowp_offset = 0; gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel; - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info =
dst->quantization_info().uniform(); @@ -483,16 +531,16 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights int min_activation = 0; int max_activation = 0; - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { - if(supported_acts.count(conv2d_info.act_info.activation()) != 0) + if (supported_acts.count(conv2d_info.act_info.activation()) != 0) { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); } else { @@ -509,16 +557,18 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, + gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info)); // Validate Col2Im - if(!skip_col2im) + if (!skip_col2im) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups)); } // Validate Activation Layer - if(!fuse_activation) + if (!fuse_activation) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info)); } @@ -541,30 +591,26 @@ void ClGemmConv2d::run(ITensorPack &tensors) CLAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false); // Run im2col - if(!_skip_im2col) + if (!_skip_im2col) { - ITensorPack pack = - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, im2col_output.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}}; CLScheduler::get().enqueue_op(*_im2col_kernel, pack, false); gemm_input_to_use = im2col_output.get(); } - if(!_skip_col2im) + if (!_skip_col2im) { gemm_output_to_use = gemm_output.get(); } ITensorPack pack_mm = tensors; pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); pack_mm.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get()); - if(!_append_bias) + if (!_append_bias) { pack_mm.add_const_tensor(TensorType::ACL_SRC_2, biases); } pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use); // Runs ClGemm or ClGemmLowpMatrixMultiplyCore functions - if(_is_quantized) + if (_is_quantized) { // Run gemmlowp _mm_gemmlowp->run(pack_mm); @@ -576,43 +622,32 @@ void ClGemmConv2d::run(ITensorPack &tensors) } // Reshape output matrix 
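// What "reshape output matrix" amounts to: an illustrative, self-contained
// sketch rather than the ClCol2ImKernel implementation. It assumes the GEMM
// result is stored row-major as [conv_w * conv_h positions x num_kernels
// channels]; col2im scatters it back into an NCHW activation.

#include <cstddef>
#include <vector>

void col2im_nchw(const std::vector<float> &gemm_out, // (conv_w * conv_h) x num_kernels
                 std::vector<float>       &dst,      // num_kernels x conv_h x conv_w
                 std::size_t num_kernels, std::size_t conv_w, std::size_t conv_h)
{
    for (std::size_t oc = 0; oc < num_kernels; ++oc)
    {
        for (std::size_t y = 0; y < conv_h; ++y)
        {
            for (std::size_t x = 0; x < conv_w; ++x)
            {
                const std::size_t pos               = y * conv_w + x; // spatial output position
                dst[(oc * conv_h + y) * conv_w + x] = gemm_out[pos * num_kernels + oc];
            }
        }
    }
}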
- if(!_skip_col2im) + if (!_skip_col2im) { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; CLScheduler::get().enqueue_op(*_col2im_kernel.get(), pack, false); } //Run Activation Layer if we cannot fuse in GEMM - if(!_fuse_activation) + if (!_fuse_activation) { - ITensorPack pack = - { - { TensorType::ACL_SRC, dst }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}}; CLScheduler::get().enqueue_op(*_activation_kernel.get(), pack, false); } } void ClGemmConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { // Run weights reshaping and mark original weights tensor as unused - ICLTensor *weights_reshaped_p = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped))); + ICLTensor *weights_reshaped_p = + utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped))); CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p); auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensorPack pack = - { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, weights_reshaped.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}}; - if(_append_bias) + if (_append_bias) { const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); pack.add_const_tensor(TensorType::ACL_BIAS, biases); diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h index 8a46ee2dc3..e8f3147ac3 100644 --- a/src/gpu/cl/operators/ClGemmConv2d.h +++ b/src/gpu/cl/operators/ClGemmConv2d.h @@ -27,6 +27,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -100,15 +101,24 @@ public: * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights * tensor has also been transposed with CLGEMMReshapeRHSMatrixKernel. Data type supported: Same as @p input. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info = WeightsInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &conv2d_info, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info = WeightsInfo()); // Inherited methods overridden: @@ -130,9 +140,14 @@ private: * @param[in] gemm_3d_depth Depth of GEMM 3D * @param[in] act_info Activation to apply after the matrix multiplication */ - void configure_mm(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + void configure_mm(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info); + int gemm_3d_depth, + const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines * * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -148,8 +163,14 @@ private: * * @return a status */ - static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info); + static Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, + int gemm_3d_depth, + bool skip_im2col, + const ActivationLayerInfo &act_info); enum AuxTensorIdx { diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp index 2622274587..71c247de79 100644 --- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp @@ -52,7 +52,7 @@ namespace { inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) { - switch(kernel_type) + switch (kernel_type) { case CLGEMMKernelType::NATIVE: case CLGEMMKernelType::RESHAPED_ONLY_RHS: @@ -71,32 +71,41 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run) { auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); - if(bool(gemm_kernel)) + if (bool(gemm_kernel)) { - if(validate_gemm_kernel(gemm_kernel.gemm_type)) + if (validate_gemm_kernel(gemm_kernel.gemm_type)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } } gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } // Validate lhs_info and rhs_info for native kernel -inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info) +inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const GEMMReshapeInfo &reshape_info) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for the native kernel TensorInfo mm_result_s32_info{}; // Output tensor auto initialization if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty( + mm_result_s32_info, + a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32)); // Validate mm kernel // NOTE: Ignore all other parameters (e.g. output stage, etc.) and only validate lhs and rhs info // NOTE: This assumes: // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments). // 2.
lhs and rhs info do not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window). - if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info))) + if (!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, + reshape_info))) { return false; } @@ -104,31 +113,45 @@ inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, cons } // Automatically select between mlgo (prioritized) and default heuristics for native kernel configs -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, + const ITensorInfo *a, + const ITensorInfo *b, + const GEMMReshapeInfo &reshape_info) { auto config = auto_heuristics::select_mlgo_gemm_config_native(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info)) + if (validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_native(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, - unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d) +inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output, + unsigned int m, + unsigned int n, + unsigned int k, + bool reinterpret_input_as_3d, + int depth_output_gemm3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -148,7 +171,8 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs // Since we ignore the
output stage, output data type has to be S32 to pass the validation TensorInfo output_info_copy(*output); output_info_copy.set_data_type(DataType::S32); - if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) + if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, + gemm_kernel_info))) { return false; } @@ -156,14 +180,22 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs } // Validate lhs_info and rhs_info for reshaped only rhs MMUL kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, - unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d) +inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output, + unsigned int m, + unsigned int n, + unsigned int k, + bool reinterpret_input_as_3d, + int depth_output_gemm3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -183,7 +215,8 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo // Since we ignore the output stage, output data type has to be S32 to pass the validation TensorInfo output_info_copy(*output); output_info_copy.set_data_type(DataType::S32); - if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) + if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, + gemm_kernel_info))) { return false; } @@ -191,40 +224,55 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo } // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d, - const ITensorInfo *a, - const ITensorInfo *b, const ITensorInfo *output) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, + bool reinterpret_input_as_3d, + int depth_output_gemm3d, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d)) + if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, + query.k, reinterpret_input_as_3d, depth_output_gemm3d)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(),
to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d, - const ITensorInfo *a, - const ITensorInfo *b, const ITensorInfo *output) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, + bool reinterpret_input_as_3d, + int depth_output_gemm3d, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output) { ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d); auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), - to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, + query.k, reinterpret_input_as_3d, depth_output_gemm3d); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type) { - switch(kernel_type) + switch (kernel_type) { case CLGEMMKernelType::NATIVE: return false; @@ -254,8 +302,11 @@ ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore() ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default; void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, - ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, - const GEMMInfo &gemm_info) + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info)); @@ -263,8 +314,8 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _a_offset = a->quantization_info().uniform().offset; 
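// Where the offsets just read come into play: an illustrative reference, not
// the CL kernels themselves. With zero points a_off and b_off, the quantized
// product expands to
//   sum_k (A[m][k] - a_off) * (B[k][n] - b_off)
//     = sum_k A*B - a_off * colsum_B[n] - b_off * rowsum_A[m] + K * a_off * b_off
// so the matrix A/B reduction kernels configured further down compute
// rowsum_A / colsum_B once, and the offset contribution becomes a cheap
// correction of the S32 GEMM result. This is also why vector_sum_col is only
// needed when the A offset is non-zero, and vector_sum_row only when the B
// offset is non-zero.

#include <cstdint>
#include <vector>

void offset_contribution(std::vector<int32_t>       &mm_result, // M x N, holds sum_k A*B
                         const std::vector<int32_t> &rowsum_a,  // M entries
                         const std::vector<int32_t> &colsum_b,  // N entries
                         int M, int N, int K, int32_t a_off, int32_t b_off)
{
    for (int m = 0; m < M; ++m)
    {
        for (int n = 0; n < N; ++n)
        {
            mm_result[m * N + n] += -a_off * colsum_b[n] - b_off * rowsum_a[m] + K * a_off * b_off;
        }
    }
}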
- _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) - && a->data_type() == DataType::QASYMM8; + _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && + is_data_type_quantized_symmetric(b->data_type()) && a->data_type() == DataType::QASYMM8; _b_offset = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset; _gemm_info = gemm_info; @@ -282,17 +333,18 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con // Arguments used by GEMMReshapeInfo // in order to know how the matrices have been reshaped bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run); + _gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, _reshape_b_only_on_first_run); - if(_convert_to_qasymm8) + if (_convert_to_qasymm8) { // Set data type for converted weights _qasymm8_weights = *b; @@ -301,47 +353,50 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con } ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { matrix_b = &_tmp_b; // Pick up the GEMM configuration // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d, - depth_output_gemm3d, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d, + depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); // Configure reshape RHS kernel - _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ?
&_qasymm8_weights : b, &_tmp_b, + rhs_info); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { matrix_b = &_tmp_b; // Pick up the GEMM configuration // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d, - depth_output_gemm3d, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d, + depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); // Configure reshape RHS kernel - _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, + rhs_info); } // Using default reduction info - const GEMMLowpReductionKernelInfo reduction_info {}; + const GEMMLowpReductionKernelInfo reduction_info{}; // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) + if (_a_offset != 0) { _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); // Configure matrix B reduction kernel - _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info); + _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, + &_vector_sum_col, reduction_info); } // Initialize matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); @@ -360,17 +415,19 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con gemm_kernel_info.a_offset = _a_offset; gemm_kernel_info.b_offset = _b_offset; // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { // Configure offset contribution kernel - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32); _gemm_output_stage_shifts = TensorInfo(TensorShape(num_filters), 1, DataType::S32); GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); gemmlowp_output_stage.output_data_type = a->data_type(); - if(num_filters == 1) + if (num_filters == 1) { // Per-channel quantization with OFM == 1 is equivalent to uniform quantization.
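// What the multipliers/shifts tensors configured here hold: a sketch of the
// usual fixed-point encoding, not the library's own helper. Each real
// requantization scale, e.g. (src_scale * weights_scale) / dst_scale, is
// stored as a Q31 integer multiplier plus a right shift; per-channel
// quantization simply stores one such pair per output filter (num_filters).

#include <cmath>
#include <cstdint>

void quantize_multiplier(double real_multiplier, int32_t &quant_multiplier, int32_t &right_shift)
{
    int          exponent    = 0;
    const double significand = std::frexp(real_multiplier, &exponent); // in [0.5, 1)
    int64_t      fixed_point = std::llround(significand * (1ll << 31));
    if (fixed_point == (1ll << 31)) // rounding can push the significand up to 1.0
    {
        fixed_point /= 2;
        ++exponent;
    }
    quant_multiplier = static_cast<int32_t>(fixed_point);
    right_shift      = -exponent; // positive => shift right at the output stage
}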
// Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts @@ -379,55 +436,67 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con gemm_kernel_info.output_stage = gemmlowp_output_stage; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && + gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_reshaped_only_rhs_kernel->configure( + compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && + gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_reshaped_only_rhs_mmul_kernel->configure( + compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); } else { _run_output_stage = true; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, + gemm_kernel_info); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, + gemm_kernel_info); } else { // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, - a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a, + _convert_to_qasymm8 ? 
&_qasymm8_weights : matrix_b, reshape_info); // Configure matrix multiply kernel - _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info); - - _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, - c != nullptr ? c : nullptr, output, a->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage, - &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, + reshape_info); + + _offset_contribution_output_stage_kernel->configure( + compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, output, a->dimension(0), + _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, + &_gemm_output_stage_shifts); } } } else { _run_offset_contribution = true; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { // Configure and tune matrix multiply kernel _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { // Configure and tune matrix multiply kernel _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info); @@ -436,44 +505,65 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con { // Pick up the GEMM configuration // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a, + _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info); // Configure matrix multiply kernel _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info); } // Configure offset contribution kernel - _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, - c != nullptr ? c : nullptr, a->dimension(0), _a_offset, _b_offset); + _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + a->dimension(0), _a_offset, _b_offset); } // Request memory - _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size()); - if(is_gemm_reshaped(_gemm_kernel_type)) + _aux_mem[RhsQAsymm8] = + MemoryInfo(offset_int_vec(RhsQAsymm8), + _reshape_b_only_on_first_run ?
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _qasymm8_weights.total_size()); + if (is_gemm_reshaped(_gemm_kernel_type)) { // Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation - _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size()); - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); - } - if(_a_offset != 0) - { - _aux_mem[VecSumCol] = MemoryInfo(offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.total_size()); - } - if(_b_offset != 0) - { - _aux_mem[VecSumRow] = MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); - } - _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); - _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, _gemm_output_stage_multipliers.total_size()); - _aux_mem[Shifts] = MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size()); + _aux_mem[RhsQAsymm8] = + MemoryInfo(offset_int_vec(RhsQAsymm8), + _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, + _qasymm8_weights.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + } + if (_a_offset != 0) + { + _aux_mem[VecSumCol] = + MemoryInfo(offset_int_vec(VecSumCol), + _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _vector_sum_col.total_size()); + } + if (_b_offset != 0) + { + _aux_mem[VecSumRow] = + MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); + } + _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); + _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, + _gemm_output_stage_multipliers.total_size()); + _aux_mem[Shifts] = + MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size()); } -Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); @@ -492,39 +582,44 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso const GPUTarget gpu_target = CLScheduler::get().target(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run())); + bool reshape_matrix_b = is_gemm_reshaped( + auto_select_gemm_kernel(auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, + gemm_info.reshape_b_only_on_first_run())); const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) - && is_data_type_quantized_asymmetric(a->data_type()); + bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && + is_data_type_quantized_symmetric(b->data_type()) && + is_data_type_quantized_asymmetric(a->data_type()); TensorInfo weights_info(*b); - if(convert_to_qasymm8) + if (convert_to_qasymm8) { b_offset = -128; weights_info.set_data_type(DataType::QASYMM8); ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP)); } const ITensorInfo *matrix_b_info = &weights_info; - if(reshape_matrix_b) + if (reshape_matrix_b) { matrix_b_info = &tmp_b_info; // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration - const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate reshape RHS kernel - auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); + auto_init_if_empty(tmp_b_info, + weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info)); } @@ -533,21 +628,23 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso const GEMMLowpReductionKernelInfo reduction_info; // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) + if (a_offset != 0) { info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32); // Configure matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); } // Validate matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) + if (b_offset != 0) { info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row,
reduction_info)); } GEMMKernelInfo gemm_kernel_info; @@ -560,92 +657,99 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso gemm_kernel_info.rhs_info = rhs_info; gemm_kernel_info.a_offset = a_offset; gemm_kernel_info.b_offset = b_offset; - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; - const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); + const TensorInfo gemm_output_stage_multipliers_shifts_info( + TensorInfo(TensorShape(num_filters), 1, DataType::S32)); GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); gemmlowp_output_stage.output_data_type = a->data_type(); gemm_kernel_info.output_stage = gemmlowp_output_stage; - if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (reshape_matrix_b && + gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, output, gemm_kernel_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? 
nullptr : &info_vector_sum_row, c, &gemm_output_stage_multipliers_shifts_info, + &gemm_output_stage_multipliers_shifts_info)); } else { TensorInfo mm_result_s32_info{}; - if(reshape_matrix_b) + if (reshape_matrix_b) { // Output tensor auto initialization if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty(mm_result_s32_info, a->clone() + ->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, reshape_info)) + .set_data_type(DataType::S32)); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); } else { // Output tensor auto initialization if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty(mm_result_s32_info, a->clone() + ->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, false, reshape_info)) + .set_data_type(DataType::S32)); // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration - const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - output, - a_offset, b_offset, - gemmlowp_output_stage, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate( + &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ?
nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, gemmlowp_output_stage, + &gemm_output_stage_multipliers_shifts_info, &gemm_output_stage_multipliers_shifts_info)); } } else { - if(reshape_matrix_b) + if (reshape_matrix_b) { // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, output, gemm_kernel_info)); } else { // Pick up the GEMM configuration // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration - const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate( + matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); } - if(output->total_size() != 0) + if (output->total_size() != 0) { // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate( + output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, + c, a_offset, b_offset)); } } @@ -675,73 +779,61 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) const ITensor *matrix_a = a; const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b; - if(is_gemm_reshaped(_gemm_kernel_type)) + if (is_gemm_reshaped(_gemm_kernel_type)) { matrix_b = tmp_b.get(); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run reshape matrix B - ITensorPack mtx_b_reshape_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, tmp_b.get() } - }; + ITensorPack mtx_b_reshape_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, tmp_b.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false); } } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) + if (_a_offset != 0 && !_reshape_b_only_on_first_run) { - ITensorPack mtx_b_red_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, vec_sum_col.get() } - }; + ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ?
rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); } // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { - ITensorPack mtx_a_red_pack = - { - { TensorType::ACL_SRC, matrix_a }, - { TensorType::ACL_DST, vec_sum_row.get() } - }; + ITensorPack mtx_a_red_pack = {{TensorType::ACL_SRC, matrix_a}, {TensorType::ACL_DST, vec_sum_row.get()}}; CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false); } // Run matrix multiply - if(is_gemm_reshaped(_gemm_kernel_type)) + if (is_gemm_reshaped(_gemm_kernel_type)) { ITensorPack gemm_reshaped_pack; - if(_run_offset_contribution) + if (_run_offset_contribution) { - gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_DST, _run_output_stage ? res32.get() : dst } - }); + gemm_reshaped_pack = ITensorPack({{TensorType::ACL_SRC_0, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_DST, _run_output_stage ? res32.get() : dst}}); } else { - gemm_reshaped_pack = ITensorPack( - { - { TensorType::ACL_SRC, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() }, - { TensorType::ACL_SHIFTS, shifts.get() }, - { TensorType::ACL_MULTIPLIERS, multipliers.get() }, - { TensorType::ACL_DST, dst }, + gemm_reshaped_pack = ITensorPack({ + {TensorType::ACL_SRC, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}, + {TensorType::ACL_SHIFTS, shifts.get()}, + {TensorType::ACL_MULTIPLIERS, multipliers.get()}, + {TensorType::ACL_DST, dst}, }); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false); } @@ -752,46 +844,39 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) } else { - ITensorPack gemm_native_pack = - { - { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get() } - }; + ITensorPack gemm_native_pack = {{TensorType::ACL_SRC_0, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get()}}; CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false); } - if(_run_output_stage) + if (_run_output_stage) { // Run offset contribution/output stage kernel - ITensorPack output_stage_pack = - { - { TensorType::ACL_SRC, res32.get() }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? 
nullptr : vec_sum_col.get() }, - { TensorType::ACL_SHIFTS, shifts.get() }, - { TensorType::ACL_MULTIPLIERS, multipliers.get() }, - { TensorType::ACL_DST, dst }, + ITensorPack output_stage_pack = { + {TensorType::ACL_SRC, res32.get()}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}, + {TensorType::ACL_SHIFTS, shifts.get()}, + {TensorType::ACL_MULTIPLIERS, multipliers.get()}, + {TensorType::ACL_DST, dst}, }; CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true); } - if(_run_offset_contribution) + if (_run_offset_contribution) { // Run offset contribution kernel - ITensorPack offset_contrib_pack = - { - { TensorType::ACL_SRC_DST, dst }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() } - }; + ITensorPack offset_contrib_pack = {{TensorType::ACL_SRC_DST, dst}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true); } } void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true); @@ -800,56 +885,55 @@ void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) ARM_COMPUTE_ERROR_ON_NULLPTR(b); - if(_convert_to_qasymm8) + if (_convert_to_qasymm8) { - ITensorPack convert_to_qs8_pack = { { ACL_SRC, b }, { ACL_DST, rhs_qasymm8.get() } }; + ITensorPack convert_to_qs8_pack = {{ACL_SRC, b}, {ACL_DST, rhs_qasymm8.get()}}; CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false); b->mark_as_unused(); } - if(is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run) + if (is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run) { // Run reshape kernel and mark original weights tensor as unused - ITensorPack mtx_b_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, tmp_b.get() } - }; + ITensorPack mtx_b_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, tmp_b.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false); b->mark_as_unused(); } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && _reshape_b_only_on_first_run) + if (_a_offset != 0 && _reshape_b_only_on_first_run) { - ITensorPack mtx_b_red_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, vec_sum_col.get() } - }; + ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); } // Compute GEMM output multipliers and shifts for output stage { - const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? 
_gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false); CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false); ICLTensor *multiplier_tensor = multipliers.get(); - if(multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0) + if (multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0) { multiplier_tensor->map(CLScheduler::get().queue(), true); - std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t)); + std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), + _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), + num_filters * sizeof(int32_t)); multiplier_tensor->unmap(CLScheduler::get().queue()); } ICLTensor *shifts_tensor = shifts.get(); - if(shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0) + if (shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0) { shifts_tensor->map(CLScheduler::get().queue(), true); - std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); + std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), + _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); shifts_tensor->unmap(CLScheduler::get().queue()); } } diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h index 6e32a90fc4..c80dc3a182 100644 --- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h @@ -93,18 +93,27 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ - void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemmLowpMatrixMultiplyCore::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -130,7 +139,7 @@ private: std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel; std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel; std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel> _offset_contribution_kernel; - std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; + 
std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; // Temporary tensors TensorInfo _qasymm8_weights{}; @@ -141,13 +150,13 @@ private: TensorInfo _gemm_output_stage_multipliers{}; TensorInfo _gemm_output_stage_shifts{}; - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - bool _reshape_b_only_on_first_run{ false }; - bool _run_output_stage{ false }; - bool _convert_to_qasymm8{ false }; - bool _run_offset_contribution{ false }; - bool _is_prepared{ false }; + int32_t _a_offset{0}; + int32_t _b_offset{0}; + bool _reshape_b_only_on_first_run{false}; + bool _run_output_stage{false}; + bool _convert_to_qasymm8{false}; + bool _run_offset_contribution{false}; + bool _is_prepared{false}; GEMMInfo _gemm_info{}; CLGEMMKernelType _gemm_kernel_type{}; diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp index a61b11a3b1..e3363e3685 100644 --- a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp +++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp @@ -27,22 +27,25 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { @@ -70,12 +73,16 @@ void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, c } } -Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM16); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(src, bias, dst, &info); @@ -94,7 +101,7 @@ void ClGemmLowpOutputStage::run(ITensorPack &tensors) const ITensor *bias = tensors.get_const_tensor(ACL_BIAS); ITensor *dst = tensors.get_tensor(ACL_DST); - ITensorPack pack{ { ACL_SRC, src }, { ACL_BIAS, bias }, { ACL_DST, dst } }; + ITensorPack pack{{ACL_SRC, src}, {ACL_BIAS, bias}, {ACL_DST, dst}}; CLScheduler::get().enqueue_op(*_kernel, pack, true); } } // namespace opencl diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.h b/src/gpu/cl/operators/ClGemmLowpOutputStage.h index 3f1b04dcce..6357e0200b 100644 --- 
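// Aside: a quantized GEMM in the hunks above is assembled from an int32 GEMM, an
// offset-contribution step (correcting for the LHS/RHS zero points via the row/column
// sum reduction kernels), and finally one of the quantize-down kernels selected in
// ClGemmLowpOutputStage::configure(). A minimal sketch of the per-element math on the
// scale-by-float path, assuming round-to-nearest and a QASYMM8 destination;
// quantize_down_float is an illustrative helper, not library code:

#include <algorithm>
#include <cmath>
#include <cstdint>

inline uint8_t quantize_down_float(int32_t acc, int32_t bias, float multiplier, int result_offset)
{
    const float scaled = static_cast<float>(acc + bias) * multiplier; // bias added in int32, then scaled
    const int   q      = static_cast<int>(std::lround(scaled)) + result_offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));      // saturate into the QASYMM8 range
}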
a/src/gpu/cl/operators/ClGemmLowpOutputStage.h +++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.h @@ -71,14 +71,21 @@ public: * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] info GEMMLowp output stage metadata. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemmLowpOutputStage::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/gpu/cl/operators/ClIndirectConv2d.cpp b/src/gpu/cl/operators/ClIndirectConv2d.cpp index b900974574..777fc9e5e1 100644 --- a/src/gpu/cl/operators/ClIndirectConv2d.cpp +++ b/src/gpu/cl/operators/ClIndirectConv2d.cpp @@ -27,16 +27,15 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h" #include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h" #include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "src/common/utils/Log.h" - using namespace arm_compute::cl_indirect_conv; namespace arm_compute @@ -47,7 +46,8 @@ using namespace arm_compute::experimental; namespace { -DirectConvComputeKernelInfo config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo +config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) { // Get GPU target GPUTarget gpu_target = CLScheduler::get().target(); @@ -59,8 +59,13 @@ DirectConvComputeKernelInfo config_indirect_convolution_nhwc(const ITensorInfo * } // namespace -void ClIndirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void ClIndirectConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info); @@ -86,25 +91,29 @@ void ClIndirectConv2d::configure(const CLCompileContext &compile_context, ITenso CLScheduler::get().tune_kernel_static(*_indirect_conv_kernel); // Request memory for the indirect buffer - _aux_mem[IndirectBuffer] = MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, 
_indirect_buffer.total_size()); + _aux_mem[IndirectBuffer] = + MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, _indirect_buffer.total_size()); } -Status ClIndirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +Status ClIndirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { // Initialize the direct convolution descriptor const DirectConvComputeKernelInfo desc = config_indirect_convolution_nhwc(src, weights, conv_info); - TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc); + TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape( + src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc); TensorInfo indirect_buffer(ind_buffer_shape, 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate(src, weights, &indirect_buffer, conv_info, desc)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst, conv_info, act_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate( + src, weights, &indirect_buffer, conv_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst, + conv_info, act_info, desc)); return Status{}; } @@ -124,9 +133,10 @@ void ClIndirectConv2d::run(ITensorPack &tensors) void ClIndirectConv2d::prepare(ITensorPack &constants) { - if(!_is_prepared) + if (!_is_prepared) { - ICLTensor *indirect_buffer_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer))); + ICLTensor *indirect_buffer_aux = + utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer))); ARM_COMPUTE_ERROR_ON(indirect_buffer_aux == nullptr); ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Preparing indirect buffer"); @@ -134,7 +144,7 @@ void ClIndirectConv2d::prepare(ITensorPack &constants) CLAuxTensorHandler indirect_buffer(_indirect_buffer, *indirect_buffer_aux); ARM_COMPUTE_ERROR_ON(indirect_buffer.get()->cl_buffer().get() == nullptr); - ITensorPack indirect_buffer_pack{ { ACL_DST, indirect_buffer.get() } }; + ITensorPack indirect_buffer_pack{{ACL_DST, indirect_buffer.get()}}; CLScheduler::get().enqueue_op(*_addr_precalculation_kernel, indirect_buffer_pack, true); _is_prepared = true; diff --git a/src/gpu/cl/operators/ClIndirectConv2d.h b/src/gpu/cl/operators/ClIndirectConv2d.h index e50fa25069..29e796efd9 100644 --- a/src/gpu/cl/operators/ClIndirectConv2d.h +++ b/src/gpu/cl/operators/ClIndirectConv2d.h @@ -77,7 +77,12 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
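// Aside: the "indirect buffer" prepared by ClIndirectConv2dAddressPrecalculationKernel
// above holds precomputed input offsets so the convolution kernel can gather its LHS
// rows with plain loads. A rough sketch of what such a buffer could contain for a
// single-channel input with square stride/padding, using -1 as the padding marker;
// build_indirect_buffer is an assumption for illustration, the layout of the real S32
// buffer is owned by the kernels:

#include <cstddef>
#include <cstdint>
#include <vector>

// For every output position and kernel tap, store the flattened input offset,
// or -1 where the tap falls into padding.
std::vector<int32_t> build_indirect_buffer(int in_w, int in_h, int out_w, int out_h,
                                           int k_w, int k_h, int stride, int pad)
{
    std::vector<int32_t> offsets;
    offsets.reserve(static_cast<std::size_t>(out_w) * out_h * k_w * k_h);
    for (int oy = 0; oy < out_h; ++oy)
        for (int ox = 0; ox < out_w; ++ox)
            for (int ky = 0; ky < k_h; ++ky)
                for (int kx = 0; kx < k_w; ++kx)
                {
                    const int ix = ox * stride - pad + kx;
                    const int iy = oy * stride - pad + ky;
                    const bool inside = ix >= 0 && ix < in_w && iy >= 0 && iy < in_h;
                    offsets.push_back(inside ? iy * in_w + ix : -1);
                }
    return offsets;
}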
* */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -85,12 +90,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -100,11 +109,11 @@ private: Count }; - std::unique_ptr<IClKernel> _indirect_conv_kernel{ nullptr }; - std::unique_ptr<IClKernel> _addr_precalculation_kernel{ nullptr }; + std::unique_ptr<IClKernel> _indirect_conv_kernel{nullptr}; + std::unique_ptr<IClKernel> _addr_precalculation_kernel{nullptr}; TensorInfo _indirect_buffer{}; - bool _is_prepared{ false }; - experimental::MemoryRequirements _aux_mem{ Count }; + bool _is_prepared{false}; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClLogicalNot.cpp b/src/gpu/cl/operators/ClLogicalNot.cpp index b2eb89b320..d8d4186d00 100644 --- a/src/gpu/cl/operators/ClLogicalNot.cpp +++ b/src/gpu/cl/operators/ClLogicalNot.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClLogicalNot.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp index 49d14127ca..c14b1f2992 100644 --- a/src/gpu/cl/operators/ClMatMul.cpp +++ b/src/gpu/cl/operators/ClMatMul.cpp @@ -47,11 +47,17 @@ ClMatMul::ClMatMul() { } -Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) +Status ClMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); const GPUTarget gpu_target = CLScheduler::get().target(); @@ -61,11 +67,16 @@ Status 
ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); - return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info) : - ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info) + : ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); } -void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) +void ClMatMul::configure(const CLCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info); @@ -81,12 +92,13 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); - if(_is_quantized) + if (_is_quantized) { _matmul_lowp_native_kernel->set_target(gpu_target); // Configure the low-precision native matrix multiply kernel - _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, + act_info); } else { @@ -99,7 +111,7 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l void ClMatMul::run(ITensorPack &tensors) { - if(_is_quantized) + if (_is_quantized) { CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, tensors, true); } diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h index abbb75239a..64dcf217bd 100644 --- a/src/gpu/cl/operators/ClMatMul.h +++ b/src/gpu/cl/operators/ClMatMul.h @@ -26,6 +26,7 @@ #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/function_info/MatMulInfo.h" + #include "src/gpu/cl/IClOperator.h" #include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h" #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" @@ -73,7 +74,11 @@ public: * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. * @param[in] act_info Class containing information about fused activation function. 
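// Aside: as the ClMatMul hunks above show, the operator picks between the lowp and the
// native kernel purely from whether the LHS data type is quantized. A toy restatement
// of that dispatch decision, with hypothetical helper names:

#include <cstdio>

enum class DType { QASYMM8, QASYMM8_SIGNED, F16, F32 };

inline bool is_quantized(DType t) { return t == DType::QASYMM8 || t == DType::QASYMM8_SIGNED; }

void dispatch_matmul(DType lhs_type)
{
    if (is_quantized(lhs_type))
        std::puts("enqueue ClMatMulLowpNativeKernel"); // integer accumulation plus requantization
    else
        std::puts("enqueue ClMatMulNativeKernel");     // plain FP16/FP32 accumulation
}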
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -81,15 +86,19 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; private: - std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{ nullptr }; - std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{ nullptr }; + std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{nullptr}; + std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{nullptr}; - bool _is_quantized{ false }; + bool _is_quantized{false}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClMul.cpp b/src/gpu/cl/operators/ClMul.cpp index 2066f0cfaa..10cf8a6a38 100644 --- a/src/gpu/cl/operators/ClMul.cpp +++ b/src/gpu/cl/operators/ClMul.cpp @@ -24,17 +24,23 @@ #include "src/gpu/cl/operators/ClMul.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClMulKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClMulKernel.h" namespace arm_compute { namespace opencl { -void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void ClMul::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); auto k = std::make_unique<kernels::ClMulKernel>(); @@ -42,22 +48,34 @@ void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1 _kernel = std::move(k); } -Status ClMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status ClMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); } -void ClComplexMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClComplexMul::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + 
ITensorInfo *dst, + const ActivationLayerInfo &act_info) { auto k = std::make_unique<kernels::ClComplexMulKernel>(); k->configure(compile_context, src1, src2, dst, act_info); _kernel = std::move(k); } -Status ClComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClComplexMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClMul.h b/src/gpu/cl/operators/ClMul.h index 6086bc9d52..1cf4d68d4c 100644 --- a/src/gpu/cl/operators/ClMul.h +++ b/src/gpu/cl/operators/ClMul.h @@ -66,16 +66,27 @@ public: * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClComplexMulKernel */ @@ -92,14 +103,21 @@ public: * @param[out] dst The dst tensor info, Data types supported: same as @p src1. Number of channels supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
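// Aside: ClComplexMul operates on tensors whose two channels hold the real and
// imaginary parts of each element. The per-element product is ordinary complex
// multiplication; a scalar sketch, assuming that two-channel layout:

#include <array>

using Complex = std::array<float, 2>; // {real, imag}, mirroring the 2-channel layout

inline Complex complex_mul(Complex a, Complex b)
{
    return {a[0] * b[0] - a[1] * b[1],  // real part: ac - bd
            a[0] * b[1] + a[1] * b[0]}; // imag part: ad + bc
}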
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClComplexMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPRelu.cpp b/src/gpu/cl/operators/ClPRelu.cpp index cf4ebe6083..f3efd00bba 100644 --- a/src/gpu/cl/operators/ClPRelu.cpp +++ b/src/gpu/cl/operators/ClPRelu.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClPRelu.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" namespace arm_compute { namespace opencl { using KernelType = kernels::ClArithmeticKernel; -void ClPRelu::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output) +void ClPRelu::configure(const CLCompileContext &compile_context, + ITensorInfo *input, + ITensorInfo *alpha, + ITensorInfo *output) { ARM_COMPUTE_LOG_PARAMS(input, alpha, output); auto k = std::make_unique<KernelType>(); @@ -49,7 +51,7 @@ void ClPRelu::run(ITensorPack &tensors) { // Output tensor can be given as nullptr for in-place computation. // In this case, get the input tensor and use it as the output tensor. - if(tensors.get_tensor(TensorType::ACL_DST) == nullptr) + if (tensors.get_tensor(TensorType::ACL_DST) == nullptr) { auto src_tensor = const_cast<ITensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation"); @@ -58,4 +60,4 @@ void ClPRelu::run(ITensorPack &tensors) IClOperator::run(tensors); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPRelu.h b/src/gpu/cl/operators/ClPRelu.h index 8084ab86cd..45ce858fb0 100644 --- a/src/gpu/cl/operators/ClPRelu.h +++ b/src/gpu/cl/operators/ClPRelu.h @@ -47,7 +47,8 @@ public: * @param[in] alpha PRelu layer parameters. Data types supported: same of @p input. * @param[out] output Destination tensor. Data type supported: same as @p input */ - void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output); + void + configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPRelu::configure() diff --git a/src/gpu/cl/operators/ClPermute.cpp b/src/gpu/cl/operators/ClPermute.cpp index ed56f97bfe..3851e22b6a 100644 --- a/src/gpu/cl/operators/ClPermute.cpp +++ b/src/gpu/cl/operators/ClPermute.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClPermute.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPermuteKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPermute::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) +void ClPermute::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm) { ARM_COMPUTE_LOG_PARAMS(src, dst, perm); auto k = std::make_unique<kernels::ClPermuteKernel>(); @@ -45,4 +47,4 @@ Status ClPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const return kernels::ClPermuteKernel::validate(src, dst, perm); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPermute.h b/src/gpu/cl/operators/ClPermute.h index 3e87329f9b..6349358a18 100644 --- a/src/gpu/cl/operators/ClPermute.h +++ b/src/gpu/cl/operators/ClPermute.h @@ -44,7 +44,10 @@ public: * @param[in] dst The dst tensor info. Data types supported: Same as @p src * @param[in] perm Permutation vector */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPermute::configure() @@ -55,4 +58,4 @@ public: }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_PERMUTE_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_PERMUTE_H */ diff --git a/src/gpu/cl/operators/ClPool2d.cpp b/src/gpu/cl/operators/ClPool2d.cpp index 3da90b8ced..e4507dc1a1 100644 --- a/src/gpu/cl/operators/ClPool2d.cpp +++ b/src/gpu/cl/operators/ClPool2d.cpp @@ -25,16 +25,19 @@ #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPool2dKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices) +void ClPool2d::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + ITensorInfo *indices) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info, indices); @@ -49,7 +52,10 @@ void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *s CLScheduler::get().tune_kernel_static(*_kernel); } -Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices) +Status ClPool2d::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &info, + const ITensorInfo *indices) { return kernels::ClPool2dKernel::validate(src, dst, info, indices); } diff --git a/src/gpu/cl/operators/ClPool2d.h b/src/gpu/cl/operators/ClPool2d.h index f353ba262e..9c2fd1c3f2 100644 --- a/src/gpu/cl/operators/ClPool2d.h +++ b/src/gpu/cl/operators/ClPool2d.h @@ -50,14 +50,21 @@ public: * @param[in] info Pooling layer parameters. * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32. 
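// Aside: when the optional indices output above is wired up, the pooling kernel also
// records, per pooled value, the flattened source position of the maximum (hence U32),
// which a max-unpooling stage can later consume. A naive sketch for a 2x2 window with
// stride 2, assuming flat row-major indexing; the helper name is illustrative:

#include <cstdint>
#include <vector>

void max_pool_2x2_with_indices(const std::vector<float> &src, int w, int h,
                               std::vector<float> &dst, std::vector<uint32_t> &indices)
{
    dst.clear();
    indices.clear();
    for (int y = 0; y + 1 < h; y += 2)
        for (int x = 0; x + 1 < w; x += 2)
        {
            uint32_t best = static_cast<uint32_t>(y * w + x);
            for (int dy = 0; dy < 2; ++dy)
                for (int dx = 0; dx < 2; ++dx)
                {
                    const uint32_t idx = static_cast<uint32_t>((y + dy) * w + (x + dx));
                    if (src[idx] > src[best])
                        best = idx;
                }
            dst.push_back(src[best]);
            indices.push_back(best); // flattened source index of the maximum
        }
}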
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &info, + const ITensorInfo *indices = nullptr); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPool3d.cpp b/src/gpu/cl/operators/ClPool3d.cpp index 7dec6c5958..d230413659 100644 --- a/src/gpu/cl/operators/ClPool3d.cpp +++ b/src/gpu/cl/operators/ClPool3d.cpp @@ -25,16 +25,18 @@ #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPool3dKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPool3d::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &info) +void ClPool3d::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info); diff --git a/src/gpu/cl/operators/ClPool3d.h b/src/gpu/cl/operators/ClPool3d.h index 7d994fd194..9fd78bfd69 100644 --- a/src/gpu/cl/operators/ClPool3d.h +++ b/src/gpu/cl/operators/ClPool3d.h @@ -51,7 +51,10 @@ public: * @param[out] dst Destination tensor info. * @param[in] info 3d Pooling layer parameters. 
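// Aside: Pooling3dLayerInfo simply extends the 2D parameters with a depth component
// for window, stride and padding; the pooled extent along each axis follows the usual
// formula. A small sketch with floor rounding assumed; pooled_dim is an illustrative
// helper, not library code:

#include <cstdio>

inline int pooled_dim(int in, int window, int stride, int pad_before, int pad_after)
{
    return (in + pad_before + pad_after - window) / stride + 1;
}

int main()
{
    // e.g. a 16x32x32 volume, 2x2x2 window, stride 2, no padding -> 8x16x16
    std::printf("%d %d %d\n", pooled_dim(16, 2, 2, 0, 0),
                pooled_dim(32, 2, 2, 0, 0), pooled_dim(32, 2, 2, 0, 0));
}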
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool3d::configure() diff --git a/src/gpu/cl/operators/ClQuantize.cpp b/src/gpu/cl/operators/ClQuantize.cpp index 47ae5cea47..8560b5553e 100644 --- a/src/gpu/cl/operators/ClQuantize.cpp +++ b/src/gpu/cl/operators/ClQuantize.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClQuantizeKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClQuantizeKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClReshape.cpp b/src/gpu/cl/operators/ClReshape.cpp index 560966f4fc..1dd5b760cb 100644 --- a/src/gpu/cl/operators/ClReshape.cpp +++ b/src/gpu/cl/operators/ClReshape.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClReshape.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClReshapeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) return kernels::ClReshapeKernel::validate(src, dst); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClScale.cpp b/src/gpu/cl/operators/ClScale.cpp index 0798b19ca0..184e2aa006 100644 --- a/src/gpu/cl/operators/ClScale.cpp +++ b/src/gpu/cl/operators/ClScale.cpp @@ -25,17 +25,20 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClScaleKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) +void ClScale::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ScaleKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info); @@ -61,4 +64,4 @@ void ClScale::run(ITensorPack &tensors) CLScheduler::get().enqueue_op(*_kernel.get(), tensors); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClScale.h b/src/gpu/cl/operators/ClScale.h index af97cf23e7..1427bb4fdc 100644 --- a/src/gpu/cl/operators/ClScale.h +++ b/src/gpu/cl/operators/ClScale.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SCALE_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -49,7 +50,8 @@ public: * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info @ref ScaleKernelInfo descriptor to be used to configure */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClScale::configure() diff --git a/src/gpu/cl/operators/ClSoftmax.cpp b/src/gpu/cl/operators/ClSoftmax.cpp index 03809553a3..2bec400597 100644 --- a/src/gpu/cl/operators/ClSoftmax.cpp +++ b/src/gpu/cl/operators/ClSoftmax.cpp @@ -22,7 +22,10 @@ * SOFTWARE. */ #include "src/gpu/cl/operators/ClSoftmax.h" + #include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" #include "src/gpu/cl/kernels/ClSoftmaxKernel.h" @@ -30,8 +33,6 @@ #include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "support/Cast.h" -#include "src/common/utils/Log.h" - using namespace arm_compute::experimental; namespace arm_compute @@ -52,7 +53,10 @@ ClSoftmax::ClSoftmax() { } -void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info) +void ClSoftmax::configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &dst, + const SoftmaxKernelInfo &info) { ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, info)); ARM_COMPUTE_LOG_PARAMS(src, dst, info); @@ -64,14 +68,15 @@ void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensor const ITensorInfo &tmp_input_info = _needs_permute ? _permuted_src_info : src; ITensorInfo &tmp_output_info = _needs_permute ? _permuted_dst_info : dst; - if(_needs_permute) + if (_needs_permute) { const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); _permute_input->configure(compile_context, &src, &_permuted_src_info, perm_info); } - DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? DataType::S32 : tmp_input_info.data_type(); - _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type); + DataType tmp_data_type = + is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? 
DataType::S32 : tmp_input_info.data_type(); + _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type); TensorShape max_sum_shape = tmp_input_info.tensor_shape(); _max_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape); @@ -83,33 +88,41 @@ void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensor _max_shift_exp_sum_kernel->configure(compile_context, tmp_input_info, _max_info, _tmp_info, _sum_info, info); _norm_kernel->configure(compile_context, _tmp_info, _sum_info, tmp_output_info, info); - if(_needs_permute) + if (_needs_permute) { const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); _permute_output->configure(compile_context, &_permuted_dst_info, &dst, perm_info); } - _aux_mem[InternalTensorIdx::SUM] = MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size()); - _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size()); - _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size()); - - _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _permuted_src_info.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _permuted_dst_info.total_size()); + _aux_mem[InternalTensorIdx::SUM] = + MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size()); + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size()); + _aux_mem[InternalTensorIdx::MAX] = + MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size()); + + _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), + MemoryLifetime::Temporary, _permuted_src_info.total_size()); + _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), + MemoryLifetime::Temporary, _permuted_dst_info.total_size()); } Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(src.num_dimensions() > 4, "Only up to 4 dimensions are supported"); ARM_COMPUTE_UNUSED(info.beta); - ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) || static_cast<int32_t>(src.num_dimensions()) <= info.axis); + ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) || + static_cast<int32_t>(src.num_dimensions()) <= info.axis); - const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()))); + const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()))); const bool needs_permute = actual_axis != 0; - if(needs_permute) + if (needs_permute) { - const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector); - TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape)); + const PermutationVector permutation_vector = + 
softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); + const TensorShape permuted_shape = + misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector); + TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&src, &input_permuted, permutation_vector)); TensorInfo output_permuted(dst.clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&output_permuted, &dst, permutation_vector)); @@ -122,9 +135,14 @@ Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const TensorShape max_sum_shape = src.tensor_shape(); max_sum_shape.set(0, 1); TensorInfo tensor_info_max(src.clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true)); - TensorInfo tensor_info_sum(src.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true)); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum)); + TensorInfo tensor_info_sum(src.clone() + ->set_tensor_shape(max_sum_shape) + .set_data_type(tmp_data_type) + .set_quantization_info(QuantizationInfo()) + .set_is_resizable(true)); + + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum)); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DNormKernel::validate(tensor_info_tmp, tensor_info_sum, dst, info)); return Status{}; @@ -139,10 +157,12 @@ void ClSoftmax::run(ITensorPack &tensors) CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors, false); CLAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max_info, tensors, false); - CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, false); - CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, false); + CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, + false); + CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, + false); - if(_needs_permute) + if (_needs_permute) { ITensorPack pack; pack.add_const_tensor(TensorType::ACL_SRC, src); @@ -152,7 +172,7 @@ void ClSoftmax::run(ITensorPack &tensors) ITensorPack sum_pack; ITensorPack norm_pack; - if(_needs_permute) + if (_needs_permute) { sum_pack.add_const_tensor(TensorType::ACL_SRC, permuted_src.get()); norm_pack.add_tensor(TensorType::ACL_DST, permuted_dst.get()); @@ -172,7 +192,7 @@ void ClSoftmax::run(ITensorPack &tensors) CLScheduler::get().enqueue_op(*_max_shift_exp_sum_kernel.get(), sum_pack, false); CLScheduler::get().enqueue_op(*_norm_kernel.get(), norm_pack, false); - if(_needs_permute) + if (_needs_permute) { ITensorPack pack; pack.add_const_tensor(TensorType::ACL_SRC, permuted_dst.get()); @@ -186,4 +206,4 @@ experimental::MemoryRequirements ClSoftmax::workspace() const return _aux_mem; } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClSoftmax.h b/src/gpu/cl/operators/ClSoftmax.h index 6c9af585d6..6c2aaaea80 100644 --- a/src/gpu/cl/operators/ClSoftmax.h +++ b/src/gpu/cl/operators/ClSoftmax.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SOFTMAX_H #include "arm_compute/runtime/CL/CLTensor.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -52,7 +53,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &dst, + const SoftmaxKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClSoftmax::configure() @@ -61,7 +65,7 @@ public: */ static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: @@ -79,7 +83,7 @@ private: std::unique_ptr<ClPermute> _permute_output; std::unique_ptr<kernels::ClLogits1DMaxShiftExpSumKernel> _max_shift_exp_sum_kernel; std::unique_ptr<kernels::ClLogits1DNormKernel> _norm_kernel; - bool _needs_permute{ false }; + bool _needs_permute{false}; TensorInfo _max_info; TensorInfo _sum_info; @@ -90,6 +94,6 @@ private: experimental::MemoryRequirements _aux_mem{}; }; -} // opencl -} // arm_compute -#endif /* ARM_COMPUTE_CL_SOFTMAX_H */
\ No newline at end of file +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_SOFTMAX_H */ diff --git a/src/gpu/cl/operators/ClSub.cpp b/src/gpu/cl/operators/ClSub.cpp index 53be04a70f..5c6d0c3184 100644 --- a/src/gpu/cl/operators/ClSub.cpp +++ b/src/gpu/cl/operators/ClSub.cpp @@ -23,17 +23,20 @@ */ #include "src/gpu/cl/operators/ClSub.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void ClSub::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info); auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>(); @@ -41,8 +44,11 @@ void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1 _kernel = std::move(k); } -Status ClSub::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status ClSub::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info); } diff --git a/src/gpu/cl/operators/ClSub.h b/src/gpu/cl/operators/ClSub.h index 7eac437143..6a97275b86 100644 --- a/src/gpu/cl/operators/ClSub.h +++ b/src/gpu/cl/operators/ClSub.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SUB_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -65,7 +66,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
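// Aside: ClSub, like ClAdd, is a thin wrapper over ClSaturatedArithmeticKernel, and
// ConvertPolicy decides whether an overflowing result wraps or saturates. A scalar
// sketch for unsigned 8-bit data, ignoring quantization offsets for simplicity:

#include <cstdint>

enum class ConvertPolicy { WRAP, SATURATE };

inline uint8_t sub_u8(uint8_t a, uint8_t b, ConvertPolicy policy)
{
    const int32_t r = static_cast<int32_t>(a) - static_cast<int32_t>(b);
    if (policy == ConvertPolicy::SATURATE)
        return static_cast<uint8_t>(r < 0 ? 0 : (r > 255 ? 255 : r)); // clamp to [0, 255]
    return static_cast<uint8_t>(r);                                   // WRAP: modulo-256 wrap-around
}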
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -73,7 +78,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy, + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl diff --git a/src/gpu/cl/operators/ClTranspose.cpp b/src/gpu/cl/operators/ClTranspose.cpp index 26feffe2b9..28da0d640a 100644 --- a/src/gpu/cl/operators/ClTranspose.cpp +++ b/src/gpu/cl/operators/ClTranspose.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClTranspose.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClTransposeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) return kernels::ClTransposeKernel::validate(src, dst); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClTransposedConvolution.cpp b/src/gpu/cl/operators/ClTransposedConvolution.cpp index 90dbe7f291..cec438faeb 100644 --- a/src/gpu/cl/operators/ClTransposedConvolution.cpp +++ b/src/gpu/cl/operators/ClTransposedConvolution.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/common/utils/Log.h" #include "src/gpu/cl/kernels/ClTransposedConvolutionKernel.h" @@ -32,8 +33,12 @@ namespace arm_compute { namespace opencl { -void ClTransposedConvolution::configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info) +void ClTransposedConvolution::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, deconv_info); @@ -43,10 +48,14 @@ void ClTransposedConvolution::configure(const CLCompileContext &compile_context, _transposed_conv_kernel = std::move(kernel_object); } -Status ClTransposedConvolution::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *output, const PadStrideInfo &deconv_info) +Status ClTransposedConvolution::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info)); return Status{}; } diff --git a/src/gpu/cl/operators/ClTransposedConvolution.h b/src/gpu/cl/operators/ClTransposedConvolution.h index 58ebc689ed..660c4f85c1 100644 --- a/src/gpu/cl/operators/ClTransposedConvolution.h +++ b/src/gpu/cl/operators/ClTransposedConvolution.h @@ -68,23 +68,30 @@ public: * @param[in] deconv_info Contains padding and stride information described in @ref PadStrideInfo. * */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClTransposedConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *output, const PadStrideInfo &deconv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr<IClKernel> _transposed_conv_kernel{ nullptr }; + std::unique_ptr<IClKernel> _transposed_conv_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.cpp b/src/gpu/cl/operators/ClWinogradConv2d.cpp
index b4163a5986..8ec96b247e 100644
--- a/src/gpu/cl/operators/ClWinogradConv2d.cpp
+++ b/src/gpu/cl/operators/ClWinogradConv2d.cpp
@@ -24,20 +24,19 @@
 #include "src/gpu/cl/operators/ClWinogradConv2d.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h"
 #include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h"
 #include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
 #include "src/gpu/cl/utils/ClAuxTensorHandler.h"
-
-#include "src/common/utils/Log.h"
 #include "support/Cast.h"
 
 using namespace arm_compute::experimental;
@@ -55,15 +54,16 @@ Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims,
     const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height);
 
     // Check if the input spatial dimensions are smaller than 4
-    const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW);
+    const bool is_input_lt4_nchw =
+        (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW);
 
-    if(kernel_max_dim == 3U)
+    if (kernel_max_dim == 3U)
     {
-        if(kernel_dims == Size2D(3U, 3U))
+        if (kernel_dims == Size2D(3U, 3U))
         {
             output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U);
         }
-        else if(kernel_dims == Size2D(3U, 1U))
+        else if (kernel_dims == Size2D(3U, 1U))
         {
             output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U);
         }
@@ -72,15 +72,13 @@
             output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U);
         }
     }
-    else if(kernel_max_dim == 5U)
+    else if (kernel_max_dim == 5U)
    {
-        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U,
-                             kernel_dims.height == 1 ? 1U : 4U);
+        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, kernel_dims.height == 1 ? 1U : 4U);
     }
-    else if(kernel_max_dim == 7U)
+    else if (kernel_max_dim == 7U)
     {
-        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U,
-                             kernel_dims.height == 1 ? 1U : 2U);
+        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, kernel_dims.height == 1 ? 1U : 2U);
     }
 
     return output_tile;
 }
 
@@ -91,11 +89,9 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz
     // Check if we want to configure a Winograd configuration which requires fast math
     using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
 
-    std::vector<WinogradConfiguration> fast_math_winograd =
-    {
+    std::vector<WinogradConfiguration> fast_math_winograd = {
         WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
-        WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))
-    };
+        WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))};
 
     auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
                             std::pair<int, int>(kernel_size.width, kernel_size.height));
 
@@ -103,8 +99,13 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz
     return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
 }
 
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
-                          const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status validate_arguments(const ITensorInfo         *src,
+                          const ITensorInfo         *weights,
+                          const ITensorInfo         *biases,
+                          const ITensorInfo         *dst,
+                          const PadStrideInfo       &conv_info,
+                          const ActivationLayerInfo &act_info,
+                          bool                       enable_fast_math)
 {
     // Get indeces for the width and height
     const size_t idx_width  = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
@@ -115,41 +116,49 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co
     const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
     const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        ((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))),
+        "Winograd only supports padding up to half kernel size");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        ((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))),
+        "Winograd only supports padding up to half kernel size");
 
     // Check if the Winograd configuration requires fast math
-    if(!enable_fast_math)
+    if (!enable_fast_math)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+            src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size),
+                                        "This Winograd configuration requires enable_fast_math=true");
     }
 
-    const WinogradInfo winograd_info = WinogradInfo(output_tile,
-                                                    kernel_size,
-                                                    input_dims,
-                                                    conv_info,
-                                                    src->data_layout());
+    const WinogradInfo winograd_info =
+        WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout());
 
     // Validate input transform
-    const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
-    const TensorInfo  input0       = src->clone()->set_tensor_shape(input0_shape);
+    const TensorShape input0_shape =
+        misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
+    const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape);
     ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info));
 
     // Validate filter transform
-    const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
-    const TensorInfo  input1       = weights->clone()->set_tensor_shape(input1_shape);
+    const TensorShape input1_shape =
+        misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+    const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
     ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info));
 
     // Validate batched matrix multiply
     TensorShape batched_mm_output_shape = input0.tensor_shape();
     batched_mm_output_shape[0]          = input1.tensor_shape()[0];
     const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
-                                                 GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f,
+                         GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
                                  GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))));
 
     // Configure output transform
-    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info));
 
     return Status{};
 }
@@ -171,8 +180,14 @@ ClWinogradConv2d::ClWinogradConv2d()
 
 ClWinogradConv2d::~ClWinogradConv2d() = default;
 
-void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
-                                 const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
+void ClWinogradConv2d::configure(const ClCompileContext    &compile_context,
+                                 ITensorInfo               *src,
+                                 ITensorInfo               *weights,
+                                 ITensorInfo               *biases,
+                                 ITensorInfo               *dst,
+                                 const PadStrideInfo       &conv_info,
+                                 const ActivationLayerInfo &act_info,
+                                 bool                       enable_fast_math)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
     ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math);
@@ -187,50 +202,53 @@ void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITenso
     const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
 
     // Check if the Winograd configuration requires fast math
-    if(!enable_fast_math)
+    if (!enable_fast_math)
     {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
-        ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1,
+                                                      DataType::F32); //disable winograd for fp16 if fast math is false.
+        ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size),
+                                 "This Winograd configuration requires enable_fast_math=true");
     }
 
-    const WinogradInfo winograd_info = WinogradInfo(output_tile,
-                                                    kernel_size,
-                                                    input_dims,
-                                                    conv_info,
-                                                    src->data_layout());
+    const WinogradInfo winograd_info =
+        WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout());
 
     _is_prepared = false;
 
     // Configure input transform
     _input_transform->configure(compile_context, src, &_input0, winograd_info);
-    _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, PixelValue());
+    _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT,
+                              PixelValue());
 
     // Configure filter transform
     _filter_transform->configure(compile_context, weights, &_input1, winograd_info);
 
     // Configure batched matrix multiply
-    _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0,
-                          false, false,
-                          GEMMLowpOutputStageInfo(),
-                          (src->data_type() == DataType::F16)));
+    _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f,
+                          GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
                                   GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)));
 
     // Configure output transform
     _output_transform->set_target(CLScheduler::get().target());
     _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info);
 
-    _aux_mem = _batched_mm.workspace();
-    const MemoryLifetime wino_wei_lifetm = std::any_of(std::begin(_aux_mem), std::end(_aux_mem), [](const auto & r)
-    {
-        return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0);
-    }) ?
-    MemoryLifetime::Prepare :
-    MemoryLifetime::Persistent;
+    _aux_mem                             = _batched_mm.workspace();
+    const MemoryLifetime wino_wei_lifetm =
+        std::any_of(std::begin(_aux_mem), std::end(_aux_mem),
+                    [](const auto &r) { return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); })
+            ? MemoryLifetime::Prepare
+            : MemoryLifetime::Persistent;
 
     _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size()));
     _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size()));
     _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size()));
 }
 
-Status ClWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
-                                  const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status ClWinogradConv2d::validate(const ITensorInfo         *src,
+                                  const ITensorInfo         *weights,
+                                  const ITensorInfo         *biases,
+                                  const ITensorInfo         *dst,
+                                  const PadStrideInfo       &conv_info,
+                                  const ActivationLayerInfo &act_info,
+                                  bool                       enable_fast_math)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
     return Status{};
@@ -251,10 +269,9 @@ void ClWinogradConv2d::run(ITensorPack &tensors)
     prepare(tensors);
 
     // Run input transform
-    ITensorPack pack_it
-    {
-        { TensorType::ACL_SRC, src },
-        { TensorType::ACL_DST, input0.get() },
+    ITensorPack pack_it{
+        {TensorType::ACL_SRC, src},
+        {TensorType::ACL_DST, input0.get()},
     };
     CLScheduler::get().enqueue_op(_border_handler, pack_it, false);
     CLScheduler::get().enqueue_op(*_input_transform, pack_it, false);
@@ -263,31 +280,31 @@ void ClWinogradConv2d::run(ITensorPack &tensors)
     ITensorPack pack_mm = tensors;
     pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get());
     pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get());
-    is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1) : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
+    is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1)
+                     : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
     _batched_mm.run(pack_mm);
 
     // Run output transform
-    ITensorPack pack_ot
-    {
-        { TensorType::ACL_SRC_0, batched_mm_output.get() },
-        { TensorType::ACL_SRC_1, biases },
-        { TensorType::ACL_DST, dst },
+    ITensorPack pack_ot{
+        {TensorType::ACL_SRC_0, batched_mm_output.get()},
+        {TensorType::ACL_SRC_1, biases},
+        {TensorType::ACL_DST, dst},
     };
     CLScheduler::get().enqueue_op(*_output_transform, pack_ot);
 }
 
 void ClWinogradConv2d::prepare(ITensorPack &tensors)
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
-        auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+        auto weights =
+            utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
         ICLTensor *in1_aux = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(3)));
 
         CLAuxTensorHandler input1(_input1, *in1_aux);
-        ITensorPack pack_ft
-        {
-            { TensorType::ACL_SRC, weights },
-            { TensorType::ACL_DST, input1.get() },
+        ITensorPack pack_ft{
+            {TensorType::ACL_SRC, weights},
+            {TensorType::ACL_DST, input1.get()},
         };
         // Run filter transform and mark original weights as unused
         CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false);
@@ -308,4 +325,4 @@ experimental::MemoryRequirements ClWinogradConv2d::workspace() const
     return _aux_mem;
 }
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
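Two pieces of logic in the hunks above are worth a concrete illustration: winograd_output_tile() picks a 4x4 output tile for a 3x3 kernel (falling back to 2x2 for small NCHW inputs), and validate_arguments() rejects F16 inputs and the 4x4/5x5 or 2x2/7x7 configurations unless enable_fast_math is set. The sketch below exercises exactly that fast-math gate; it is not code from this patch, and the shapes, layout, and helper name are assumptions.

// Hypothetical sketch: a 3x3 F16 Winograd convolution should be rejected
// without fast math and accepted with it, per the validation above.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "src/gpu/cl/operators/ClWinogradConv2d.h"

using namespace arm_compute;

bool winograd_f16_needs_fast_math()
{
    TensorInfo src(TensorShape(64U, 56U, 56U), 1, DataType::F16);        // C, W, H
    TensorInfo weights(TensorShape(64U, 3U, 3U, 64U), 1, DataType::F16); // Cin, Kw, Kh, Cout
    TensorInfo dst(TensorShape(64U, 56U, 56U), 1, DataType::F16);
    src.set_data_layout(DataLayout::NHWC);
    weights.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);

    const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1 keeps 56x56

    // biases passed as nullptr here purely to keep the sketch short.
    const bool ok_strict = bool(opencl::ClWinogradConv2d::validate(
        &src, &weights, nullptr, &dst, conv_info, ActivationLayerInfo(), /* enable_fast_math */ false));
    const bool ok_fast = bool(opencl::ClWinogradConv2d::validate(
        &src, &weights, nullptr, &dst, conv_info, ActivationLayerInfo(), /* enable_fast_math */ true));

    return !ok_strict && ok_fast; // expected: F16 only passes with fast math
}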
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.h b/src/gpu/cl/operators/ClWinogradConv2d.h
index eb2f7a72b2..54ec1a1737 100644
--- a/src/gpu/cl/operators/ClWinogradConv2d.h
+++ b/src/gpu/cl/operators/ClWinogradConv2d.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_WINOGRADCONV2D_H
 
 #include "arm_compute/runtime/CL/CLTensor.h"
+
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
@@ -41,7 +42,7 @@ namespace kernels
 class ClWinogradInputTransformKernel;
 class ClWinogradFilterTransformKernel;
 class ClWinogradOutputTransformKernel;
-} // kernels
+} // namespace kernels
 /** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels:
  *
  * -# @ref kernels::ClWinogradInputTransformKernel
@@ -93,20 +94,31 @@ public:
      * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
      *                             available which may introduce a drop of accuracy as well. Default is false
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
-                   const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src,
+                   ITensorInfo               *weights,
+                   ITensorInfo               *biases,
+                   ITensorInfo               *dst,
+                   const PadStrideInfo       &conv_info,
+                   const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                   bool                       enable_fast_math = false);
     /** Static function to check if given info will lead to a valid configuration
      *
     * Similar to ClWinogradConv2d::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
-                           const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+    static Status validate(const ITensorInfo         *src,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *dst,
+                           const PadStrideInfo       &conv_info,
+                           const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                           bool                       enable_fast_math = false);
 
     // Inherited method overridden
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
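The header above only declares the run()/prepare()/workspace() overrides; the .cpp hunks earlier show what they do. As a hedged sketch (not from the patch: the tensor variables, their allocation, and the aux-tensor workspace wiring are all assumed and omitted here), the two-phase execution looks roughly like this:

// Hypothetical driver for an already-configured ClWinogradConv2d `op`.
// src/weights/biases/dst are assumed to be allocated ICLTensor pointers.
void run_winograd(arm_compute::opencl::ClWinogradConv2d &op,
                  arm_compute::ICLTensor *src_tensor,
                  arm_compute::ICLTensor *weights_tensor,
                  arm_compute::ICLTensor *biases_tensor,
                  arm_compute::ICLTensor *dst_tensor)
{
    arm_compute::ITensorPack pack;
    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_0, src_tensor);
    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, weights_tensor);
    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_2, biases_tensor);
    pack.add_tensor(arm_compute::TensorType::ACL_DST, dst_tensor);

    op.prepare(pack); // one-off filter transform; original weights marked unused
    op.run(pack);     // border fill + input transform -> batched GEMM -> output transform
}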