From eaefd002a5d6509dd5f12e98b538c99b33c2c1ee Mon Sep 17 00:00:00 2001 From: Anthony Barbier Date: Fri, 20 Jul 2018 17:49:35 +0100 Subject: COMPMID-1419: Make NEGEMMAssemblyDispatch dynamically typed instead of templated This makes it easier to integrate in GEMMLowpMatrixMultiplyCore Change-Id: Ibf80803f016a2e6a24d943ffafb50b48f04ec545 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/140868 Reviewed-by: Georgios Pinitas Tested-by: Jenkins --- arm_compute/core/CPP/Validate.h | 74 +++++ arm_compute/runtime/NEON/functions/NEGEMM.h | 2 +- .../NEON/functions/NEGEMMAssemblyDispatch.h | 89 +++--- .../NEON/functions/NEGEMMConvolutionLayer.h | 2 +- .../NEGEMMLowpAssemblyMatrixMultiplyCore.h | 15 +- .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 3 +- .../NEON/functions/NEWinogradConvolutionLayer.h | 4 +- src/core/NEON/kernels/NEActivationLayerKernel.cpp | 3 +- .../NEON/kernels/NEArithmeticAdditionKernel.cpp | 2 + .../NEON/kernels/NEArithmeticSubtractionKernel.cpp | 2 + .../kernels/NEBatchNormalizationLayerKernel.cpp | 2 + src/core/NEON/kernels/NECol2ImKernel.cpp | 1 + .../NEConvertFullyConnectedWeightsKernel.cpp | 1 + .../NEON/kernels/NEDepthConcatenateLayerKernel.cpp | 1 + src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp | 1 + .../kernels/NEDepthwiseVectorToTensorKernel.cpp | 2 + .../kernels/NEDepthwiseWeightsReshapeKernel.cpp | 1 + .../kernels/NEDirectConvolutionLayerKernel.cpp | 2 + .../NEDirectConvolutionLayerOutputStageKernel.cpp | 3 +- src/core/NEON/kernels/NEFillBorderKernel.cpp | 1 + .../NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 1 + .../kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp | 2 + .../NEON/kernels/NEGEMMMatrixAdditionKernel.cpp | 2 + .../NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp | 2 + src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 1 + src/core/NEON/kernels/NEIm2ColKernel.cpp | 2 + .../NELocallyConnectedMatrixMultiplyKernel.cpp | 3 +- .../NEON/kernels/NENormalizationLayerKernel.cpp | 2 + src/core/NEON/kernels/NEPermuteKernel.cpp | 1 + .../kernels/NEPixelWiseMultiplicationKernel.cpp | 4 +- src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 2 + src/core/NEON/kernels/NEReshapeLayerKernel.cpp | 2 + src/core/NEON/kernels/NESoftmaxLayerKernel.cpp | 11 +- src/core/NEON/kernels/NETransposeKernel.cpp | 1 + src/core/NEON/kernels/NEWeightsReshapeKernel.cpp | 1 + .../NEON/functions/NEGEMMAssemblyDispatch.cpp | 300 ++++++++++++--------- .../NEGEMMLowpAssemblyMatrixMultiplyCore.cpp | 20 +- .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 36 +-- 38 files changed, 353 insertions(+), 251 deletions(-) create mode 100644 arm_compute/core/CPP/Validate.h diff --git a/arm_compute/core/CPP/Validate.h b/arm_compute/core/CPP/Validate.h new file mode 100644 index 0000000000..1799f9003e --- /dev/null +++ b/arm_compute/core/CPP/Validate.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CPP_VALIDATE_H__ +#define __ARM_COMPUTE_CPP_VALIDATE_H__ + +#include "arm_compute/core/Validate.h" + +namespace arm_compute +{ +/** Return an error if the data type of the passed tensor info is FP16 and FP16 support is not compiled in. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor_info Tensor info to validate. + * + * @return Status + */ +inline arm_compute::Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, + const ITensorInfo *tensor_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line); +#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor_info->data_type() == DataType::F16, + function, file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above"); +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + return arm_compute::Status {}; +} + +/** Return an error if the data type of the passed tensor is FP16 and FP16 support is not compiled in. + * + * @param[in] function Function in which the error occurred. + * @param[in] file Name of the file where the error occurred. + * @param[in] line Line on which the error occurred. + * @param[in] tensor Tensor to validate. 
+ * + * @return Status + */ +inline arm_compute::Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, + const ITensor *tensor) +{ + ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line); + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(function, file, line, tensor->info())); + return arm_compute::Status{}; +} + +#define ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(tensor) \ + ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_cpu_fp16(__func__, __FILE__, __LINE__, tensor)) + +#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor) \ + ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(__func__, __FILE__, __LINE__, tensor)) +} // namespace arm_compute +#endif /* __ARM_COMPUTE_CPP_VALIDATE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h index 523f1d33a1..36c9587969 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMM.h +++ b/arm_compute/runtime/NEON/functions/NEGEMM.h @@ -85,7 +85,7 @@ private: NEGEMMInterleave4x4Kernel _interleave_kernel; NEGEMMTranspose1xWKernel _transpose_kernel; NEGEMMMatrixMultiplyKernel _mm_kernel; - NEGEMMAssemblyDispatchF32 _asm_glue; + NEGEMMAssemblyDispatch _asm_glue; NEGEMMMatrixAdditionKernel _ma_kernel; Tensor _tmp_a; Tensor _tmp_b; diff --git a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h index 1c9ecb088e..382ef1caba 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h @@ -35,7 +35,6 @@ namespace arm_compute { /** Assembly kernel glue */ -template class NEGEMMAssemblyDispatch : public IFunction { public: @@ -43,12 +42,21 @@ public: NEGEMMAssemblyDispatch(std::shared_ptr memory_manager = nullptr); /** Prevent instances of this class from being copy constructed */ - NEGEMMAssemblyDispatch(const NEGEMMAssemblyDispatch &) = delete; + NEGEMMAssemblyDispatch(const NEGEMMAssemblyDispatch &) = delete; /** Prevent instances of this class from being copied */ - NEGEMMAssemblyDispatch &operator=(const NEGEMMAssemblyDispatch &) = delete; - NEGEMMAssemblyDispatch(NEGEMMAssemblyDispatch &&) = default; - NEGEMMAssemblyDispatch &operator=(NEGEMMAssemblyDispatch &&) = default; - ~NEGEMMAssemblyDispatch() = default; + NEGEMMAssemblyDispatch &operator=(const NEGEMMAssemblyDispatch &) = delete; + NEGEMMAssemblyDispatch(NEGEMMAssemblyDispatch &&) = default; + NEGEMMAssemblyDispatch &operator=(NEGEMMAssemblyDispatch &&) = default; + ~NEGEMMAssemblyDispatch() = default; + + class IFallback + { + public: + virtual void run() = 0; + virtual void prepare() = 0; + virtual bool is_configured() const = 0; + virtual ~IFallback() = default; + }; private: /** ACL Function */ @@ -68,53 +76,9 @@ private: */ bool create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint); - //Fallback: use arm_gemm's AssemblyGemm: - class Fallback - { -#ifndef DOXYGEN_SKIP_THIS - public: - /** Configures the arrays pointers and strides in the assembly kernel and executes the assembly kernel. 
- * The call to set_arrays is needed to deal with the input sizes containing batches (dims > 2) - */ - void run(); - void configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs &args, MemoryGroup &memory_group); - void prepare(); - bool is_configured() const; -#endif /* DOXYGEN_SKIP_THIS */ - - private: - /** Allocate a workspace tensor. - * - * @param[in] workspace_size Size to allocate. - * @param[in] memory_group Tensor memory group. - * @param[in] alignment Workspace memory alignment. - */ - void allocate_workspace(size_t workspace_size, MemoryGroup *memory_group, size_t alignment); - - /** Assembly Gemm kernel */ - std::unique_ptr> _gemm_kernel_asm{ nullptr }; - /** Optimised NEON kernel */ - std::unique_ptr _optimised_kernel{ nullptr }; - /** Input A */ - const ITensor *_a - { - nullptr - }; - /** Input B */ - const ITensor *_b - { - nullptr - }; - /** Output */ - ITensor *_d{ nullptr }; - /** GEMM workspace */ - Tensor _workspace{}; - /** Pre-transpose tensor */ - Tensor _pretranspose{}; - /** Prepared flag */ - bool _is_prepared{ false }; - } _arm_gemm; /**< Fallback in case ACL doesn't have a function */ - MemoryGroup _memory_group; /**< Function memory group */ + /** Interface for the arm_gemm fallback */ + std::unique_ptr _arm_gemm; + MemoryGroup _memory_group; /**< Function memory group */ public: /** If supported create an ACL function else fallback to the arm_gemm function. * @@ -126,6 +90,19 @@ public: * @param[in] pretranspose_hint Can the B tensor can be pretransposed (ie shared across invocations)? */ void configure(const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint); + + /** Indicates whether or not this function can be used to process the given parameters. + * + * @param[in] a Input tensor (Matrix A) + * @param[in] b Input tensor (Matrix B) + * @param[in] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. + * @param[in] alpha Scalar multiplier to apply to AB matrix product. + * @param[in] beta Scalar multiplier to apply to input D matrix before adding product. + * @param[in] pretranspose_hint Can the B tensor can be pretransposed (ie shared across invocations)? + * + * @return a status. + */ + static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, float alpha, float beta, bool pretranspose_hint); /** Was the function successfully configured ? 
* * @return True if the function is configured and ready to run @@ -137,11 +114,5 @@ public: void run() override; }; -/** Float 32 assembly dispatch kernel */ -using NEGEMMAssemblyDispatchF32 = NEGEMMAssemblyDispatch; -/** Uint 8 to Uint 32 assembly dispatch kernel */ -using NEGEMMAssemblyDispatchU8U32 = NEGEMMAssemblyDispatch; -/** Int 8 to Int 32 assembly dispatch kernel */ -using NEGEMMAssemblyDispatchS8S32 = NEGEMMAssemblyDispatch; } // namespace arm_compute #endif /* __ARM_COMPUTE_NEGEMMASSEMBLYDISPATCH_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h index 1564b6c983..8f41462b0b 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h @@ -169,7 +169,7 @@ private: private: MemoryGroup _memory_group; - NEGEMMAssemblyDispatchF32 _asm_glue; + NEGEMMAssemblyDispatch _asm_glue; NEIm2ColKernel _input_im2col_kernel; NEGEMMInterleave4x4Kernel _input_interleave_kernel; NEConvolutionLayerReshapeWeights _reshape_weights; diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h index b6672d7584..27be34d1f8 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h @@ -58,14 +58,13 @@ public: void run() override; private: - MemoryGroup _memory_group; - NEGEMMAssemblyDispatchU8U32 _asm_glue_unsigned; - NEGEMMAssemblyDispatchS8S32 _asm_glue_signed; - std::unique_ptr _mm_kernel; - std::unique_ptr _mtx_a_reshape_kernel; - std::unique_ptr _mtx_b_reshape_kernel; - Tensor _tmp_a; - Tensor _tmp_b; + MemoryGroup _memory_group; + NEGEMMAssemblyDispatch _asm_glue; + std::unique_ptr _mm_kernel; + std::unique_ptr _mtx_a_reshape_kernel; + std::unique_ptr _mtx_b_reshape_kernel; + Tensor _tmp_a; + Tensor _tmp_b; }; } #endif /*__ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H__ */ diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h index 96ac7bb7e0..3db76f423c 100644 --- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h +++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h @@ -98,8 +98,7 @@ public: private: MemoryGroup _memory_group; - NEGEMMAssemblyDispatchU8U32 _asm_glue_unsigned; - NEGEMMAssemblyDispatchS8S32 _asm_glue_signed; + NEGEMMAssemblyDispatch _asm_glue; std::unique_ptr _mm_kernel; std::unique_ptr _mtx_a_reshape_kernel; std::unique_ptr _mtx_b_reshape_kernel; diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h index 384fbf893b..5da63311e0 100644 --- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h +++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h @@ -43,7 +43,7 @@ class ITensor; * -# @ref NEWinogradLayerTransformWeightsKernel (executed only once in the first call to the run() method ) * -# @ref NEWinogradLayerTransformInputKernel * -# @ref NEWinogradLayerTransformOutputKernel - * -# @ref NEGEMMAssemblyDispatchF32 + * -# @ref NEGEMMAssemblyDispatch * -# @ref CPPPermute (three times: weights, input and output) * * @note Some Winograd configurations (i.e. 
F(2x2, 5x5), F(4x4, 5x5)) are supported only with enable_fast_math = true @@ -103,7 +103,7 @@ public: private: MemoryGroup _memory_group; - NEGEMMAssemblyDispatchF32 _asm_glue; + NEGEMMAssemblyDispatch _asm_glue; std::unique_ptr _transform_input_kernel; std::unique_ptr _transform_output_kernel; std::unique_ptr _transform_weights_kernel; diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp index bdc93ed1b8..1dad531a40 100644 --- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/NEON/NEAsymm.h" @@ -44,7 +45,7 @@ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8, DataType::F16, DataType::F32); // Checks performed when output is configured diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp index f8e2b6d73e..a6102b159f 100644 --- a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp +++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" @@ -330,6 +331,7 @@ Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, { ARM_COMPUTE_UNUSED(policy); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp index 5a162e3b2c..3c76548b0a 100644 --- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp +++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -320,6 +321,7 @@ void sub_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy) { ARM_COMPUTE_UNUSED(policy); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index abfaa0cd26..ac1fc393c4 
100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/NEON/NEMath.h" @@ -43,6 +44,7 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info) { ARM_COMPUTE_UNUSED(epsilon); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp index d09d174e4f..b9c7a9ac3b 100644 --- a/src/core/NEON/kernels/NECol2ImKernel.cpp +++ b/src/core/NEON/kernels/NECol2ImKernel.cpp @@ -50,6 +50,7 @@ TensorShape get_output_shape(const ITensorInfo *input, const Size2D &convolved_d Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims) { + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, diff --git a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp index 198565b1d5..b6d166d30e 100644 --- a/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp +++ b/src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.cpp @@ -66,6 +66,7 @@ void NEConvertFullyConnectedWeightsKernel::configure(const ITensor *input, ITens Status NEConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout) { + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp index 1b937b5be8..8c875cdb2d 100644 --- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp @@ -95,6 +95,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, unsi Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp index 91b29cdf03..92ee8d5809 100644 --- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp @@ -42,6 +42,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier) { ARM_COMPUTE_UNUSED(conv_info); + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && has_bias); diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp index fe141bef56..2d17c237a3 100644 --- a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h" #include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" @@ -43,6 +44,7 @@ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, size_t conv_w, size_t conv_h) { + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32); if(output->total_size() != 0) diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp index 2c7a379c25..22a2cf8f2d 100644 --- a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp @@ -80,6 +80,7 @@ void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases) { + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && (biases != nullptr)); diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp index 54a046846a..59244c876c 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" #include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" @@ -972,6 +973,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp index 6a373de1c3..eefbd98dd8 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h" #include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -43,7 +44,7 @@ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp index 3d08cafa93..aef4d4865a 100644 --- a/src/core/NEON/kernels/NEFillBorderKernel.cpp +++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp @@ -105,6 +105,7 @@ NEFillBorderKernel::NEFillBorderKernel() void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) { + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. 
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp index 6519a39b9c..5483602786 100644 --- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp +++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp @@ -44,6 +44,7 @@ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp index 421a6f0ef9..42353ed0eb 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" #include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -43,6 +44,7 @@ namespace { inline Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases) { + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(accum); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp index d02504329a..cd6aa553db 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/NEON/NEFixedPoint.h" @@ -100,6 +101,7 @@ NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel() void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta) { + ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp index 196398a2de..0ca24748af 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/AccessWindowStatic.h" #include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" @@ -810,6 +811,7 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i { ARM_COMPUTE_UNUSED(alpha); + 
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input0); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp index 4517f46139..2e14e7a8c0 100644 --- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp @@ -54,6 +54,7 @@ TensorShape get_output_shape(const ITensorInfo *input) Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp index f03bc49ed3..16525ac22e 100644 --- a/src/core/NEON/kernels/NEIm2ColKernel.cpp +++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -46,6 +47,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation) { + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias); diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp index 099626d259..4d3ec46e34 100644 --- a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h" #include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" @@ -305,7 +306,7 @@ void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, IT Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) { - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input0); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32); diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp index 253a93f196..cb1996f33e 100644 --- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp @@ -23,6 
+23,7 @@ */ #include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/NEON/NEFixedPoint.h" @@ -39,6 +40,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared); diff --git a/src/core/NEON/kernels/NEPermuteKernel.cpp b/src/core/NEON/kernels/NEPermuteKernel.cpp index e9bc8effc6..8d3fd88329 100644 --- a/src/core/NEON/kernels/NEPermuteKernel.cpp +++ b/src/core/NEON/kernels/NEPermuteKernel.cpp @@ -45,6 +45,7 @@ namespace { Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm) { + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp index 0ec7e823a1..a4f51436b4 100644 --- a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp +++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp @@ -23,6 +23,7 @@ */ #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" @@ -61,6 +62,7 @@ inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *i ARM_COMPUTE_UNUSED(overflow_policy); ARM_COMPUTE_UNUSED(rounding_policy); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); @@ -607,4 +609,4 @@ BorderSize NEPixelWiseMultiplicationKernel::border_size() const const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0)); const unsigned int border = std::min(num_elems_processed_per_iteration - 1U, replicateSize); return BorderSize(0, border, 0, 0); -} \ No newline at end of file +} diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp index e586b72d30..2ca6090674 100644 --- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp +++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h" #include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -139,6 +140,7 @@ Status validate_arguments(const ITensorInfo 
*input, const ITensorInfo *output, c std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); static const std::set supported_pool_sizes = { 2, 3 }; + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(input->data_type())); diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp index d6f470445f..a8a7440270 100644 --- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp +++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h" #include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" @@ -59,6 +60,7 @@ inline void reshape_tensor(const Window &window, const ITensor *input, ITensor * void NEReshapeLayerKernel::configure(const ITensor *input, ITensor *output) { + ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(input); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(output); diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp index 9946f002de..4041b623b1 100644 --- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp +++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp @@ -24,6 +24,7 @@ #include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h" #include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" @@ -320,11 +321,8 @@ namespace { Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output) { -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32); -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ // Validate in case of configured output if(output.total_size() != 0) @@ -486,11 +484,8 @@ Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensor { ARM_COMPUTE_UNUSED(beta); // Check input -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); -#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32); -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input.data_type()); diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp index 2630159561..32a5acd2f4 100644 --- a/src/core/NEON/kernels/NETransposeKernel.cpp +++ b/src/core/NEON/kernels/NETransposeKernel.cpp @@ -74,6 +74,7 @@ unsigned int num_elems_processed(size_t 
element_size) Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp index f398409b26..2c9ad923aa 100644 --- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp +++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp @@ -105,6 +105,7 @@ TensorShape get_output_shape(const ITensorInfo *input, bool has_bias) Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output) { + //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions. ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp index f4710fab84..e60fe80e0f 100644 --- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp +++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp @@ -23,20 +23,31 @@ */ #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" +#include "arm_compute/core/CPP/Validate.h" #include "arm_compute/core/NEON/kernels/assembly/NEGEMMNativeWrapperKernel.h" #include "arm_compute/runtime/NEON/NEScheduler.h" #include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h" +#include + namespace arm_compute { +namespace +{ template -NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr memory_manager) - : _function(nullptr), _arm_gemm(), _memory_group(std::move(memory_manager)) +std::unique_ptr create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint) { + ARM_COMPUTE_UNUSED(method); + ARM_COMPUTE_UNUSED(a); + ARM_COMPUTE_UNUSED(b); + ARM_COMPUTE_UNUSED(d); + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_UNUSED(pretranspose_hint); + return nullptr; } - template <> -bool NEGEMMAssemblyDispatch::create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint) +std::unique_ptr create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint) { ARM_COMPUTE_UNUSED(method); ARM_COMPUTE_UNUSED(a); @@ -54,132 +65,59 @@ bool NEGEMMAssemblyDispatch::create_function(arm_gemm::GemmMethod kernel->configure(a, b, d, alpha, beta); auto function = support::cpp14::make_unique(); function->configure(std::move(kernel)); - _function = std::move(function); - return true; + return std::move(function); } #endif /* __aarch64__ */ default: - return false; + return nullptr; } } +/** Fallback in case ACL doesn't have a function */ template -bool NEGEMMAssemblyDispatch::create_function(arm_gemm::GemmMethod method, const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint) +class Fallback : public NEGEMMAssemblyDispatch::IFallback { - ARM_COMPUTE_UNUSED(method); - ARM_COMPUTE_UNUSED(a); - ARM_COMPUTE_UNUSED(b); - 
ARM_COMPUTE_UNUSED(d); - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(beta); - ARM_COMPUTE_UNUSED(pretranspose_hint); - return false; -} - -template -void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint) -{ - INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d); - const CPUInfo &ci = NEScheduler::get().cpu_info(); - unsigned int num_threads = NEScheduler::get().num_threads(); - - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint); - - //Try to create an ACL function: - if(!create_function(arm_gemm::get_gemm_method(args), a, b, d, alpha, beta, pretranspose_hint)) +public: + void configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs &args, MemoryGroup &memory_group); + void run() override; + void prepare() override; + bool is_configured() const override; + +private: + /** Allocate a workspace tensor. + * + * @param[in] workspace_size Size to allocate. + * @param[in] memory_group Tensor memory group. + * @param[in] alignment Workspace memory alignment. + */ + void allocate_workspace(size_t workspace_size, MemoryGroup *memory_group, size_t alignment); + + /** Assembly Gemm kernel */ + std::unique_ptr> _gemm_kernel_asm{ nullptr }; + /** Optimised NEON kernel */ + std::unique_ptr _optimised_kernel{ nullptr }; + /** Input A */ + const ITensor *_a { - //Fallback onto arm_gemm function if ACL doesn't support this method. - _arm_gemm.configure(a, b, d, args, _memory_group); - } -} - -template -void NEGEMMAssemblyDispatch::prepare() -{ - if(_function != nullptr) - { - _function->prepare(); - } - else + nullptr + }; + /** Input B */ + const ITensor *_b { - _arm_gemm.prepare(); - } -} - -template -bool NEGEMMAssemblyDispatch::is_configured() const -{ - return _arm_gemm.is_configured() || _function != nullptr; -} + nullptr + }; + /** Output */ + ITensor *_d{ nullptr }; + /** GEMM workspace */ + Tensor _workspace{}; + /** Pre-transpose tensor */ + Tensor _pretranspose{}; + /** Prepared flag */ + bool _is_prepared{ false }; +}; template -void NEGEMMAssemblyDispatch::run() -{ - _memory_group.acquire(); - if(_function != nullptr) - { - _function->run(); - } - else - { - _arm_gemm.run(); - } - _memory_group.release(); -} - -#ifndef __aarch64__ -template <> -void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint) -{ - // arm_gemm::gemm for 8bit only exists for aarch64 - ARM_COMPUTE_UNUSED(a); - ARM_COMPUTE_UNUSED(b); - ARM_COMPUTE_UNUSED(d); - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(beta); - ARM_COMPUTE_UNUSED(pretranspose_hint); - ARM_COMPUTE_ERROR("Not supported for this architecture"); -} - -template <> -void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint) -{ - // arm_gemm::gemm for 8bit only exists for aarch64 - ARM_COMPUTE_UNUSED(a); - ARM_COMPUTE_UNUSED(b); - ARM_COMPUTE_UNUSED(d); - ARM_COMPUTE_UNUSED(alpha); - ARM_COMPUTE_UNUSED(beta); - ARM_COMPUTE_UNUSED(pretranspose_hint); - ARM_COMPUTE_ERROR("Not supported for this architecture"); -} - -template <> -void NEGEMMAssemblyDispatch::Fallback::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs &args, MemoryGroup &memory_group) -{ - // arm_gemm::gemm for 8bit only exists for aarch64 - ARM_COMPUTE_UNUSED(a); - ARM_COMPUTE_UNUSED(b); - 
ARM_COMPUTE_UNUSED(d); - ARM_COMPUTE_UNUSED(args); - ARM_COMPUTE_UNUSED(memory_group); - ARM_COMPUTE_ERROR("Not supported for this architecture"); -} - -template <> -void NEGEMMAssemblyDispatch::Fallback::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs &args, MemoryGroup &memory_group) -{ - // arm_gemm::gemm for 8bit only exists for aarch64 - ARM_COMPUTE_UNUSED(a); - ARM_COMPUTE_UNUSED(b); - ARM_COMPUTE_UNUSED(d); - ARM_COMPUTE_UNUSED(args); - ARM_COMPUTE_UNUSED(memory_group); - ARM_COMPUTE_ERROR("Not supported for this architecture"); -} -#endif // aarch64 -template -void NEGEMMAssemblyDispatch::Fallback::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs &args, MemoryGroup &memory_group) +void Fallback::configure(const ITensor *a, const ITensor *b, ITensor *d, arm_gemm::GemmArgs &args, MemoryGroup &memory_group) { _gemm_kernel_asm = arm_gemm::gemm(args, nullptr); if(_gemm_kernel_asm == nullptr) @@ -228,7 +166,7 @@ void NEGEMMAssemblyDispatch::Fallback::configure(const IT } template -void NEGEMMAssemblyDispatch::Fallback::prepare() +void Fallback::prepare() { if(!_is_prepared) { @@ -249,7 +187,7 @@ void NEGEMMAssemblyDispatch::Fallback::prepare() } template -void NEGEMMAssemblyDispatch::Fallback::allocate_workspace(size_t workspace_size, MemoryGroup *memory_group, size_t alignment) +void Fallback::allocate_workspace(size_t workspace_size, MemoryGroup *memory_group, size_t alignment) { ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "size cannot be 0"); _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment); @@ -261,13 +199,13 @@ void NEGEMMAssemblyDispatch::Fallback::allocate_workspace } template -bool NEGEMMAssemblyDispatch::Fallback::is_configured() const +bool Fallback::is_configured() const { return _optimised_kernel != nullptr; } template -void NEGEMMAssemblyDispatch::Fallback::run() +void Fallback::run() { const int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput); const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput); @@ -312,7 +250,119 @@ void NEGEMMAssemblyDispatch::Fallback::run() NEScheduler::get().schedule(_optimised_kernel.get(), Window::DimX); } -template class NEGEMMAssemblyDispatch; -template class NEGEMMAssemblyDispatch; -template class NEGEMMAssemblyDispatch; +template +void create_function_or_arm_gemm(std::unique_ptr &acl_function, std::unique_ptr &arm_gemm, MemoryGroup &memory_group, const ITensor *a, const ITensor *b, + ITensor *d, float alpha, float beta, bool pretranspose_hint) +{ + INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + unsigned int num_threads = NEScheduler::get().num_threads(); + + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, false, false, alpha, beta, num_threads, pretranspose_hint); + + //Try to create an ACL function: + acl_function = create_function(arm_gemm::get_gemm_method(args), a, b, d, alpha, beta, pretranspose_hint); + if(acl_function == nullptr) + { + //Fallback onto arm_gemm function if ACL doesn't support this method. 
+ auto fallback = support::cpp14::make_unique>(); + fallback->configure(a, b, d, args, memory_group); + arm_gemm = std::move(fallback); + } +} + +} //namespace + +NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr memory_manager) + : _function(nullptr), _arm_gemm(nullptr), _memory_group(std::move(memory_manager)) +{ +} + +Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, float alpha, float beta, bool pretranspose_hint) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_UNUSED(pretranspose_hint); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a); +#ifndef __aarch64__ + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 || a->data_type() == DataType::S8 || a->data_type() == DataType::QASYMM8, "8bit integer types only supported for aarch64"); +#endif /* __aarch64__ */ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::U8, DataType::QASYMM8, DataType::S8, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((a->data_type() == DataType::QASYMM8 || a->data_type() == DataType::U8) && d->data_type() != DataType::U32, "Only U32 output supported for U8 / QASYMM8 input"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input"); + return Status{}; +} + +void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, ITensor *d, float alpha, float beta, bool pretranspose_hint) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a); + ARM_COMPUTE_ERROR_ON_NULLPTR(b); + ARM_COMPUTE_ERROR_ON_NULLPTR(d); + + //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured() + if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), d->info(), alpha, beta, pretranspose_hint)) + { + return; + } + + switch(a->info()->data_type()) + { + case DataType::F32: + create_function_or_arm_gemm(_function, _arm_gemm, _memory_group, a, b, d, alpha, beta, pretranspose_hint); + break; +#ifdef __aarch64__ + case DataType::U8: + case DataType::QASYMM8: + create_function_or_arm_gemm(_function, _arm_gemm, _memory_group, a, b, d, alpha, beta, pretranspose_hint); + break; + case DataType::S8: + create_function_or_arm_gemm(_function, _arm_gemm, _memory_group, a, b, d, alpha, beta, pretranspose_hint); + break; +#endif /* __aarch64__ */ +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + create_function_or_arm_gemm(_function, _arm_gemm, _memory_group, a, b, d, alpha, beta, pretranspose_hint); + break; +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + default: + break; + } +} + +void NEGEMMAssemblyDispatch::prepare() +{ + if(_function != nullptr) + { + _function->prepare(); + } + else + { + ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); + _arm_gemm->prepare(); + } +} + +bool NEGEMMAssemblyDispatch::is_configured() const +{ + return (_arm_gemm != nullptr && _arm_gemm->is_configured()) || _function != nullptr; +} + +void NEGEMMAssemblyDispatch::run() +{ + _memory_group.acquire(); + if(_function != nullptr) + { + 
_function->run(); + } + else + { + ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr); + _arm_gemm->run(); + } + _memory_group.release(); +} } //namespace arm_compute diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp index 9b5d02ca44..47c33587a0 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp @@ -38,8 +38,7 @@ using namespace arm_compute; NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _asm_glue_unsigned(memory_manager), _asm_glue_signed(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), - _tmp_b() + : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b() { } @@ -56,16 +55,11 @@ void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITe switch(a->info()->data_type()) { case DataType::S8: - { - _asm_glue_signed.configure(a, b, output, 1.f, 0.f, true); - run_optimised = _asm_glue_unsigned.is_configured(); - break; - } case DataType::QASYMM8: case DataType::U8: { - _asm_glue_unsigned.configure(a, b, output, 1.f, 0.f, true); - run_optimised = _asm_glue_unsigned.is_configured(); + _asm_glue.configure(a, b, output, 1.f, 0.f, true); + run_optimised = _asm_glue.is_configured(); break; } default: @@ -133,13 +127,9 @@ void NEGEMMLowpAssemblyMatrixMultiplyCore::run() NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY); } - if(_asm_glue_unsigned.is_configured()) - { - _asm_glue_unsigned.run(); - } - else if(_asm_glue_signed.is_configured()) + if(_asm_glue.is_configured()) { - _asm_glue_signed.run(); + _asm_glue.run(); } else { diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index a57271c17c..773492d0ce 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -41,9 +41,9 @@ using namespace arm_compute; using namespace arm_compute::misc::shape_calculator; NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr memory_manager) - : _memory_group(memory_manager), _asm_glue_unsigned(memory_manager), _asm_glue_signed(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), - _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _original_b(nullptr), _a_offset(0), _b_offset(0), - _run_vector_matrix_multiplication(false), _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false) + : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), + _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), + _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false) { } @@ -67,17 +67,12 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, #ifdef __aarch64__ 
switch(a->info()->data_type()) { - case DataType::S8: - { - _asm_glue_signed.configure(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run); - _dot_product_path = _asm_glue_signed.is_configured(); - break; - } case DataType::QASYMM8: case DataType::U8: + case DataType::S8: { - _asm_glue_unsigned.configure(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run); - _dot_product_path = _asm_glue_unsigned.is_configured(); + _asm_glue.configure(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run); + _dot_product_path = _asm_glue.is_configured(); break; } default: @@ -275,13 +270,9 @@ void NEGEMMLowpMatrixMultiplyCore::run() } // Run GEMM - if(_asm_glue_unsigned.is_configured()) + if(_asm_glue.is_configured()) { - _asm_glue_unsigned.run(); - } - else if(_asm_glue_signed.is_configured()) - { - _asm_glue_signed.run(); + _asm_glue.run(); } else { @@ -311,18 +302,11 @@ void NEGEMMLowpMatrixMultiplyCore::prepare() if(!_is_prepared) { // Run assembly reshape - if((_asm_glue_signed.is_configured() || _asm_glue_signed.is_configured()) && _reshape_b_only_on_first_run) + if(_asm_glue.is_configured() && _reshape_b_only_on_first_run) { ARM_COMPUTE_ERROR_ON(!_original_b->is_used()); - if(_asm_glue_unsigned.is_configured()) - { - _asm_glue_unsigned.prepare(); - } - else if(_asm_glue_signed.is_configured()) - { - _asm_glue_signed.prepare(); - } + _asm_glue.prepare(); _original_b->mark_as_unused(); } // Run non-assembly reshape -- cgit v1.2.1
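
For context, here is a minimal usage sketch of the non-templated NEGEMMAssemblyDispatch API introduced by this patch. It is illustrative only and not part of the change: the gemm_example function and its tensor arguments are hypothetical, and the usual arm_compute tensor creation/allocation boilerplate is assumed to have happened elsewhere.

#include "arm_compute/core/ITensor.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"

using namespace arm_compute;

// Hypothetical caller: the dispatch object is no longer templated on the
// input/output element types; the data type is read from the tensors when
// configure() is called.
void gemm_example(const ITensor *a, const ITensor *b, ITensor *d)
{
    NEGEMMAssemblyDispatch asm_gemm;

    // validate() reports whether the data-type combination is supported.
    // configure() silently returns when it is not, so callers are expected
    // to check is_configured() before relying on the assembly path.
    if(bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), d->info(), 1.f, 0.f, false)))
    {
        asm_gemm.configure(a, b, d, 1.f, 0.f, false /* pretranspose_hint */);
    }

    if(asm_gemm.is_configured())
    {
        asm_gemm.prepare(); // e.g. pretranspose B once, if hinted
        asm_gemm.run();     // dispatches to an ACL function or the arm_gemm fallback
    }
}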