Diffstat (limited to 'src/runtime/heuristics')
28 files changed, 3532 insertions, 0 deletions
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp new file mode 100644 index 0000000000..aba32871d0 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include <utility> + +namespace arm_compute +{ +namespace cl_direct_conv +{ +using namespace arm_compute::misc::shape_calculator; + +ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) : IClDirectConvKernelConfig(gpu) +{ +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G71( + &ClDirectConvDefaultConfigBifrost::configure_G71_f32, &ClDirectConvDefaultConfigBifrost::configure_G71_f16, + &ClDirectConvDefaultConfigBifrost::configure_G71_u8); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_default( + &ClDirectConvDefaultConfigBifrost::configure_default_f32, + &ClDirectConvDefaultConfigBifrost::configure_default_f16, &ClDirectConvDefaultConfigBifrost::configure_G71_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G71: + func = configs_G71.get_function(src->data_type()); + break; + default: + func = configs_default.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for direct convolution"); + return (this->*func)(src, wei, conv_info); +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; 
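+    // A reading of the fields set below, inferred from how they are used in this patch:
+    // n0 tiles the output feature maps, m0 tiles the flattened output spatial size
+    // (W * H) and k0 sets how many values are accumulated per iteration along the
+    // weights' input-channel dimension.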
+ + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 2; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = false; + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 4; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = false; + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 4; + } + + desc.k0 = 16; + + desc.export_weights_to_cl_image = false; + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 2; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = export_to_cl_image(wei); + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + + desc.n0 = 4; + + if (output_shape[0] > 16) + { + desc.m0 = 4; + } + + desc.k0 = 8; + + desc.export_weights_to_cl_image = export_to_cl_image(wei); + } + + return desc; +} +} // namespace cl_direct_conv +} // namespace arm_compute diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h new file mode 100644 index 0000000000..ed6a4c3c68 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST +#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST + +#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" + +namespace arm_compute +{ +namespace cl_direct_conv +{ +/** Bifrost based OpenCL direct convolution configuration */ +class ClDirectConvDefaultConfigBifrost final : public IClDirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDirectConvDefaultConfigBifrost(GPUTarget gpu); + + // Inherited overridden method + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + +private: + DirectConvComputeKernelInfo + configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); +}; +} // namespace cl_direct_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp new file mode 100644 index 0000000000..4b7666d5aa --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp @@ -0,0 +1,413 @@ +/* + * Copyright (c) 2022 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include <utility> + +namespace arm_compute +{ +namespace cl_direct_conv +{ +using namespace arm_compute::misc::shape_calculator; + +ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu) : IClDirectConvKernelConfig(gpu) +{ +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G78( + &ClDirectConvDefaultConfigValhall::configure_G78_f32, &ClDirectConvDefaultConfigValhall::configure_G78_f16, + &ClDirectConvDefaultConfigValhall::configure_G78_u8); + + ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G57( + &ClDirectConvDefaultConfigValhall::configure_G57_f32, &ClDirectConvDefaultConfigValhall::configure_G57_f16, + &ClDirectConvDefaultConfigValhall::configure_G78_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G57: + func = configs_G57.get_function(src->data_type()); + break; + case GPUTarget::G78: + default: + func = configs_G78.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for direct convolution"); + return (this->*func)(src, wei, conv_info); +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + + const int32_t ofm = dst_shape[0]; + 
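+        // In NHWC the tensor shape is ordered [C, W, H, N]: ofm above is the
+        // number of output feature maps, and m below is the flattened output
+        // spatial size (W * H) that the m0 block size is applied to.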
const int32_t m = dst_shape[1] * dst_shape[2];
+        const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1;
+
+        desc.export_weights_to_cl_image = export_weights_to_cl_image;
+
+        if (dst_shape[0] <= 4)
+        {
+            if (is_pointwise)
+            {
+                if (ofm == 4)
+                {
+                    desc.m0 = 1;
+                    desc.n0 = 4;
+                    desc.k0 = 16;
+                }
+                else
+                {
+                    desc.m0 = 1;
+                    desc.n0 = 1;
+                    desc.k0 = 16;
+                }
+            }
+            else
+            {
+                desc.m0 = 1;
+                desc.n0 = 2;
+                desc.k0 = 16;
+            }
+        }
+        else
+        {
+            if (m < 64)
+            {
+                desc.m0 = 1;
+                desc.n0 = 1;
+                desc.k0 = 16;
+            }
+            else
+            {
+                desc.m0 = 4;
+                desc.n0 = 4;
+                desc.k0 = 4;
+            }
+        }
+    }
+
+    return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src,
+                                                                                const ITensorInfo *wei,
+                                                                                const PadStrideInfo &conv_info)
+{
+    DirectConvComputeKernelInfo desc;
+
+    if (src->data_layout() == DataLayout::NHWC)
+    {
+        // Get the output shape
+        const TensorShape wei_shape = wei->tensor_shape();
+        const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+        const bool export_weights_to_cl_image = export_to_cl_image(wei);
+
+        const int32_t ofm = dst_shape[0];
+        const int32_t m = dst_shape[1] * dst_shape[2];
+        const int32_t k = wei_shape[0];
+        const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1;
+
+        desc.export_weights_to_cl_image = export_weights_to_cl_image;
+
+        if (dst_shape[0] <= 4)
+        {
+            // k0 should be as large as possible. However, we should avoid
+            // leftover for-loop iterations that make the implementation slower.
+            if ((k % 16) == 0)
+            {
+                desc.k0 = 16;
+            }
+            else if ((k % 8) == 0)
+            {
+                desc.k0 = 8;
+            }
+            else
+            {
+                desc.k0 = 4;
+            }
+
+            if (is_pointwise)
+            {
+                if (ofm == 4)
+                {
+                    desc.m0 = 1;
+                    desc.n0 = 4;
+                }
+                else
+                {
+                    desc.m0 = 1;
+                    desc.n0 = 1;
+                }
+            }
+            else
+            {
+                desc.m0 = 1;
+                desc.n0 = dst_shape[0];
+            }
+        }
+        else
+        {
+            if (m < 64)
+            {
+                desc.m0 = 1;
+                desc.n0 = 1;
+                if ((k % 16) == 0)
+                {
+                    desc.k0 = 16;
+                }
+                else if ((k % 8) == 0)
+                {
+                    desc.k0 = 8;
+                }
+                else
+                {
+                    desc.k0 = 4;
+                }
+            }
+            else
+            {
+                if (ofm >= 16)
+                {
+                    if (m / 6 > 24000)
+                    {
+                        desc.m0 = 6;
+                    }
+                    else
+                    {
+                        desc.m0 = 5;
+                    }
+                    desc.n0 = 8;
+                    desc.k0 = 4;
+                }
+                else
+                {
+                    desc.m0 = 2;
+                    desc.n0 = 8;
+                    if ((k % 16) == 0)
+                    {
+                        desc.k0 = 16;
+                    }
+                    else if ((k % 8) == 0)
+                    {
+                        desc.k0 = 8;
+                    }
+                    else
+                    {
+                        desc.k0 = 4;
+                    }
+                }
+            }
+        }
+    }
+
+    return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src,
+                                                                               const ITensorInfo *wei,
+                                                                               const PadStrideInfo &conv_info)
+{
+    DirectConvComputeKernelInfo desc;
+
+    if (src->data_layout() == DataLayout::NHWC)
+    {
+        // Get the output shape
+        TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+
+        desc.n0 = 4;
+
+        if (output_shape[0] > 16)
+        {
+            desc.m0 = 4;
+        }
+
+        desc.k0 = 16;
+
+        desc.export_weights_to_cl_image = false;
+    }
+
+    return desc;
+}
+
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src,
+                                                                                const ITensorInfo *wei,
+                                                                                const PadStrideInfo &conv_info)
+{
+    DirectConvComputeKernelInfo desc;
+
+    if (src->data_layout() == DataLayout::NHWC)
+    {
+        // Get the output shape
+        const TensorShape wei_shape = wei->tensor_shape();
+        const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+        const bool export_weights_to_cl_image = export_to_cl_image(wei);
+
+        const int32_t m = dst_shape[1] * dst_shape[2];
+        const bool is_pointwise =
(wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (dst_shape[0] <= 4) + { + if (is_pointwise) + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 1; + desc.n0 = dst_shape[0]; + desc.k0 = 16; + } + } + else + { + if (m < 64) + { + if (m == 1) + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 4; + desc.n0 = 2; + desc.k0 = 8; + } + } + else + { + desc.m0 = 4; + desc.n0 = 4; + desc.k0 = 4; + } + } + } + + return desc; +} + +DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Get the output shape + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + + const int32_t ofm = dst_shape[0]; + const int32_t m = dst_shape[1] * dst_shape[2]; + const bool is_pointwise = (wei_shape[1] == wei_shape[2]) && wei_shape[1] == 1; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (dst_shape[0] <= 4) + { + if (is_pointwise) + { + desc.m0 = 2; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 1; + desc.n0 = dst_shape[0]; + desc.k0 = 16; + } + } + else + { + if (m < 64) + { + if (m == 1) + { + desc.m0 = 1; + desc.n0 = 1; + desc.k0 = 16; + } + else + { + desc.m0 = 4; + desc.n0 = 2; + desc.k0 = 8; + } + } + else + { + if (ofm > 16) + { + desc.m0 = 4; + desc.n0 = 8; + desc.k0 = 8; + } + else + { + desc.m0 = 8; + desc.n0 = 4; + desc.k0 = 4; + } + } + } + } + + return desc; +} +} // namespace cl_direct_conv +} // namespace arm_compute diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h new file mode 100644 index 0000000000..efd879a567 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL +#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL + +#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" + +namespace arm_compute +{ +namespace cl_direct_conv +{ +/** Valhall based OpenCL direct convolution configuration */ +class ClDirectConvDefaultConfigValhall final : public IClDirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDirectConvDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + +private: + DirectConvComputeKernelInfo + configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); +}; +} // namespace cl_direct_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h new file mode 100644 index 0000000000..215b17ef79 --- /dev/null +++ b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H
+
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h"
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h"
+#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_direct_conv
+{
+/** ClDirectConvolution factory class */
+class ClDirectConvKernelConfigurationFactory final
+{
+public:
+    /** Static method to call the ClDirectConvolution kernel configuration class according to the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return IClDirectConvKernelConfig
+     */
+    static std::unique_ptr<IClDirectConvKernelConfig> create(GPUTarget gpu)
+    {
+        switch (get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+                // The heuristic for Midgard is the same as the one used for Arm Mali-G71
+                return std::make_unique<ClDirectConvDefaultConfigBifrost>(GPUTarget::G71);
+            case GPUTarget::BIFROST:
+                return std::make_unique<ClDirectConvDefaultConfigBifrost>(gpu);
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+                return std::make_unique<ClDirectConvDefaultConfigValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+    }
+};
+} // namespace cl_direct_conv
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG_H
diff --git a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
new file mode 100644
index 0000000000..e5b270c720
--- /dev/null
+++ b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG +#define SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" + +namespace arm_compute +{ +namespace cl_direct_conv +{ +/** Basic container for the OpenCL direct convolution configuration functions */ +template <class T> +class ClDirectConvConfigArray +{ +public: + /** Alias for F32 index */ + static constexpr size_t DT_F32 = 0; + /** Alias for F16 index */ + static constexpr size_t DT_F16 = 1; + /** Alias for Int8 index */ + static constexpr size_t DT_INT8 = 2; + + /** Constructor + * + * @param[in] func_f32 Function to call for direct convolution F32 + * @param[in] func_f16 Function to call for direct convolution F16 + * @param[in] func_int8 Function to call for direct convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) + * + */ + ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} + { + } + + /** Method to return the direct convolution configuration function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + return _configs.at(DT_F32); + case DataType::F16: + return _configs.at(DT_F16); + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return _configs.at(DT_INT8); + default: + return nullptr; + } + } + +private: + std::array<T, 3> _configs; +}; + +/** Basic interface for the Direct convolution kernel configuration */ +class IClDirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClDirectConvKernelConfig(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDirectConvKernelConfig); + /** Virtual destructor */ + virtual ~IClDirectConvKernelConfig() = default; + /** This method returns the @ref DirectConvComputeKernelInfo for the given inputs + * + * @param[in] src Source tensor (activation tensor) + * @param[in] wei Weights tensor + * @param[in] conv_info Convolution info + */ + virtual DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; + +protected: + GPUTarget _target; +}; +} // namespace cl_direct_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp new file mode 100644 index 0000000000..98ebf3ebbe --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" + +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +namespace +{ +DWCComputeKernelInfo configure_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + bool is_g71) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + + if (is_g71) + { + desc.export_weights_to_cl_image = false; + } + else + { + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + } + + if (depth_multiplier == 1) + { + desc.n0 = 4; + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo configure_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier, + bool is_g71) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Src and weights have the same dimension indices + const size_t idx_c = 
get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape src_shape = src->tensor_shape(); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t src_w = src_shape[idx_w]; + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + + if (is_g71) + { + desc.export_weights_to_cl_image = false; + } + else + { + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + } + + if (depth_multiplier == 1) + { + if (desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + if ((src_w % 5) == 0) + { + desc.m0 = 5; + } + else + { + desc.m0 = 4; + } + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace + +ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu) +{ +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71( + &ClDWCNativeDefaultConfigBifrost::configure_G71_f32, &ClDWCNativeDefaultConfigBifrost::configure_G71_f16, + &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x( + &ClDWCNativeDefaultConfigBifrost::configure_G7x_f32, &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16, + &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G71: + func = configs_G71.get_function(src->data_type()); + break; + default: + func = configs_G7x.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution"); + return (this->*func)(src, wei, conv_info, dilation, depth_multiplier); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f16(src, wei, conv_info, 
dilation, depth_multiplier, true); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + ARM_COMPUTE_UNUSED(wei); + + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = false; + desc.n0 = (depth_multiplier == 1) ? 4 : 1; + if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + { + desc.m0 = 2; + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace cl_dwc +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h new file mode 100644 index 0000000000..41d86c9c14 --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST + +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Bifrost based OpenCL depthwise convolution configuration */ +class ClDWCNativeDefaultConfigBifrost final : public IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDWCNativeDefaultConfigBifrost(GPUTarget gpu); + + // Inherited overridden method + DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) override; + +private: + DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGBIFROST */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp new file mode 100644 index 0000000000..ef1bb3858c --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/helpers/AdjustVecSize.h" + +#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu) +{ +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation, + unsigned int depth_multiplier); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78( + &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G78_f16, + &ClDWCNativeDefaultConfigValhall::configure_G78_u8); + + ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77( + &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G77_f16, + &ClDWCNativeDefaultConfigValhall::configure_G78_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G77: + func = configs_G77.get_function(src->data_type()); + break; + case GPUTarget::G78: + default: + func = configs_G78.get_function(src->data_type()); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for depthwise convolution"); + return (this->*func)(src, wei, conv_info, dilation, depth_multiplier); +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if (depth_multiplier == 1) + { + desc.n0 = 4; + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, + const ITensorInfo 
*wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + // Src and weights have the same dimension indices + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape src_shape = src->tensor_shape(); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t src_w = src_shape[idx_w]; + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if (depth_multiplier == 1) + { + if (desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + if ((src_w % 5) == 0) + { + desc.m0 = 5; + } + else + { + desc.m0 = 4; + } + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + ARM_COMPUTE_UNUSED(wei); + + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = false; + desc.n0 = (depth_multiplier == 1) ? 
4 : 1; + if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1) + { + desc.m0 = 2; + } + else + { + desc.m0 = 1; + } + } + + return desc; +} + +DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) +{ + DWCComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const size_t idx_c = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL); + const size_t idx_w = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH); + const TensorShape wei_shape = wei->tensor_shape(); + const size_t kernel_c = wei_shape[idx_c]; + const size_t kernel_w = wei_shape[idx_w]; + + desc.export_input_to_cl_image = false; + desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier); + + if (depth_multiplier == 1) + { + if (desc.export_weights_to_cl_image == false) + { + desc.n0 = 8; + } + else + { + desc.n0 = 4; + } + } + else + { + if ((depth_multiplier % 4) == 0) + { + desc.n0 = 4; + } + else if ((depth_multiplier % 2) == 0) + { + desc.n0 = 2; + } + else + { + desc.n0 = 1; + } + } + + // Note: If we reduce n0, export to cl_image must be false + ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && + (desc.export_weights_to_cl_image == true)); + + desc.n0 = adjust_vec_size(desc.n0, kernel_c); + + // Set m0 only if stride_x == 1 and dilation_x == 1 + if (conv_info.stride().first == 1 && dilation.x() == 1) + { + if ((kernel_w >= 9) || (kernel_w == 1)) + { + desc.m0 = 1; + } + else + { + desc.m0 = 2; + } + } + else + { + desc.m0 = 1; + } + } + + return desc; +} +} // namespace cl_dwc +} // namespace arm_compute diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h new file mode 100644 index 0000000000..fabce77b54 --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL + +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Valhall based OpenCL depthwise convolution configuration */ +class ClDWCNativeDefaultConfigValhall final : public IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClDWCNativeDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) override; + +private: + DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); + DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier); +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp new file mode 100644 index 0000000000..c8b006c546 --- /dev/null +++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier) +{ + // Check whether we can use the cl image with the weights. 
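+    // export_to_cl_image() is expected to verify that the weights can be mapped
+    // to an OpenCL image at all (device support for cl_image and a compatible
+    // tensor shape/alignment); the conditions below then decide whether doing
+    // so is actually profitable.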
+    if (!export_to_cl_image(weights))
+    {
+        return false;
+    }
+
+    const size_t idx_w    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t kernel_w = weights->tensor_shape()[idx_w];
+    const size_t kernel_h = weights->tensor_shape()[idx_h];
+
+    // Even when the weights could be exported to a cl image, we prefer the cl buffer storage
+    // in the following cases for performance reasons:
+    // 1- When the kernel size is 1x1
+    // 2- When the depth multiplier is greater than 1 and not a multiple of 4.
+    if ((kernel_w == 1) && (kernel_h == 1))
+    {
+        return false;
+    }
+
+    if ((depth_multiplier > 1) && (depth_multiplier % 4) != 0)
+    {
+        return false;
+    }
+
+    return true;
+}
+} // namespace cl_dwc
+} // namespace arm_compute
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h
new file mode 100644
index 0000000000..e3484c04ff
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS
+#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensorInfo;
+
+namespace cl_dwc
+{
+/** Utility function to determine whether we can use the cl image storage for the weights of depthwise convolution to get better performance
+ *
+ * @param[in] weights          Weights TensorInfo of the depthwise convolution
+ * @param[in] depth_multiplier Depth multiplier
+ *
+ * @return true if the weights of depthwise convolution can be kept in the cl image storage to improve performance
+ */
+bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier);
+
+} // namespace cl_dwc
+} // namespace arm_compute
+#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEHEURISTICSHELPERS */
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
new file mode 100644
index 0000000000..031cf1859a
--- /dev/null
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H + +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h" +#include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h" +#include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h" + +#include <memory> + +namespace arm_compute +{ +namespace cl_dwc +{ +/** ClDWCNativeKernelConfigurationFactory factory class */ +class ClDWCNativeKernelConfigurationFactory final +{ +public: + /** Static method to call the ClDWCNative kernel configuration class accordingly with the GPU target + * + * @param[in] gpu GPU target + * + * @return IClDWCNativeKernelConfig + */ + static std::unique_ptr<IClDWCNativeKernelConfig> create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + // The heuristic for Midgard is the same as the one used for Arm Mali-G71 + return std::make_unique<ClDWCNativeDefaultConfigBifrost>(GPUTarget::G71); + case GPUTarget::BIFROST: + return std::make_unique<ClDWCNativeDefaultConfigBifrost>(gpu); + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique<ClDWCNativeDefaultConfigValhall>(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_DWC_NATIVE_CLDWCNATIVEKERNELCONFIG_H diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h new file mode 100644 index 0000000000..614a6622df --- /dev/null +++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2022 Arm Limited. 
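The factory is the intended entry point for callers. A plausible call site, assuming this is compiled inside the library tree (the pick_dwc_config helper and its name are illustrative, not part of this patch):

    #include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"

    // Illustrative helper (not from this patch): resolve the per-architecture
    // heuristic once, then ask it for a per-shape kernel configuration.
    arm_compute::DWCComputeKernelInfo pick_dwc_config(arm_compute::GPUTarget gpu,
                                                      const arm_compute::ITensorInfo *src,
                                                      const arm_compute::ITensorInfo *wei,
                                                      const arm_compute::PadStrideInfo &conv_info,
                                                      const arm_compute::Size2D &dilation,
                                                      unsigned int depth_multiplier)
    {
        // Midgard devices are routed by the factory to the G71 (Bifrost) heuristic.
        auto heuristic = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu);
        return heuristic->configure(src, wei, conv_info, dilation, depth_multiplier);
    }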
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG +#define SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" + +namespace arm_compute +{ +namespace cl_dwc +{ +/** Basic container for the OpenCL depthwise convolution configuration functions */ +template <class T> +class ClDWCNativeConfigArray +{ +public: + /** Alias for F32 index */ + static constexpr size_t DT_F32 = 0; + /** Alias for F16 index */ + static constexpr size_t DT_F16 = 1; + /** Alias for Int8 index */ + static constexpr size_t DT_INT8 = 2; + + /** Constructor + * + * @param[in] func_f32 Function to call for depthwise convolution F32 + * @param[in] func_f16 Function to call for depthwise convolution F16 + * @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) + * + */ + ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} + { + } + + /** Method to return the depthwise convolution configuration function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + return _configs.at(DT_F32); + case DataType::F16: + return _configs.at(DT_F16); + case DataType::QASYMM8: + case DataType::QASYMM8_SIGNED: + case DataType::QSYMM8_PER_CHANNEL: + return _configs.at(DT_INT8); + default: + return nullptr; + } + } + +private: + std::array<T, 3> _configs; +}; + +/** Basic interface for the depthwise convolution kernel configuration */ +class IClDWCNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClDWCNativeKernelConfig(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig); + /** Virtual destructor */ + virtual ~IClDWCNativeKernelConfig() = default; + /** This method returns the @ref DWCComputeKernelInfo for the given inputs + * + * @param[in] src Source tensor (activation tensor) + * @param[in] wei Weights tensor + * @param[in] conv_info Convolution info + * @param[in] dilation Kernel dilation + * @param[in] depth_multiplier Output feature maps 
multiplier + */ + virtual DWCComputeKernelInfo configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info, + const Size2D &dilation, + unsigned int depth_multiplier) = 0; + +protected: + GPUTarget _target; +}; +} // namespace cl_dwc +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_DWC_NATIVE_ICLDWCNATIVEKERNELCONFIG */ diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp new file mode 100644 index 0000000000..3380d8f1b7 --- /dev/null +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +namespace arm_compute +{ +namespace cl_indirect_conv +{ +using namespace arm_compute::misc::shape_calculator; + +ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu) : IClIndirectConvKernelConfig(gpu) +{ +} + +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)( + const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + + ClIndirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G77( + &ClIndirectConvDefaultConfigValhall::configure_G77_f32, &ClIndirectConvDefaultConfigValhall::configure_G77_f16); + + // Important note: Indirect convolution should not be used when the kernel size is 1x1 (pointwise). The reason is that the indirect buffer makes + indirect convolution less efficient than direct convolution or gemm. For this reason, the heuristic of indirect convolution has not been tuned + for the pointwise convolution cases.
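configure() above builds a small table of pointers to member functions and later invokes the selected entry through this. The idiom in isolation, as a self-contained toy (class, names and numbers invented):

    #include <cstdio>

    class Heuristic
    {
    public:
        // Same shape as ConfigurationFunctionExecutorPtr: a pointer to a member function.
        using Fn = int (Heuristic::*)(int);

        int dispatch(bool use_f16, int x)
        {
            Fn func = use_f16 ? &Heuristic::for_f16 : &Heuristic::for_f32;
            return (this->*func)(x); // same call syntax as (this->*func)(src, wei, conv_info)
        }

    private:
        int for_f32(int x) { return x * 4; }
        int for_f16(int x) { return x * 8; }
    };

    int main()
    {
        Heuristic h;
        std::printf("%d %d\n", h.dispatch(false, 2), h.dispatch(true, 2)); // prints: 8 16
    }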
+ + ConfigurationFunctionExecutorPtr func = configs_G77.get_function(src->data_type()); + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for indirect convolution"); + return (this->*func)(src, wei, conv_info); +} + +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + const int32_t stride_x = conv_info.stride().first; + const int32_t stride_y = conv_info.stride().second; + const int32_t ofm = dst_shape[0]; + const int32_t m = (dst_shape[1] / stride_x) * (dst_shape[2] / stride_y); + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (ofm <= 4) + { + desc.m0 = 1; + desc.n0 = 2; + desc.k0 = 16; + } + else + { + // The 16000 threshold value has been identified as the right + // one for using the biggest block size allowed on F32: 5x4x4 + if (m < 16000) + { + desc.m0 = 4; + desc.n0 = 4; + desc.k0 = 4; + } + else + { + desc.m0 = 5; + desc.n0 = 4; + desc.k0 = 4; + } + } + } + + return desc; +} + +DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, + const ITensorInfo *wei, + const PadStrideInfo &conv_info) +{ + DirectConvComputeKernelInfo desc; + + if (src->data_layout() == DataLayout::NHWC) + { + const TensorShape wei_shape = wei->tensor_shape(); + const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info); + const bool export_weights_to_cl_image = export_to_cl_image(wei); + + const int32_t ofm = dst_shape[0]; + const int32_t m = dst_shape[1] * dst_shape[2]; + const int32_t k = wei_shape[0]; + + desc.export_weights_to_cl_image = export_weights_to_cl_image; + + if (ofm <= 4) + { + // k0 should be as large as possible. However, we should avoid + // having left-over for loops that make the implementation slower. + if ((k % 16) == 0) + { + desc.k0 = 16; + } + else if ((k % 8) == 0) + { + desc.k0 = 8; + } + else + { + desc.k0 = 4; + } + + desc.m0 = 1; + desc.n0 = ofm; + } + else + { + // The 16000 threshold value has been identified as the right + // one for using the biggest block size allowed on F16: 8x4 + if (m >= 16000 && k < 4) + { + desc.m0 = 8; + desc.n0 = 4; + desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4 + } + else + { + desc.m0 = 5; + desc.n0 = 4; + desc.k0 = 8; + } + } + } + + return desc; +} +} // namespace cl_indirect_conv +} // namespace arm_compute diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h new file mode 100644 index 0000000000..bab808c66c --- /dev/null +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022 Arm Limited.
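In configure_G77_f16 above, when the output channel count is small (ofm <= 4), k0 is picked as the largest of 16, 8 and 4 that divides k (the innermost weights dimension, i.e. the accumulation depth), so the inner loop has no leftover iterations. The same choice as a minimal standalone function (sample k values invented):

    #include <cstdint>
    #include <cstdio>

    // Mirrors the k0 choice of configure_G77_f16 in the ofm <= 4 branch.
    int32_t choose_k0(int32_t k)
    {
        if ((k % 16) == 0)
        {
            return 16;
        }
        if ((k % 8) == 0)
        {
            return 8;
        }
        return 4;
    }

    int main()
    {
        std::printf("%d\n", choose_k0(144)); // 16: 144 = 9 * 16
        std::printf("%d\n", choose_k0(40));  // 8:  40 is a multiple of 8 but not of 16
        std::printf("%d\n", choose_k0(18));  // 4:  leftovers handled with the smallest k0
    }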
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL +#define SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL + +#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" + +namespace arm_compute +{ +namespace cl_indirect_conv +{ +/** Valhall based OpenCL indirect convolution configuration */ +class ClIndirectConvDefaultConfigValhall final : public IClIndirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClIndirectConvDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override; + +private: + DirectConvComputeKernelInfo + configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); + DirectConvComputeKernelInfo + configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info); +}; +} // namespace cl_indirect_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVDEFAULTCONFIGVALHALL */ diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h new file mode 100644 index 0000000000..5e7ba6f8e9 --- /dev/null +++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H + +#include "src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h" +#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" + +#include <memory> + +namespace arm_compute +{ +namespace cl_indirect_conv +{ +/** ClIndirectConvolution factory class */ +class ClIndirectConvKernelConfigurationFactory final +{ +public: + /** Static method to call the ClIndirectConvolution kernel configuration class accordingly with the GPU target + * + * @param[in] gpu GPU target + * + * @return IClIndirectConvKernelConfig + */ + static std::unique_ptr<IClIndirectConvKernelConfig> create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + case GPUTarget::BIFROST: + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique<ClIndirectConvDefaultConfigValhall>(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_indirect_conv +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_CLINDIRECTCONVKERNELCONFIG_H diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h new file mode 100644 index 0000000000..d05da18b58 --- /dev/null +++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG +#define SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG + +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Types.h" + +#include "src/core/common/Macros.h" + +namespace arm_compute +{ +namespace cl_indirect_conv +{ +/** Basic container for the OpenCL indirect convolution configuration functions */ +template <class T> +class ClIndirectConvConfigArray +{ +public: + /** Alias for F32 index */ + static constexpr size_t DT_F32 = 0; + /** Alias for F16 index */ + static constexpr size_t DT_F16 = 1; + + /** Constructor + * + * @param[in] func_f32 Function to call for indirect convolution F32 + * @param[in] func_f16 Function to call for indirect convolution F16 + * + */ + ClIndirectConvConfigArray(T func_f32, T func_f16) : _configs{func_f32, func_f16} + { + } + + /** Method to return the indirect convolution configuration function based on data type + * + * @param[in] data_type Input data type + * + * @return the valid function otherwise it returns nullptr if the data type is not valid + */ + T get_function(DataType data_type) + { + switch (data_type) + { + case DataType::F32: + return _configs.at(DT_F32); + case DataType::F16: + return _configs.at(DT_F16); + default: + return nullptr; + } + } + +private: + std::array<T, 2> _configs; +}; + +/** Basic interface for the indirect convolution kernel configuration */ +class IClIndirectConvKernelConfig +{ +public: + /** Constructor + * + * @param[in] arch GPU target + */ + IClIndirectConvKernelConfig(GPUTarget arch) : _target(arch) + { + } + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClIndirectConvKernelConfig); + /** Virtual destructor */ + virtual ~IClIndirectConvKernelConfig() = default; + /** This method returns the @ref DirectConvComputeKernelInfo for the given inputs + * + * @param[in] src Source tensor (activation tensor) + * @param[in] wei Weights tensor + * @param[in] conv_info Convolution info + */ + virtual DirectConvComputeKernelInfo + configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0; + +protected: + GPUTarget _target; +}; +} // namespace cl_indirect_conv +} // namespace arm_compute +#endif /* SRC_RUNTIME_HEURISTICS_INDIRECT_CONV_ICLINDIRECTCONVKERNELCONFIG */ diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp new file mode 100644 index 0000000000..3a02a60650 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp @@ -0,0 +1,314 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
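One detail worth flagging: unlike its depthwise and matmul counterparts, ClIndirectConvConfigArray has only F32 and F16 slots, so quantized data types fall through to nullptr and trip the ARM_COMPUTE_ERROR_ON_MSG in configure(). A toy reduction of that lookup (the enum and table contents are invented):

    #include <array>
    #include <cstdio>

    enum class DataType { F32, F16, QASYMM8 }; // invented, reduced stand-in

    template <class T>
    struct ConfigArray2 // same shape as ClIndirectConvConfigArray: F32 and F16 only
    {
        std::array<T, 2> configs;

        T get_function(DataType data_type)
        {
            switch (data_type)
            {
                case DataType::F32: return configs[0];
                case DataType::F16: return configs[1];
                default:            return nullptr; // quantized types have no slot here
            }
        }
    };

    int main()
    {
        ConfigArray2<const char *> table{{"f32_cfg", "f16_cfg"}};
        const char *fn = table.get_function(DataType::QASYMM8);
        std::printf("%s\n", fn ? fn : "nullptr -> configure() raises an error"); // prints the nullptr branch
    }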
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/TensorInfo.h" + +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h" + +#include <utility> + +namespace arm_compute +{ +namespace cl_matmul +{ +ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu) : IClMatMulNativeKernelConfig(gpu) +{ +} + +MatMulKernelInfo +ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) +{ + using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + + ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G710( + &ClMatMulNativeDefaultConfigValhall::configure_G710_f32, + &ClMatMulNativeDefaultConfigValhall::configure_G710_f16, + &ClMatMulNativeDefaultConfigValhall::configure_G710_u8); + + ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G715( + &ClMatMulNativeDefaultConfigValhall::configure_G715_f32, + &ClMatMulNativeDefaultConfigValhall::configure_G715_f16, + &ClMatMulNativeDefaultConfigValhall::configure_G715_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G715: + case GPUTarget::G615: + func = configs_G715.get_function(lhs->data_type()); + break; + case GPUTarget::G710: + default: + func = configs_G710.get_function(lhs->data_type()); + break; + } + + const bool adj_lhs = info.adj_lhs(); + const bool adj_rhs = info.adj_rhs(); + + TensorShape lhs_shape = lhs->tensor_shape(); + TensorShape rhs_shape = rhs->tensor_shape(); + + const bool is_batched = lhs_shape.num_dimensions() > 2; + + if (is_batched == true) + { + lhs_shape.collapse_from(2); + } + + const unsigned int m = adj_lhs ? lhs_shape.x() : lhs_shape.y(); + const unsigned int n = adj_rhs ? rhs_shape.y() : rhs_shape.x(); + const unsigned int k = adj_lhs ? 
lhs_shape.y() : lhs_shape.x(); + const unsigned int b = lhs_shape.z(); + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for matmul native"); + return (this->*func)(m, n, k, b, rhs->lock_paddings(), info); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding); + return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 1, /* n0 */ 4, /* k0 */ 1, /* export_to_cl_image */ false}; +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + return configure_G715_f32(m, n, k, b, rhs_lock_padding, info); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding); + return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 4, /* n0 */ 16, /* k0 */ 4, /* export_to_cl_image */ false}; +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1}, + {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 4, 16, 1}, + {1568, 64, 40, 36, 2, 8, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 5, 4, 4, 0}, + {24, 464, 412, 24, 6, 2, 8, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 2, 2, 16, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 6, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = { + {3136, 64, 64, 36, 5, 4, 4, 0}, {4096, 48, 32, 36, 5, 4, 4, 0}, {688, 92, 68, 32, 5, 4, 4, 0}, + {24, 464, 412, 24, 6, 2, 4, 0}, {112, 184, 144, 28, 5, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0}, + {1568, 64, 40, 36, 5, 4, 4, 0}, {2920, 64, 64, 24, 6, 2, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 2, 8, 4, 1}, + {24, 464, 412, 24, 2, 8, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 8, 8, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 
2, 2, 16, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 2, 8, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const bool adj_lhs = info.adj_lhs(); + const bool adj_rhs = info.adj_rhs(); + + const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr; + const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr; + + if ((adj_lhs == false) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_nt_nt; + configs_fallback_to_use = &configs_mnkb_fallback_nt_nt; + } + else if ((adj_lhs == false) && (adj_rhs == true)) + { + configs_best_to_use = &configs_mnkb_best_nt_t; + configs_fallback_to_use = &configs_mnkb_fallback_nt_t; + } + else if ((adj_lhs == true) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_t_nt; + configs_fallback_to_use = &configs_mnkb_fallback_t_nt; + } + else + { + configs_best_to_use = &configs_mnkb_best_t_t; + configs_fallback_to_use = &configs_mnkb_fallback_t_t; + } + + MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b); + MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b); + + return select_info(desc0, desc1, m, n, k, b, DataType::F32, rhs_lock_padding); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 16, 1}, + {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 4, 4, 8, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = { + {3136, 64, 64, 36, 6, 4, 8, 0}, {4096, 48, 32, 36, 6, 4, 8, 0}, {688, 92, 68, 32, 6, 4, 8, 0}, + {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 6, 4, 8, 0}, {5776, 64, 32, 36, 6, 4, 8, 0}, + {1568, 64, 40, 36, 6, 4, 8, 0}, {2920, 64, 64, 24, 6, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 6, 4, 8, 1}, {4096, 48, 32, 36, 6, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 6, 2, 4, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 6, 4, 8, 1}, + {1568, 64, 40, 36, 6, 4, 8, 1}, {2920, 64, 64, 24, 6, 4, 8, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = { + {3136, 64, 64, 36, 6, 2, 16, 0}, {4096, 48, 32, 36, 5, 4, 8, 0}, {688, 92, 68, 32, 6, 2, 16, 0}, + {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 6, 2, 16, 0}, {5776, 64, 32, 36, 5, 4, 8, 0}, + {1568, 64, 40, 36, 5, 4, 8, 0}, {2920, 64, 64, 24, 6, 2, 16, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 4, 4, 4, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1}, + {1568, 64, 40, 36, 4, 4, 4, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = { + {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 
0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1}, {688, 92, 68, 32, 4, 4, 4, 1}, + {24, 464, 412, 24, 4, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 16, 1}, + {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 16, 1}}; + + const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 8, 0}, + {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const bool adj_lhs = info.adj_lhs(); + const bool adj_rhs = info.adj_rhs(); + + const MatMulNativeConfigsMatrix *configs_best_to_use = nullptr; + const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr; + + if ((adj_lhs == false) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_nt_nt; + configs_fallback_to_use = &configs_mnkb_fallback_nt_nt; + } + else if ((adj_lhs == false) && (adj_rhs == true)) + { + configs_best_to_use = &configs_mnkb_best_nt_t; + configs_fallback_to_use = &configs_mnkb_fallback_nt_t; + } + else if ((adj_lhs == true) && (adj_rhs == false)) + { + configs_best_to_use = &configs_mnkb_best_t_nt; + configs_fallback_to_use = &configs_mnkb_fallback_t_nt; + } + else + { + configs_best_to_use = &configs_mnkb_best_t_t; + configs_fallback_to_use = &configs_mnkb_fallback_t_t; + } + + MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b); + MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b); + + return select_info(desc0, desc1, m, n, k, b, DataType::F16, rhs_lock_padding); +} + +MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info) +{ + ARM_COMPUTE_UNUSED(rhs_lock_padding); + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = { + {3136, 64, 64, 36, 6, 4, 4, 0}, {4096, 48, 32, 36, 6, 4, 4, 0}, {688, 92, 68, 32, 2, 8, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 6, 4, 4, 0}, + {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 5, 4, 4, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = { + {3136, 64, 64, 36, 4, 4, 16, 0}, {4096, 48, 32, 36, 4, 4, 16, 0}, {688, 92, 68, 32, 4, 4, 16, 0}, + {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 4, 4, 16, 0}, {5776, 64, 32, 36, 4, 4, 16, 0}, + {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 16, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = { + {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0}, {688, 92, 68, 32, 4, 4, 4, 0}, + {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}}; + + const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = { + {3136, 64, 64, 36, 4, 2, 16, 0}, {4096, 48, 32, 36, 4, 4, 4, 0}, {688, 92, 68, 32, 4, 4, 8, 0}, + {24, 464, 412, 24, 4, 2, 16, 0}, {112, 184, 144, 28, 4, 2, 16, 0}, {5776, 64, 32, 36, 4, 4, 4, 0}, + {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 2, 16, 0}}; + + const bool adj_lhs = info.adj_lhs(); + 
const bool adj_rhs = info.adj_rhs(); + + if ((adj_lhs == false) && (adj_rhs == false)) + { + return find_info(configs_mnkb_best_nt_nt, adj_lhs, adj_rhs, m, n, k, b); + } + else if ((adj_lhs == false) && (adj_rhs == true)) + { + return find_info(configs_mnkb_best_nt_t, adj_lhs, adj_rhs, m, n, k, b); + } + else if ((adj_lhs == true) && (adj_rhs == false)) + { + return find_info(configs_mnkb_best_t_nt, adj_lhs, adj_rhs, m, n, k, b); + } + else + { + return find_info(configs_mnkb_best_t_t, adj_lhs, adj_rhs, m, n, k, b); + } +} +} // namespace cl_matmul +} // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h new file mode 100644 index 0000000000..5279871057 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
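Before dispatching, configure() above folds any batch dimensions beyond the second into z() via collapse_from(2) and reads M, N and K off different axes depending on the adjoint flags. A worked example with made-up shapes:

    #include <cstdio>

    int main()
    {
        // ACL TensorShape order is (x, y, z, ...) with x innermost; a non-transposed
        // LHS is stored as (K, M, B), as in select_info() further down in this patch.
        const unsigned int lhs_x = 32, lhs_y = 64, lhs_z = 2; // K = 32, M = 64, B = 2
        const unsigned int rhs_x = 48;                        // N = 48 when adj_rhs == false

        const bool adj_lhs = false;

        const unsigned int m = adj_lhs ? lhs_x : lhs_y; // 64
        const unsigned int n = rhs_x;                   // 48 (rhs_shape.y() would be used when adj_rhs == true)
        const unsigned int k = adj_lhs ? lhs_y : lhs_x; // 32
        const unsigned int b = lhs_z;                   // 2 (after collapse_from(2) for higher-rank batches)

        std::printf("m=%u n=%u k=%u b=%u\n", m, n, k, b); // m=64 n=48 k=32 b=2
    }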
+ */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H + +#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +/** Valhall based OpenCL matmul configuration */ +class ClMatMulNativeDefaultConfigValhall final : public IClMatMulNativeKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClMatMulNativeDefaultConfigValhall(GPUTarget gpu); + + // Inherited overridden method + MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) override; + +private: + MatMulKernelInfo configure_G710_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G710_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); + MatMulKernelInfo configure_G715_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info); +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp new file mode 100644 index 0000000000..3878f698fd --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +ClMatMulNativeDefaultVariantValhall::ClMatMulNativeDefaultVariantValhall(GPUTarget gpu) + : IClMatMulNativeKernelVariant(gpu) +{ +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::select_kernel(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulInfo &info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(rhs); + + using VariantFunctionExecutorPtr = + MatMulKernelType (ClMatMulNativeDefaultVariantValhall::*)(int k, bool act_enabled); + + ClMatMulNativeVariantArray<VariantFunctionExecutorPtr> configs_G715( + &ClMatMulNativeDefaultVariantValhall::configure_G715_float, + &ClMatMulNativeDefaultVariantValhall::configure_G715_quantized); + + ClMatMulNativeVariantArray<VariantFunctionExecutorPtr> configs_default( + &ClMatMulNativeDefaultVariantValhall::configure_default_float, + &ClMatMulNativeDefaultVariantValhall::configure_default_quantized); + + VariantFunctionExecutorPtr func = nullptr; + switch (_target) + { + case GPUTarget::G715: + case GPUTarget::G615: + func = configs_G715.get_function(lhs->data_type()); + break; + default: + func = configs_default.get_function(lhs->data_type()); + break; + } + + const int k = info.adj_lhs() ? lhs->tensor_shape().y() : lhs->tensor_shape().x(); + const bool act_enabled = act_info.enabled(); + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for matmul native"); + return (this->*func)(k, act_enabled); +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_float(int k, bool act_enabled) +{ + // MMUL kernel works only when K is a multiple of 4 + if (!act_enabled && k % 4 == 0) + { + return MatMulKernelType::NATIVE_MMUL_FP; + } + + return MatMulKernelType::NATIVE_FP; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_G715_quantized(int k, bool act_enabled) +{ + // MMUL kernel works only when K is a multiple of 16 + if (!act_enabled && k % 16 == 0) + { + return MatMulKernelType::NATIVE_MMUL_QUANTIZED; + } + + return MatMulKernelType::NATIVE_QUANTIZED; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_float(int k, bool act_enabled) +{ + ARM_COMPUTE_UNUSED(k, act_enabled); + + return MatMulKernelType::NATIVE_FP; +} + +MatMulKernelType ClMatMulNativeDefaultVariantValhall::configure_default_quantized(int k, bool act_enabled) +{ + ARM_COMPUTE_UNUSED(k, act_enabled); + + return MatMulKernelType::NATIVE_QUANTIZED; +} + +} // namespace cl_matmul +} // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h new file mode 100644 index 0000000000..a202676e98 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H + +#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h" + +namespace arm_compute +{ +namespace cl_matmul +{ +/** Valhall based OpenCL matmul configuration */ +class ClMatMulNativeDefaultVariantValhall final : public IClMatMulNativeKernelVariant +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClMatMulNativeDefaultVariantValhall(GPUTarget gpu); + + // Inherited overridden method + MatMulKernelType select_kernel(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const MatMulInfo &info, + const ActivationLayerInfo &act_info) override; + +private: + MatMulKernelType configure_G715_float(int k, bool act_enabled); + MatMulKernelType configure_G715_quantized(int k, bool act_enabled); + MatMulKernelType configure_default_float(int k, bool act_enabled); + MatMulKernelType configure_default_quantized(int k, bool act_enabled); +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTVARIANTVALHALL_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp new file mode 100644 index 0000000000..89cad30214 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h" + +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" + +#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" + +#include <limits> +#include <utility> + +namespace arm_compute +{ +namespace cl_matmul +{ +MatMulKernelInfo select_info(const MatMulKernelInfo &info0, + const MatMulKernelInfo &info1, + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type, + bool rhs_lock_padding) +{ + ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true, + "The fallback MatMul configuration cannot have export_to_cl_image = true"); + ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs, + "The MatMul configurations must have the same adj_lhs value"); + ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs, + "The MatMul configurations must have the same adj_rhs value"); + + const bool adj_lhs = info0.adj_lhs; + const bool adj_rhs = info0.adj_rhs; + + TensorInfo lhs_info = + !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type); + TensorInfo rhs_info = + !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type); + TensorInfo dst_info; + + if (rhs_lock_padding == false) + { + if (bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0))) + { + return info0; + } + else + { + return info1; + } + } + else + { + return info1; + } +} + +MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, + bool adj_lhs, + bool adj_rhs, + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b) +{ + size_t min_acc = std::numeric_limits<size_t>::max(); + size_t min_idx = 0; + + ARM_COMPUTE_ERROR_ON(configs.size() == 0); + const size_t num_rows = configs.size(); + const size_t num_cols = configs[0].size(); + + ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U, + "Each entry should have 8 integer values representing: M, N, K, B, M0, N0,
K0, IMG_RHS"); + ARM_COMPUTE_UNUSED(num_cols); + + // Find nearest GeMM workload + // Note: the workload does not depend on the K dimension + for (size_t y = 0; y < num_rows; ++y) + { + size_t mc0 = static_cast<size_t>(configs[y][0]); + size_t nc0 = static_cast<size_t>(configs[y][1]); + size_t kc0 = static_cast<size_t>(configs[y][2]); + size_t bc0 = static_cast<size_t>(configs[y][3]); + + size_t acc = 0; + acc += (m - mc0) * (m - mc0); + acc += (n - nc0) * (n - nc0); + acc += (k - kc0) * (k - kc0); + acc += (b - bc0) * (b - bc0); + acc = std::sqrt(acc); + if (acc < min_acc) + { + min_acc = acc; + min_idx = y; + } + } + + // Get the configuration from the nearest GeMM shape + MatMulKernelInfo desc; + desc.adj_lhs = adj_lhs; + desc.adj_rhs = adj_rhs; + desc.m0 = configs[min_idx][4]; + desc.n0 = configs[min_idx][5]; + desc.k0 = configs[min_idx][6]; + desc.export_rhs_to_cl_image = configs[min_idx][7]; + + return desc; +} +} // namespace cl_matmul +} // namespace arm_compute diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h new file mode 100644 index 0000000000..699f5fe8c1 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H + +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +// Forward declaration +struct MatMulKernelInfo; + +namespace cl_matmul +{ +using MatMulNativeConfigsMatrix = std::vector<std::vector<int32_t>>; + +/** This function accepts two MatMulKernelInfo objects where only the first can be with cl_image2d support enabled. + * The aim of this function is to check whether the first MatMulKernelInfo object is valid. If not, the function will + * return the second MatMulKernelInfo object. Otherwise, the first one. 
+ * + * @param[in] info0 MatMulKernelInfo with cl_image2d support + * @param[in] info1 MatMulKernelInfo to fall-back if cl_image2d cannot be used + * @param[in] m Number of rows (M) of the LHS matrix + * @param[in] n Number of columns (N) in the RHS matrix not reshaped + * @param[in] k Number of rows (K) in the RHS matrix not reshaped + * @param[in] b Batch size + * @param[in] data_type Data type + * @param[in] rhs_lock_padding Flag used to know whether the RHS paddings are locked + * + * @return @ref MatMulKernelInfo + */ +MatMulKernelInfo select_info(const MatMulKernelInfo &info0, + const MatMulKernelInfo &info1, + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type, + bool rhs_lock_padding); + +/** Find the preferred configurations for the MatMul Native kernel using the MatMulNativeConfigsMatrix provided by the user + * + * @param[in] configs List of best configurations for a limited number of MatMul shapes + * @param[in] adj_lhs Adjoint LHS flag value + * @param[in] adj_rhs Adjoint RHS flag value + * @param[in] m Number of rows (M) of the LHS matrix + * @param[in] n Number of columns (N) in the RHS matrix not reshaped + * @param[in] k Number of rows (K) in the RHS matrix not reshaped + * @param[in] b Batch size + * + * @return @ref MatMulKernelInfo + */ +MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, + bool adj_lhs, + bool adj_rhs, + unsigned int m, + unsigned int n, + unsigned int k, + unsigned int b); +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h new file mode 100644 index 0000000000..e7485bca81 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
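find_info above treats each table row as a benchmarked workload: columns 0-3 hold the tuned (M, N, K, B) shape and columns 4-7 the tuned answer (M0, N0, K0, IMG_RHS); the query returns the row nearest in Euclidean distance. Since sqrt is monotonic, comparing squared distances yields the same argmin, as in this standalone reduction (the query values are invented; the two rows are copied from the F32 tables above):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <limits>
    #include <vector>

    using Configs = std::vector<std::vector<int32_t>>; // rows: {M, N, K, B, M0, N0, K0, IMG_RHS}

    std::size_t nearest_row(const Configs &configs, int64_t m, int64_t n, int64_t k, int64_t b)
    {
        std::size_t best    = 0;
        int64_t     best_d2 = std::numeric_limits<int64_t>::max();
        for (std::size_t y = 0; y < configs.size(); ++y)
        {
            const int64_t dm = m - configs[y][0], dn = n - configs[y][1];
            const int64_t dk = k - configs[y][2], db = b - configs[y][3];
            const int64_t d2 = dm * dm + dn * dn + dk * dk + db * db; // sqrt omitted: same argmin
            if (d2 < best_d2)
            {
                best_d2 = d2;
                best    = y;
            }
        }
        return best;
    }

    int main()
    {
        const Configs     configs = {{3136, 64, 64, 36, 4, 4, 16, 1}, {24, 464, 412, 24, 2, 8, 4, 1}};
        const std::size_t row     = nearest_row(configs, 3000, 70, 60, 32); // closest to the first row
        std::printf("m0=%d n0=%d k0=%d img_rhs=%d\n", configs[row][4], configs[row][5], configs[row][6], configs[row][7]);
    }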
+ */ +#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H +#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H + +#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h" +#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" + +#include <memory> + +namespace arm_compute +{ +namespace cl_matmul +{ +/** ClMatMul configuration factory class */ +class ClMatMulNativeKernelConfigurationFactory final +{ +public: + /** Static method to call the ClMatMul configuration class accordingly with the GPU target + * + * @param[in] gpu GPU target + * + * @return IClMatMulNativeKernelConfig + */ + static std::unique_ptr<IClMatMulNativeKernelConfig> create(GPUTarget gpu) + { + switch (get_arch_from_target(gpu)) + { + case GPUTarget::MIDGARD: + case GPUTarget::BIFROST: + case GPUTarget::VALHALL: + case GPUTarget::FIFTHGEN: + return std::make_unique<ClMatMulNativeDefaultConfigValhall>(gpu); + default: + ARM_COMPUTE_ERROR("Not supported GPU target"); + } + } +}; +} // namespace cl_matmul +} // namespace arm_compute +#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG_H diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h new file mode 100644 index 0000000000..c2895b8919 --- /dev/null +++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
+
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultVariantValhall.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+
+/** ClMatMul variant factory class */
+class ClMatMulNativeKernelVariantFactory final
+{
+public:
+    /** Static method to create the ClMatMul variant class according to the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return IClMatMulNativeKernelVariant
+     */
+    static std::unique_ptr<IClMatMulNativeKernelVariant> create(GPUTarget gpu)
+    {
+        switch (get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+                return std::make_unique<ClMatMulNativeDefaultVariantValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Unsupported GPU target");
+        }
+    }
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELVARIANT_H
diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
new file mode 100644
index 0000000000..00ba3641d5
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/MatMulInfo.h"
+
+#include "src/core/common/Macros.h"
+
+#include <array>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+/** Basic container for the OpenCL MatMul Native configuration functions */
+template <class T>
+class ClMatMulNativeConfigArray
+{
+public:
+    /** Alias for F32 index */
+    static constexpr size_t DT_F32 = 0;
+    /** Alias for F16 index */
+    static constexpr size_t DT_F16 = 1;
+    /** Alias for Int8 index */
+    static constexpr size_t DT_INT8 = 2;
+
+    /** Constructor
+     *
+     * @param[in] func_f32  Function to call for matmul native F32
+     * @param[in] func_f16  Function to call for matmul native F16
+     * @param[in] func_int8 Function to call for matmul native Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+     *
+     */
+    ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
+    {
+    }
+
+    /** Method to return the matmul native configuration function based on data type
+     *
+     * @param[in] data_type Input data type
+     *
+     * @return the configuration function for the given data type, or nullptr if the data type is not supported
+     */
+    T get_function(DataType data_type)
+    {
+        switch (data_type)
+        {
+            case DataType::F32:
+                return _configs.at(DT_F32);
+            case DataType::F16:
+                return _configs.at(DT_F16);
+            case DataType::QASYMM8:
+            case DataType::QASYMM8_SIGNED:
+            case DataType::QSYMM8_PER_CHANNEL:
+                return _configs.at(DT_INT8);
+            default:
+                return nullptr;
+        }
+    }
+
+private:
+    std::array<T, 3> _configs;
+};
+
+/** Basic interface for the matmul native kernel configuration
+ * This is the base class that chooses architecture-specific kernel configurations.
+ */
+class IClMatMulNativeKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    IClMatMulNativeKernelConfig(GPUTarget arch) : _target(arch)
+    {
+    }
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelConfig);
+    /** Virtual destructor */
+    virtual ~IClMatMulNativeKernelConfig() = default;
+    /** This method returns the @ref MatMulKernelInfo for the given inputs
+     *
+     * @param[in] lhs  LHS tensor
+     * @param[in] rhs  RHS tensor
+     * @param[in] info MatMul info
+     *
+     * @return @ref MatMulKernelInfo
+     */
+    virtual MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) = 0;
+
+protected:
+    GPUTarget _target;
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELCONFIG_H
diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h
new file mode 100644
index 0000000000..eac41dd6a3
--- /dev/null
+++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelVariant.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H
+
+#include "arm_compute/core/CoreTypes.h" // DataType
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/function_info/MatMulInfo.h"
+
+#include "src/core/common/Macros.h"
+
+#include <array>
+
+namespace arm_compute
+{
+namespace cl_matmul
+{
+enum class MatMulKernelType
+{
+    /** Native matrix multiplication for FP types */
+    NATIVE_FP,
+
+    /** Native matrix multiplication for quantized types */
+    NATIVE_QUANTIZED,
+
+    /** Native matrix multiplication using the MMUL extension for FP types */
+    NATIVE_MMUL_FP,
+
+    /** Native matrix multiplication using the MMUL extension for quantized types */
+    NATIVE_MMUL_QUANTIZED
+};
+
+/** Basic container for the OpenCL MatMul Native variant functions */
+template <class T>
+class ClMatMulNativeVariantArray
+{
+public:
+    /** Alias for Float index */
+    static constexpr size_t DT_FLOAT = 0;
+    /** Alias for Quantized type index */
+    static constexpr size_t DT_QUANTIZED = 1;
+
+    /** Constructor
+     *
+     * @param[in] func_float     Function to call for matmul native float (F32, F16)
+     * @param[in] func_quantized Function to call for matmul native quantized (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+     *
+     */
+    ClMatMulNativeVariantArray(T func_float, T func_quantized) : _configs{func_float, func_quantized}
+    {
+    }
+
+    /** Method to return the matmul native variant function based on data type
+     *
+     * @param[in] data_type Input data type
+     *
+     * @return the variant function for the given data type, or nullptr if the data type is not supported
+     */
+    T get_function(DataType data_type)
+    {
+        switch (data_type)
+        {
+            case DataType::F32:
+            case DataType::F16:
+                return _configs.at(DT_FLOAT);
+            case DataType::QASYMM8:
+            case DataType::QASYMM8_SIGNED:
+            case DataType::QSYMM8_PER_CHANNEL:
+                return _configs.at(DT_QUANTIZED);
+            default:
+                return nullptr;
+        }
+    }
+
+private:
+    std::array<T, 2> _configs;
+};
+
+/** Basic interface for the matmul native kernel variant
+ * This is the base class that chooses architecture-specific kernel variants.
+ */
+class IClMatMulNativeKernelVariant
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    IClMatMulNativeKernelVariant(GPUTarget arch) : _target(arch)
+    {
+    }
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelVariant);
+    /** Virtual destructor */
+    virtual ~IClMatMulNativeKernelVariant() = default;
+    /** This method returns the @ref MatMulKernelType for the given inputs
+     *
+     * @param[in] lhs      LHS tensor
+     * @param[in] rhs      RHS tensor
+     * @param[in] info     MatMul info
+     * @param[in] act_info Activation layer info
+     *
+     * @return @ref MatMulKernelType
+     */
+    virtual MatMulKernelType select_kernel(const ITensorInfo *lhs,
+                                           const ITensorInfo *rhs,
+                                           const MatMulInfo &info,
+                                           const ActivationLayerInfo &act_info) = 0;
+
+protected:
+    GPUTarget _target;
+};
+} // namespace cl_matmul
+} // namespace arm_compute
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_ICLMATMULNATIVEKERNELVARIANT_H
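Taken together, these headers expose two complementary entry points: ClMatMulNativeKernelVariantFactory decides which kernel flavour (a MatMulKernelType) to run, while ClMatMulNativeKernelConfigurationFactory decides the tuning parameters (a MatMulKernelInfo) for it. The sketch below is not part of the patch; the tensor shapes, data type, and the example() wrapper are illustrative assumptions showing how a caller inside the library might combine the two factories:

// Illustrative usage sketch only -- shapes, data type, and example() are
// assumptions, not code from this patch.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/function_info/MatMulInfo.h"

#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h"
#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h"

using namespace arm_compute;
using namespace arm_compute::cl_matmul;

MatMulKernelInfo example(GPUTarget gpu)
{
    // LHS is M x K, RHS is K x N; TensorShape lists dimensions innermost first.
    const TensorInfo lhs(TensorShape(64U, 128U), 1, DataType::F16); // K = 64, M = 128
    const TensorInfo rhs(TensorShape(32U, 64U), 1, DataType::F16);  // N = 32, K = 64
    const MatMulInfo mm_info = MatMulInfo().adj_lhs(false).adj_rhs(false);

    // Step 1: pick the kernel flavour (plain native vs. MMUL, FP vs. quantized).
    auto variant_heuristic = ClMatMulNativeKernelVariantFactory::create(gpu);
    const MatMulKernelType kernel_type =
        variant_heuristic->select_kernel(&lhs, &rhs, mm_info, ActivationLayerInfo());
    (void)kernel_type; // a real caller would dispatch on this value

    // Step 2: pick the blocking parameters (M0/N0/K0, cl_image2d usage) for that kernel.
    auto config_heuristic = ClMatMulNativeKernelConfigurationFactory::create(gpu);
    return config_heuristic->configure(&lhs, &rhs, mm_info);
}

Splitting variant selection from blocking configuration keeps the data-type dispatch (the ConfigArray/VariantArray containers above) orthogonal to the per-architecture dispatch in the factories, so a new GPU family only needs a new case in the two create() switches.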