From 6e09e1404c635d948cf20eb6b4b5747dfb6656f2 Mon Sep 17 00:00:00 2001
From: Murray Kornelsen
Date: Wed, 13 Jul 2022 21:40:26 -0400
Subject: INT8 Quantized MeanStdDevNorm (LayerNorm)

Implements LayerNorm for qasymm8 tensors.
Uses uint8x16 loads and stores.
Summation is performed in integer arithmetic (vpaddl).
Normalization is performed in float32 before requantizing back to int8.
(An illustrative scalar sketch of this flow is appended after the patch.)

Signed-off-by: Murray Kornelsen
Change-Id: I2407c8b34717fb47adab98791bd76fb8a3c62f4a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7922
Comments-Addressed: Pablo Marquez Tello
Comments-Addressed: Arm Jenkins
Reviewed-by: Viet-Hoa Do
Reviewed-by: Pablo Marquez Tello
Tested-by: Arm Jenkins
Benchmark: Arm Jenkins
---
 Android.bp                                          |   1 +
 filelist.json                                       |   3 +-
 scripts/clang-tidy.h                                |   7 +
 .../kernels/NEMeanStdDevNormalizationKernel.cpp     |   9 +-
 .../meanstddevnorm/generic/neon/qasymm8.cpp         | 145 +++++++++++++++++++++
 src/cpu/kernels/meanstddevnorm/list.h               |   1 +
 .../NEON/MeanStdDevNormalizationLayer.cpp           |  19 ++-
 .../fixtures/MeanStdDevNormalizationLayerFixture.h  |  39 +++---
 .../reference/MeanStdDevNormalizationLayer.cpp      |  11 +-
 9 files changed, 213 insertions(+), 22 deletions(-)
 create mode 100644 src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp

diff --git a/Android.bp b/Android.bp
index 6f6c66cc55..8c6d700062 100644
--- a/Android.bp
+++ b/Android.bp
@@ -520,6 +520,7 @@ cc_library_static {
         "src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp",
         "src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp",
         "src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp",
+        "src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp",
         "src/cpu/kernels/pool2d/neon/fp16.cpp",
         "src/cpu/kernels/pool2d/neon/fp32.cpp",
         "src/cpu/kernels/pool2d/neon/nchw/all.cpp",

diff --git a/filelist.json b/filelist.json
index c218ed9129..eb39915524 100644
--- a/filelist.json
+++ b/filelist.json
@@ -1738,7 +1738,8 @@
         "neon":{
           "common":["src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp"],
           "fp32":["src/cpu/kernels/meanstddevnorm/generic/neon/fp32.cpp"],
-          "fp16":["src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp"]
+          "fp16":["src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp"],
+          "qasymm8":["src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp"]
         }
       }
     },

diff --git a/scripts/clang-tidy.h b/scripts/clang-tidy.h
index b3705122c6..24e4b15c6f 100644
--- a/scripts/clang-tidy.h
+++ b/scripts/clang-tidy.h
@@ -1,5 +1,12 @@
 #include <arm_neon.h>
 
+#if __arm__
+inline uint32x4_t vpaddq_u32(uint32x4_t, uint32x4_t)
+{
+    return vdupq_n_u32(0);
+}
+#endif
+
 inline float16x4_t vrsqrts_f16 (float16x4_t, float16x4_t)
 {
     return vdup_n_f16(0);

diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index 7d8fc7ec7f..37e88a8565 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -55,7 +55,7 @@ struct MeanStdDevNormKernel
     MeanStdDevNormUKernelPtr ukernel;
 };
 
-static const MeanStdDevNormKernel available_kernels[] =
+static const std::vector<MeanStdDevNormKernel> available_kernels =
 {
     {
         "fp32_neon_meanstddevnorm",
         [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F32; },
         REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)
     },
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     {
         "fp16_neon_meanstddevnorm",
         [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F16; },
         REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)
     },
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    {
+        "qasymm8_neon_meanstddevnorm",
+        [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::QASYMM8; },
+        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)
+    },
 };
 
 /** Micro-kernel selector
@@ -95,7 +100,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, f
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 2, "Input tensor cannot have more than 2 dimensions");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8);
 
     // Checks performed when output is configured
     if((output != nullptr) && (output->total_size() != 0))

diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp
new file mode 100644
index 0000000000..53af1e4b16
--- /dev/null
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include <cmath>
+
+namespace
+{
+inline float32x4_t clamp_v4f32(float32x4_t block, float32x4_t quant_min_vec, float32x4_t quant_max_vec)
+{
+    return vminq_f32(vmaxq_f32(block, quant_min_vec), quant_max_vec);
+}
+inline uint16x8_t fuse_words_f32(float32x4_t fb1, float32x4_t fb2)
+{
+    return vcombine_u16(vmovn_u32(vcvtq_u32_f32(fb1)), vmovn_u32(vcvtq_u32_f32(fb2)));
+}
+inline uint8x16_t fuse_shorts_u16(uint16x8_t sb1, uint16x8_t sb2)
+{
+    return vcombine_u8(vmovn_u16(sb1), vmovn_u16(sb2));
+}
+} // namespace
+
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_qasymm8_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window)
+{
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    const int window_step_x  = 16;
+    const int window_start_x = static_cast<int>(window.x().start());
+    const int window_end_x   = static_cast<int>(window.x().end());
+
+    const UniformQuantizationInfo qi_out        = output->info()->quantization_info().uniform();
+    const float                   output_scale  = qi_out.scale;
+    const int                     output_offset = qi_out.offset;
+
+    Iterator input_itr(input, win);
+    Iterator output_itr(output, win);
+
+    const float       output_inv_scale = 1.0f / output_scale;
+    const float32x4_t quant_max_vec    = vdupq_n_f32(255.0f);
+    const float32x4_t quant_min_vec    = vdupq_n_f32(0.0f);
+
+    execute_window_loop(
+        win, [&](const Coordinates &)
+    {
+        int  x       = window_start_x;
+        auto in_ptr  = reinterpret_cast<const uint8_t *>(input_itr.ptr());
+        auto out_ptr = reinterpret_cast<uint8_t *>(output_itr.ptr());
+
+        uint32x4_t sum_vec    = vdupq_n_u32(0);
+        uint32x4_t sum_sq_vec = vdupq_n_u32(0);
+
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const uint8x16_t data = vld1q_u8(in_ptr + x);
+            sum_vec               = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data)));
+            const uint16x8_t squares_low  = vmull_u8(vget_low_u8(data), vget_low_u8(data));
+            const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data));
+            sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high)));
+        }
+
+#ifdef __aarch64__
+        sum_vec         = vpaddq_u32(sum_vec, sum_vec);
+        sum_vec         = vpaddq_u32(sum_vec, sum_vec);
+        uint32_t sum    = vgetq_lane_u32(sum_vec, 0);
+        sum_sq_vec      = vpaddq_u32(sum_sq_vec, sum_sq_vec);
+        sum_sq_vec      = vpaddq_u32(sum_sq_vec, sum_sq_vec);
+        uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0);
+#elif __arm__ // #ifdef __aarch64__
+        uint32_t sum = vgetq_lane_u32(sum_vec, 0) +
+                       vgetq_lane_u32(sum_vec, 1) +
+                       vgetq_lane_u32(sum_vec, 2) +
+                       vgetq_lane_u32(sum_vec, 3);
+
+        uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) +
+                          vgetq_lane_u32(sum_sq_vec, 1) +
+                          vgetq_lane_u32(sum_sq_vec, 2) +
+                          vgetq_lane_u32(sum_sq_vec, 3);
+#endif // #ifdef __aarch64__
+        for(; x < window_end_x; ++x)
+        {
+            auto data = static_cast<uint32_t>(*(in_ptr + x));
+            sum += data;
+            sum_sq += (data * data);
+        }
+
+        const float       mean      = (static_cast<float>(sum) / static_cast<float>(input->info()->dimension(0)));
+        const float       var       = (static_cast<float>(sum_sq) / static_cast<float>(input->info()->dimension(0))) - (mean * mean);
+        const float       stdev_inv = 1.0f / sqrtf(var + epsilon);
+        const float32x4_t v_scale   = vdupq_n_f32(stdev_inv * output_inv_scale);
+        const float32x4_t v_offset  = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset);
+        for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const uint8x16_t data = vld1q_u8(in_ptr + x);
+            float32x4_t db1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data)))));
+            float32x4_t db2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data)))));
+            float32x4_t db3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data)))));
+            float32x4_t db4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data)))));
+            db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec);
+            db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec);
+            db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec);
+            db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec);
+            const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4));
+            vst1q_u8(out_ptr + x, out);
+        }
+
+        for(; x < window_end_x; ++x)
+        {
+            auto data         = static_cast<float>(*(in_ptr + x));
+            const uint8_t res = data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset);
+            *(out_ptr + x)    = res;
+        }
+    },
+    input_itr, output_itr);
+}
+} // namespace cpu
+} // namespace arm_compute

diff --git a/src/cpu/kernels/meanstddevnorm/list.h b/src/cpu/kernels/meanstddevnorm/list.h
index ac9cb37d23..6277d65884 100644
--- a/src/cpu/kernels/meanstddevnorm/list.h
+++ b/src/cpu/kernels/meanstddevnorm/list.h
@@ -32,6 +32,7 @@ namespace cpu
 
 DECLARE_MEANSTDDEVNORM_KERNEL(neon_fp32_meanstddevnorm);
 DECLARE_MEANSTDDEVNORM_KERNEL(neon_fp16_meanstddevnorm);
+DECLARE_MEANSTDDEVNORM_KERNEL(neon_qasymm8_meanstddevnorm);
 #undef DECLARE_MEANSTDDEVNORM_KERNEL
 
 } // namespace cpu

diff --git a/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp b/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
index dee8f78da9..085f3608a0 100644
--- a/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
+++ b/tests/validation/NEON/MeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,8 @@ namespace
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 RelativeTolerance<half> tolerance_f16(half(0.2f));
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-RelativeTolerance<float> tolerance_f32(1e-4f);
+RelativeTolerance<float>   tolerance_f32(1e-4f);
+RelativeTolerance<uint8_t> tolerance_qasymm8(1);
 } // namespace
 
 TEST_SUITE(NEON)
@@ -114,9 +115,23 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEMeanStdDevNormalizationLayerFixture<float>, f
     // Validate output
     validate(Accessor(_target), _reference, tolerance_f32);
 }
+
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEMeanStdDevNormalizationLayerFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::Small2DShapes(),
+                                                                                                                            framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                                            framework::dataset::make("InPlace", { false, true })),
+                                                                                                                            framework::dataset::make("Epsilon", { 1e-7 })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // MeanStdNormalizationLayer
 TEST_SUITE_END() // Neon
 } // namespace validation

diff --git a/tests/validation/fixtures/MeanStdDevNormalizationLayerFixture.h b/tests/validation/fixtures/MeanStdDevNormalizationLayerFixture.h
index 9868cd1abf..f3c108e6da 100644
--- a/tests/validation/fixtures/MeanStdDevNormalizationLayerFixture.h
+++ b/tests/validation/fixtures/MeanStdDevNormalizationLayerFixture.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -45,29 +45,35 @@ class MeanStdDevNormalizationLayerValidationFixture : public framework::Fixture
 {
 public:
     template <typename T>
-    void setup(TensorShape shape, DataType dt, bool in_place, float epsilon = 1e-8f)
+    void setup(TensorShape shape, DataType dt, bool in_place, float epsilon = 1e-8)
     {
-        _data_type = dt;
-        _target    = compute_target(shape, dt, in_place, epsilon);
-        _reference = compute_reference(shape, dt, epsilon);
+        QuantizationInfo qi = QuantizationInfo(0.5f, 10);
+        _data_type          = dt;
+        _target             = compute_target(shape, dt, in_place, epsilon, qi);
+        _reference          = compute_reference(shape, dt, epsilon, qi);
     }
 
 protected:
     template <typename U>
-    void fill(U &&src_tensor)
+    void fill(U &&tensor)
     {
-        static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported.");
-        using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type;
-
-        DistributionType distribution{ T(-1.0f), T(1.0f) };
-        library->fill(src_tensor, distribution, 0);
+        if(is_data_type_float(_data_type))
+        {
+            std::uniform_real_distribution<> distribution{ -1.0f, 1.0f };
+            library->fill(tensor, distribution, 0);
+        }
+        else
+        {
+            std::uniform_int_distribution<> distribution{ 0, 255 };
+            library->fill(tensor, distribution, 0);
+        }
     }
 
-    TensorType compute_target(TensorShape shape, DataType dt, bool in_place, float epsilon)
+    TensorType compute_target(TensorShape shape, DataType dt, bool in_place, float epsilon, QuantizationInfo qi)
     {
         // Create tensors
-        TensorType src = create_tensor<TensorType>(shape, dt, 1);
-        TensorType dst;
+        TensorType src = create_tensor<TensorType>(shape, dt, 1, qi);
+        TensorType dst = create_tensor<TensorType>(shape, dt, 1, qi);
 
         TensorType *dst_ptr = in_place ? &src : &dst;
@@ -104,10 +110,10 @@ protected:
         }
     }
 
-    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType dt, float epsilon)
+    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType dt, float epsilon, QuantizationInfo qi)
     {
         // Create reference
-        SimpleTensor<T> ref_src{ shape, dt, 1 };
+        SimpleTensor<T> ref_src{ shape, dt, 1, qi };
 
         // Fill reference
         fill(ref_src);
@@ -119,6 +125,7 @@ protected:
     SimpleTensor<T> _reference{};
     DataType        _data_type{};
 };
+
 } // namespace validation
 } // namespace test
 } // namespace arm_compute

diff --git a/tests/validation/reference/MeanStdDevNormalizationLayer.cpp b/tests/validation/reference/MeanStdDevNormalizationLayer.cpp
index 0a23fa19bb..a7c8a784d9 100644
--- a/tests/validation/reference/MeanStdDevNormalizationLayer.cpp
+++ b/tests/validation/reference/MeanStdDevNormalizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -63,6 +63,15 @@ SimpleTensor<T> mean_std_normalization_layer(const SimpleTensor<T> &src, float e
     return dst;
 }
 
+template <>
+SimpleTensor<uint8_t> mean_std_normalization_layer(const SimpleTensor<uint8_t> &src, float epsilon)
+{
+    SimpleTensor<float>   src_tmp = convert_from_asymmetric(src);
+    SimpleTensor<float>   dst_tmp = mean_std_normalization_layer<float>(src_tmp, epsilon);
+    SimpleTensor<uint8_t> dst     = convert_to_asymmetric<uint8_t>(dst_tmp, src.quantization_info());
+    return dst;
+}
+
 template SimpleTensor<half> mean_std_normalization_layer(const SimpleTensor<half> &src, float epsilon);
 template SimpleTensor<float> mean_std_normalization_layer(const SimpleTensor<float> &src, float epsilon);
 } // namespace reference
--
cgit v1.2.1
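The standalone sketch below (referenced from the commit message) summarizes the per-row math the new kernel implements: integer accumulation of sum and sum-of-squares, float32 mean/variance and normalization, then requantization with the output scale and offset, clamped to [0, 255]. It is illustrative only, not Compute Library code: the function name is ours, and the quantization parameters are borrowed from the test fixture's QuantizationInfo(0.5f, 10). The input tensor's own scale and offset cancel out of (x - mean) * stdev_inv, which is why only the output quantization appears.

// mean_stddev_norm_qasymm8_sketch.cpp -- illustrative scalar model of the kernel.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Normalizes one row of QASYMM8 data:
// out = clamp((x - mean) / stddev / out_scale + out_offset, 0, 255).
std::vector<uint8_t> mean_stddev_norm_qasymm8(const std::vector<uint8_t> &row,
                                              float out_scale, int out_offset,
                                              float epsilon = 1e-8f)
{
    // Integer accumulation, as the kernel does with vpaddl/vmull.
    // uint32 sums are sufficient for rows up to ~66k elements (255^2 * n).
    uint32_t sum = 0, sum_sq = 0;
    for(uint8_t v : row)
    {
        sum += v;
        sum_sq += static_cast<uint32_t>(v) * v;
    }
    const float n         = static_cast<float>(row.size());
    const float mean      = static_cast<float>(sum) / n;
    const float var       = static_cast<float>(sum_sq) / n - mean * mean;
    const float stdev_inv = 1.0f / std::sqrt(var + epsilon);

    // Fold normalization and requantization into one scale and one offset,
    // matching the kernel's v_scale / v_offset vectors.
    const float scale  = stdev_inv / out_scale;
    const float offset = -mean * stdev_inv / out_scale + static_cast<float>(out_offset);

    std::vector<uint8_t> out(row.size());
    for(std::size_t i = 0; i < row.size(); ++i)
    {
        const float q = static_cast<float>(row[i]) * scale + offset;
        out[i]        = static_cast<uint8_t>(std::min(std::max(q, 0.0f), 255.0f));
    }
    return out;
}

int main()
{
    // Quantization parameters as in the test fixture: scale 0.5, offset 10.
    const std::vector<uint8_t> row = { 0, 50, 100, 150, 200, 250 };
    for(uint8_t q : mean_stddev_norm_qasymm8(row, 0.5f, 10))
    {
        std::printf("%d ", static_cast<int>(q));
    }
    std::printf("\n");
    return 0;
}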