From 923241eb998ad031f4cce7b12d8c24a0b6c80be8 Mon Sep 17 00:00:00 2001
From: zhenglin
Date: Tue, 5 Dec 2017 11:30:51 +0800
Subject: APPBROWSER-314: Performance optimization for BatchNormalizationLayer

Change-Id: Ie3ad9abb64e90720609bb6e67662eaf9dd4f3689
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111826
Reviewed-by: Joel Liang
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com
(cherry picked from commit 02c1fa663926cc4fcd1995d4d18d7528e0c85d94)
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111834
Reviewed-by: Anthony Barbier
---
 .../cs_shaders/batchnormalization_layer.cs      | 91 ++++++++++++++++------
 .../kernels/GCBatchNormalizationLayerKernel.cpp | 14 ++--
 .../GLES_COMPUTE/BatchNormalizationLayer.cpp    | 73 +++++++++++++++++
 .../fixtures/BatchNormalizationLayerFixture.h   | 12 +++
 4 files changed, 161 insertions(+), 29 deletions(-)
 create mode 100755 tests/benchmark/GLES_COMPUTE/BatchNormalizationLayer.cpp

diff --git a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
index c3df5d5c4d..be1d01f6c5 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
@@ -127,12 +127,12 @@ void main(void)
 }
 
 #elif defined(DATA_TYPE_FP16)
-BUFFER_DECLARATION(src, 1, uint, );
-BUFFER_DECLARATION(dst, 2, uint, writeonly);
-BUFFER_DECLARATION(mean, 3, uint, );
-BUFFER_DECLARATION(var, 4, uint, );
-BUFFER_DECLARATION(beta, 5, uint, );
-BUFFER_DECLARATION(gamma, 6, uint, );
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(mean, 3, uvec2, readonly);
+BUFFER_DECLARATION(var, 4, uvec2, readonly);
+BUFFER_DECLARATION(beta, 5, uvec2, readonly);
+BUFFER_DECLARATION(gamma, 6, uvec2, readonly);
 
 /** Apply batch normalization.
 *
@@ -180,43 +180,86 @@ void main(void)
     Vector beta = CONVERT_TO_VECTOR_STRUCT_FP16(beta);
     Vector gamma = CONVERT_TO_VECTOR_STRUCT_FP16(gamma);
 
-    vec2 input_value;
+    uvec2 packed_s[5];
+    vec4 unpacked_s[5];
     float denominator;
     float numerator;
-    vec2 x_bar;
     float gamma_param;
     float beta_param;
+    vec4 x_bar;
+    vec4 result;
 
     uint current_slice = gl_GlobalInvocationID.z;
-    if((current_slice % uint(2)) == uint(0))
+
+    packed_s[0] = src_ptr[src.current_offset >> 3];
+    packed_s[1] = var_ptr[(var.current_offset + current_slice * var.stride_x) >> 3];
+    packed_s[2] = mean_ptr[(mean.current_offset + current_slice * mean.stride_x) >> 3];
+    packed_s[3] = gamma_ptr[(gamma.current_offset + current_slice * beta.stride_x) >> 3];
+    packed_s[4] = beta_ptr[(beta.current_offset + current_slice * beta.stride_x) >> 3];
+    unpacked_s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+    unpacked_s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+    unpacked_s[2] = vec4(unpackHalf2x16(packed_s[2].x), unpackHalf2x16(packed_s[2].y));
+    unpacked_s[3] = vec4(unpackHalf2x16(packed_s[3].x), unpackHalf2x16(packed_s[3].y));
+    unpacked_s[4] = vec4(unpackHalf2x16(packed_s[4].x), unpackHalf2x16(packed_s[4].y));
+
+    if((current_slice % uint(4)) == uint(0))
+    {
+        denominator = unpacked_s[1].x;
+        denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
+
+        //Calculate x bar and store results
+        numerator = unpacked_s[2].x;
+        x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+
+        gamma_param = unpacked_s[3].x;
+        beta_param = unpacked_s[4].x;
+        result = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+
+        dst_ptr[dst.current_offset >> 3] = uvec2(packHalf2x16(result.xy), packHalf2x16(result.zw));
+    }
+    else if((current_slice % uint(4)) == uint(1))
     {
-        input_value = unpackHalf2x16(src_ptr[src.current_offset >> 2]);
-        denominator = unpackHalf2x16(var_ptr[(var.current_offset + current_slice * var.stride_x) >> 2]).x;
+        denominator = unpacked_s[1].y;
         denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
 
         //Calculate x bar and store results
-        numerator = unpackHalf2x16(mean_ptr[(mean.current_offset + current_slice * mean.stride_x) >> 2]).x;
-        x_bar = MUL_OP(SUB_OP(input_value, numerator), denominator);
+        numerator = unpacked_s[2].y;
+        x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
 
-        gamma_param = unpackHalf2x16(gamma_ptr[(gamma.current_offset + current_slice * beta.stride_x) >> 2]).x;
-        beta_param = unpackHalf2x16(beta_ptr[(beta.current_offset + current_slice * beta.stride_x) >> 2]).x;
+        gamma_param = unpacked_s[3].y;
+        beta_param = unpacked_s[4].y;
+        result = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
 
-        dst_ptr[dst.current_offset >> 2] = packHalf2x16(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
+        dst_ptr[dst.current_offset >> 3] = uvec2(packHalf2x16(result.xy), packHalf2x16(result.zw));
+    }
+    else if((current_slice % uint(4)) == uint(2))
+    {
+        denominator = unpacked_s[1].z;
+        denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
+
+        //Calculate x bar and store results
+        numerator = unpacked_s[2].z;
+        x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+
+        gamma_param = unpacked_s[3].z;
+        beta_param = unpacked_s[4].z;
+        result = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+
+        dst_ptr[dst.current_offset >> 3] = uvec2(packHalf2x16(result.xy), packHalf2x16(result.zw));
     }
     else
     {
-        input_value = unpackHalf2x16(src_ptr[src.current_offset >> 2]);
-        denominator = unpackHalf2x16(var_ptr[(var.current_offset + current_slice * var.stride_x) >> 2]).y;
+        denominator = unpacked_s[1].w;
         denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
 
         //Calculate x bar and store results
-        numerator = unpackHalf2x16(mean_ptr[(mean.current_offset + current_slice * mean.stride_x) >> 2]).y;
-        x_bar = MUL_OP(SUB_OP(input_value, numerator), denominator);
+        numerator = unpacked_s[2].w;
+        x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
 
-        gamma_param = unpackHalf2x16(gamma_ptr[(gamma.current_offset + current_slice * beta.stride_x) >> 2]).y;
-        beta_param = unpackHalf2x16(beta_ptr[(beta.current_offset + current_slice * beta.stride_x) >> 2]).y;
+        gamma_param = unpacked_s[3].w;
+        beta_param = unpacked_s[4].w;
+        result = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
 
-        dst_ptr[dst.current_offset >> 2] = packHalf2x16(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
+        dst_ptr[dst.current_offset >> 3] = uvec2(packHalf2x16(result.xy), packHalf2x16(result.zw));
     }
 }
-#endif /*DATA_TYPE_FP32*/
+#endif /*DATA_TYPE_FP16*/
diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
index 982143f0b2..dee2a5579b 100644
--- a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
@@ -64,7 +64,11 @@ void GCBatchNormalizationLayerKernel::configure(const IGCTensor *input, IGCTenso
     _gamma = gamma;
     _epsilon = epsilon;
 
-    const unsigned int num_elems_processed_per_iteration = 4 / input->info()->element_size();
+    unsigned int num_elems_processed_per_iteration = 1;
+    if(input->info()->data_type() == DataType::F16)
+    {
+        num_elems_processed_per_iteration = 4;
+    }
 
     // Set build options
     std::set<std::string> build_opts;
@@ -83,10 +87,10 @@ void GCBatchNormalizationLayerKernel::configure(const IGCTensor *input, IGCTenso
 
     AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowStatic mean_access(mean->info(), 0, 0, mean->info()->dimension(0) + 1, mean->info()->dimension(1));
-    AccessWindowStatic var_access(var->info(), 0, 0, var->info()->dimension(0) + 1, var->info()->dimension(1));
-    AccessWindowStatic beta_access(beta->info(), 0, 0, beta->info()->dimension(0) + 1, beta->info()->dimension(1));
-    AccessWindowStatic gamma_access(gamma->info(), 0, 0, gamma->info()->dimension(0) + 1, gamma->info()->dimension(1));
+    AccessWindowStatic mean_access(mean->info(), 0, 0, mean->info()->dimension(0) + 3, mean->info()->dimension(1));
+    AccessWindowStatic var_access(var->info(), 0, 0, var->info()->dimension(0) + 3, var->info()->dimension(1));
+    AccessWindowStatic beta_access(beta->info(), 0, 0, beta->info()->dimension(0) + 3, beta->info()->dimension(1));
+    AccessWindowStatic gamma_access(gamma->info(), 0, 0, gamma->info()->dimension(0) + 3, gamma->info()->dimension(1));
 
     update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access, gamma_access);
     output_access.set_valid_region(win, input->info()->valid_region());
diff --git a/tests/benchmark/GLES_COMPUTE/BatchNormalizationLayer.cpp b/tests/benchmark/GLES_COMPUTE/BatchNormalizationLayer.cpp
new file mode 100755
index 0000000000..4464ea2401
--- /dev/null
+++ b/tests/benchmark/GLES_COMPUTE/BatchNormalizationLayer.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITCLSS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONCLCTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h" +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h" +#include "tests/GLES_COMPUTE/GCAccessor.h" +#include "tests/benchmark/fixtures/BatchNormalizationLayerFixture.h" +#include "tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4BatchNormalizationLayerDataset.h" +#include "tests/datasets/system_tests/yolo/v2/YOLOV2BatchNormalizationLayerDataset.h" +#include "tests/framework/Macros.h" +#include "tests/framework/datasets/Datasets.h" +#include "utils/TypePrinter.h" + +namespace arm_compute +{ +namespace test +{ +namespace +{ +const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 }); +} // namespace + +using GCBatchNormalizationLayerFixture = BatchNormalizationLayerFixture; + +TEST_SUITE(GC) + +REGISTER_FIXTURE_DATA_TEST_CASE(YOLOV2BatchNormalizationLayer, GCBatchNormalizationLayerFixture, framework::DatasetMode::ALL, + framework::dataset::combine(framework::dataset::combine(datasets::YOLOV2BatchNormalizationLayerDataset(), + data_types), + framework::dataset::make("Batches", 1))); + +REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV4BatchNormalizationLayer, GCBatchNormalizationLayerFixture, framework::DatasetMode::ALL, + framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetInceptionV4BatchNormalizationLayerDataset(), + data_types), + framework::dataset::make("Batches", 1))); + +TEST_SUITE(NIGHTLY) +REGISTER_FIXTURE_DATA_TEST_CASE(YOLOV2BatchNormalizationLayer, GCBatchNormalizationLayerFixture, framework::DatasetMode::NIGHTLY, + framework::dataset::combine(framework::dataset::combine(datasets::YOLOV2BatchNormalizationLayerDataset(), + data_types), + framework::dataset::make("Batches", { 4, 8 }))); + +REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV4BatchNormalizationLayer, GCBatchNormalizationLayerFixture, framework::DatasetMode::NIGHTLY, + framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetInceptionV4BatchNormalizationLayerDataset(), + data_types), + framework::dataset::make("Batches", { 4, 8 }))); +TEST_SUITE_END() +TEST_SUITE_END() +} // namespace test +} // namespace arm_compute diff --git 
index 79dbc76300..55411a44d1 100644
--- a/tests/benchmark/fixtures/BatchNormalizationLayerFixture.h
+++ b/tests/benchmark/fixtures/BatchNormalizationLayerFixture.h
@@ -29,6 +29,12 @@
 #include "tests/Globals.h"
 #include "tests/Utils.h"
 #include "tests/framework/Fixture.h"
+#ifdef ARM_COMPUTE_GC
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "tests/GLES_COMPUTE/Helper.h"
+
+using namespace arm_compute::test::gles_compute;
+#endif /* ARM_COMPUTE_GC */
 
 namespace arm_compute
 {
@@ -76,6 +82,12 @@ public:
     void run()
     {
         batch_norm_layer.run();
+#ifdef ARM_COMPUTE_GC
+        if(opengles31_is_available() && std::is_same<typename std::decay<TensorType>::type, arm_compute::GCTensor>::value)
+        {
+            force_sync_tensor(dst);
+        }
+#endif /* ARM_COMPUTE_GC */
     }
 
     void teardown()
-- 
cgit v1.2.1
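
Note on the FP16 path (illustrative sketch only, not part of the patch): the speedup comes from widening the SSBO element type from uint to uvec2, so each invocation loads four packed fp16 values per buffer access instead of two and writes four results back with a single store. The standalone compute shader below shows that uvec2 / unpackHalf2x16 / packHalf2x16 pattern in isolation; the buffer layout, binding points, uniform names and the epsilon parameter are invented for this example and do not correspond to the BUFFER_DECLARATION helpers used by the library shader.

#version 310 es
// Illustrative sketch of the fp16 packing pattern used by the optimized shader.
// Each invocation loads one uvec2 (four packed fp16 values), applies
// batch normalization in vec4 arithmetic, and stores one uvec2 back.
precision highp float;

layout(local_size_x = 4) in;

layout(std430, binding = 0) readonly buffer SrcBuffer
{
    uvec2 src[];
};
layout(std430, binding = 1) writeonly buffer DstBuffer
{
    uvec2 dst[];
};

// Per-slice batch-normalization parameters (scalars for one channel).
uniform float mean_val;
uniform float var_val;
uniform float gamma_val;
uniform float beta_val;
uniform float epsilon;

void main(void)
{
    uint idx = gl_GlobalInvocationID.x;

    // One load yields two packed 32-bit words, i.e. four fp16 input values.
    uvec2 packed_in = src[idx];
    vec4  x         = vec4(unpackHalf2x16(packed_in.x), unpackHalf2x16(packed_in.y));

    // Batch normalization: gamma * (x - mean) / sqrt(var + epsilon) + beta.
    float denominator = inversesqrt(var_val + epsilon);
    vec4  x_bar       = (x - mean_val) * denominator;
    vec4  result      = gamma_val * x_bar + beta_val;

    // Repack the four results into a single uvec2 store.
    dst[idx] = uvec2(packHalf2x16(result.xy), packHalf2x16(result.zw));
}

Compared with the previous uint-based version, which handled only two fp16 values per load/store, this halves the number of buffer transactions per element, which matches the change from two code paths (current_slice % 2) to four (current_slice % 4) in the patched shader.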