aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs 91
-rw-r--r-- src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp 14
-rwxr-xr-x tests/benchmark/GLES_COMPUTE/BatchNormalizationLayer.cpp 73
-rw-r--r-- tests/benchmark/fixtures/BatchNormalizationLayerFixture.h 12
4 files changed, 161 insertions, 29 deletions
diff --git a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
index c3df5d5c4d..be1d01f6c5 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
@@ -127,12 +127,12 @@ void main(void)
}
#elif defined(DATA_TYPE_FP16)
-BUFFER_DECLARATION(src, 1, uint, );
-BUFFER_DECLARATION(dst, 2, uint, writeonly);
-BUFFER_DECLARATION(mean, 3, uint, );
-BUFFER_DECLARATION(var, 4, uint, );
-BUFFER_DECLARATION(beta, 5, uint, );
-BUFFER_DECLARATION(gamma, 6, uint, );
+BUFFER_DECLARATION(src, 1, uvec2, readonly);
+BUFFER_DECLARATION(dst, 2, uvec2, writeonly);
+BUFFER_DECLARATION(mean, 3, uvec2, readonly);
+BUFFER_DECLARATION(var, 4, uvec2, readonly);
+BUFFER_DECLARATION(beta, 5, uvec2, readonly);
+BUFFER_DECLARATION(gamma, 6, uvec2, readonly);
/** Apply batch normalization.
*
@@ -180,43 +180,86 @@ void main(void)
Vector beta = CONVERT_TO_VECTOR_STRUCT_FP16(beta);
Vector gamma = CONVERT_TO_VECTOR_STRUCT_FP16(gamma);
- vec2 input_value;
+ uvec2 packed_s[5];
+ vec4 unpacked_s[5];
float denominator;
float numerator;
- vec2 x_bar;
float gamma_param;
float beta_param;
+ vec4 x_bar;
+ vec4 result;
uint current_slice = gl_GlobalInvocationID.z;
- if((current_slice % uint(2)) == uint(0))
+ packed_s[0] = src_ptr[src.current_offset >> 3]; // one uvec2 element holds four packed FP16 values
+ packed_s[1] = var_ptr[(var.current_offset + current_slice * var.stride_x) >> 3];
+ packed_s[2] = mean_ptr[(mean.current_offset + current_slice * mean.stride_x) >> 3];
+ packed_s[3] = gamma_ptr[(gamma.current_offset + current_slice * gamma.stride_x) >> 3]; // gamma advances by its own stride, like var/mean/beta
+ packed_s[4] = beta_ptr[(beta.current_offset + current_slice * beta.stride_x) >> 3];
+ unpacked_s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
+ unpacked_s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
+ unpacked_s[2] = vec4(unpackHalf2x16(packed_s[2].x), unpackHalf2x16(packed_s[2].y));
+ unpacked_s[3] = vec4(unpackHalf2x16(packed_s[3].x), unpackHalf2x16(packed_s[3].y));
+ unpacked_s[4] = vec4(unpackHalf2x16(packed_s[4].x), unpackHalf2x16(packed_s[4].y));
+
+ if((current_slice % uint(4)) == uint(0))
+ {
+ denominator = unpacked_s[1].x;
+ denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
+
+ //Calculate x bar and store results
+ numerator = unpacked_s[2].x;
+ x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+
+ gamma_param = unpacked_s[3].x;
+ beta_param = unpacked_s[4].x;
+ result = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+
+ dst_ptr[dst.current_offset >> 3] = uvec2(packHalf2x16(result.xy), packHalf2x16(result.zw));
+ }
+ else if((current_slice % uint(4)) == uint(1))
{
- input_value = unpackHalf2x16(src_ptr[src.current_offset >> 2]);
- denominator = unpackHalf2x16(var_ptr[(var.current_offset + current_slice * var.stride_x) >> 2]).x;
+ denominator = unpacked_s[1].y;
denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
//Calculate x bar and store results
- numerator = unpackHalf2x16(mean_ptr[(mean.current_offset + current_slice * mean.stride_x) >> 2]).x;
- x_bar = MUL_OP(SUB_OP(input_value, numerator), denominator);
+ numerator = unpacked_s[2].y;
+ x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
- gamma_param = unpackHalf2x16(gamma_ptr[(gamma.current_offset + current_slice * beta.stride_x) >> 2]).x;
- beta_param = unpackHalf2x16(beta_ptr[(beta.current_offset + current_slice * beta.stride_x) >> 2]).x;
+ gamma_param = unpacked_s[3].y;
+ beta_param = unpacked_s[4].y;
+ result = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
- dst_ptr[dst.current_offset >> 2] = packHalf2x16(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
+ dst_ptr[dst.current_offset >> 3] = uvec2(packHalf2x16(result.xy), packHalf2x16(result.zw));
+ }
+ else if((current_slice % uint(4)) == uint(2))
+ {
+ denominator = unpacked_s[1].z;
+ denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
+
+ //Calculate x bar and store results
+ numerator = unpacked_s[2].z;
+ x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
+
+ gamma_param = unpacked_s[3].z;
+ beta_param = unpacked_s[4].z;
+ result = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
+
+ dst_ptr[dst.current_offset >> 3] = uvec2(packHalf2x16(result.xy), packHalf2x16(result.zw));
}
else
{
- input_value = unpackHalf2x16(src_ptr[src.current_offset >> 2]);
- denominator = unpackHalf2x16(var_ptr[(var.current_offset + current_slice * var.stride_x) >> 2]).y;
+ denominator = unpacked_s[1].w;
denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON))));
//Calculate x bar and store results
- numerator = unpackHalf2x16(mean_ptr[(mean.current_offset + current_slice * mean.stride_x) >> 2]).y;
- x_bar = MUL_OP(SUB_OP(input_value, numerator), denominator);
+ numerator = unpacked_s[2].w;
+ x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
- gamma_param = unpackHalf2x16(gamma_ptr[(gamma.current_offset + current_slice * beta.stride_x) >> 2]).y;
- beta_param = unpackHalf2x16(beta_ptr[(beta.current_offset + current_slice * beta.stride_x) >> 2]).y;
+ gamma_param = unpacked_s[3].w;
+ beta_param = unpacked_s[4].w;
+ result = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param);
- dst_ptr[dst.current_offset >> 2] = packHalf2x16(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param));
+ dst_ptr[dst.current_offset >> 3] = uvec2(packHalf2x16(result.xy), packHalf2x16(result.zw));
}
}
-#endif /*DATA_TYPE_FP32*/
+#endif /*DATA_TYPE_FP16*/
diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
index 982143f0b2..dee2a5579b 100644
--- a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
@@ -64,7 +64,11 @@ void GCBatchNormalizationLayerKernel::configure(const IGCTensor *input, IGCTenso
_gamma = gamma;
_epsilon = epsilon;
- const unsigned int num_elems_processed_per_iteration = 4 / input->info()->element_size();
+ unsigned int num_elems_processed_per_iteration = 1;
+ if(input->info()->data_type() == DataType::F16)
+ {
+ num_elems_processed_per_iteration = 4;
+ }
// Set build options
std::set<std::string> build_opts;
@@ -83,10 +87,10 @@ void GCBatchNormalizationLayerKernel::configure(const IGCTensor *input, IGCTenso
AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- AccessWindowStatic mean_access(mean->info(), 0, 0, mean->info()->dimension(0) + 1, mean->info()->dimension(1));
- AccessWindowStatic var_access(var->info(), 0, 0, var->info()->dimension(0) + 1, var->info()->dimension(1));
- AccessWindowStatic beta_access(beta->info(), 0, 0, beta->info()->dimension(0) + 1, beta->info()->dimension(1));
- AccessWindowStatic gamma_access(gamma->info(), 0, 0, gamma->info()->dimension(0) + 1, gamma->info()->dimension(1));
+ AccessWindowStatic mean_access(mean->info(), 0, 0, mean->info()->dimension(0) + 3, mean->info()->dimension(1));
+ AccessWindowStatic var_access(var->info(), 0, 0, var->info()->dimension(0) + 3, var->info()->dimension(1));
+ AccessWindowStatic beta_access(beta->info(), 0, 0, beta->info()->dimension(0) + 3, beta->info()->dimension(1));
+ AccessWindowStatic gamma_access(gamma->info(), 0, 0, gamma->info()->dimension(0) + 3, gamma->info()->dimension(1));
update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access, gamma_access);
output_access.set_valid_region(win, input->info()->valid_region());
diff --git a/tests/benchmark/GLES_COMPUTE/BatchNormalizationLayer.cpp b/tests/benchmark/GLES_COMPUTE/BatchNormalizationLayer.cpp
new file mode 100755
index 0000000000..4464ea2401
--- /dev/null
+++ b/tests/benchmark/GLES_COMPUTE/BatchNormalizationLayer.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"
+#include "tests/GLES_COMPUTE/GCAccessor.h"
+#include "tests/benchmark/fixtures/BatchNormalizationLayerFixture.h"
+#include "tests/datasets/system_tests/googlenet/inceptionv4/GoogLeNetInceptionV4BatchNormalizationLayerDataset.h"
+#include "tests/datasets/system_tests/yolo/v2/YOLOV2BatchNormalizationLayerDataset.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace
+{
+const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 });
+} // namespace
+
+using GCBatchNormalizationLayerFixture = BatchNormalizationLayerFixture<GCTensor, GCBatchNormalizationLayer, GCAccessor>;
+
+TEST_SUITE(GC)
+
+REGISTER_FIXTURE_DATA_TEST_CASE(YOLOV2BatchNormalizationLayer, GCBatchNormalizationLayerFixture, framework::DatasetMode::ALL,
+ framework::dataset::combine(framework::dataset::combine(datasets::YOLOV2BatchNormalizationLayerDataset(),
+ data_types),
+ framework::dataset::make("Batches", 1)));
+
+REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV4BatchNormalizationLayer, GCBatchNormalizationLayerFixture, framework::DatasetMode::ALL,
+ framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetInceptionV4BatchNormalizationLayerDataset(),
+ data_types),
+ framework::dataset::make("Batches", 1)));
+
+TEST_SUITE(NIGHTLY)
+REGISTER_FIXTURE_DATA_TEST_CASE(YOLOV2BatchNormalizationLayer, GCBatchNormalizationLayerFixture, framework::DatasetMode::NIGHTLY,
+ framework::dataset::combine(framework::dataset::combine(datasets::YOLOV2BatchNormalizationLayerDataset(),
+ data_types),
+ framework::dataset::make("Batches", { 4, 8 })));
+
+REGISTER_FIXTURE_DATA_TEST_CASE(GoogLeNetInceptionV4BatchNormalizationLayer, GCBatchNormalizationLayerFixture, framework::DatasetMode::NIGHTLY,
+ framework::dataset::combine(framework::dataset::combine(datasets::GoogLeNetInceptionV4BatchNormalizationLayerDataset(),
+ data_types),
+ framework::dataset::make("Batches", { 4, 8 })));
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/fixtures/BatchNormalizationLayerFixture.h b/tests/benchmark/fixtures/BatchNormalizationLayerFixture.h
index 79dbc76300..55411a44d1 100644
--- a/tests/benchmark/fixtures/BatchNormalizationLayerFixture.h
+++ b/tests/benchmark/fixtures/BatchNormalizationLayerFixture.h
@@ -29,6 +29,12 @@
#include "tests/Globals.h"
#include "tests/Utils.h"
#include "tests/framework/Fixture.h"
+#ifdef ARM_COMPUTE_GC
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "tests/GLES_COMPUTE/Helper.h"
+
+using namespace arm_compute::test::gles_compute;
+#endif /* ARM_COMPUTE_GC */
namespace arm_compute
{
@@ -76,6 +82,12 @@ public:
void run()
{
batch_norm_layer.run();
+#ifdef ARM_COMPUTE_GC
+ if(opengles31_is_available() && std::is_same<typename std::decay<TensorType>::type, arm_compute::GCTensor>::value)
+ {
+ force_sync_tensor(dst);
+ }
+#endif /* ARM_COMPUTE_GC */
}
void teardown()