-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs      | 158
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp |  22
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp  |   2
-rw-r--r--  tests/benchmark/CL/SoftmaxLayer.cpp                    |  60
-rw-r--r--  tests/benchmark/GLES_COMPUTE/SoftmaxLayer.cpp          |  60
-rw-r--r--  tests/benchmark/fixtures/ConvolutionLayerFixture.h     |   2
-rw-r--r--  tests/benchmark/fixtures/SoftmaxLayerFixture.h         |  98
-rw-r--r--  tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp         |   2
8 files changed, 320 insertions(+), 84 deletions(-)
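The patch widens the GLES_COMPUTE softmax kernels from 4 to 8 elements per loop iteration, adds tail handling for widths that are not a multiple of 8, syncs the scheduler between the three dependent kernels, and introduces benchmark cases plus a shared fixture for CL and GLES_COMPUTE. A minimal C++ sketch of the 8-wide main loop with a scalar tail, mirroring the pattern the FP32 shader follows (illustration only, not the shader source):

```cpp
#include <algorithm>
#include <cstddef>
#include <limits>

// Process 8 elements per iteration, then handle the remainder one by one,
// as the NON_MULTIPLE_OF_8 path in the patched shader does.
float row_max(const float *row, std::size_t width)
{
    float             max_val = std::numeric_limits<float>::lowest();
    const std::size_t width8  = width >> 3; // number of complete 8-element blocks
    for(std::size_t i = 0; i < width8; ++i)
    {
        for(std::size_t j = 0; j < 8; ++j)
        {
            max_val = std::max(max_val, row[(i << 3) + j]);
        }
    }
    // Scalar tail for widths that are not a multiple of 8
    for(std::size_t i = width8 << 3; i < width; ++i)
    {
        max_val = std::max(max_val, row[i]);
    }
    return max_val;
}
```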
diff --git a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
index 1a2c3f7b20..c9fabc5fcd 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
@@ -45,7 +45,7 @@ const vec4 vec4_min = vec4(float_min);
/** Identifies the maximum value across the 1st dimension.
*
* @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed.
+ * @note In case the input is not a multiple of 8, NON_MULTIPLE_OF_8 must be passed.
*
* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
* @param[in] src_attrs The attributes of the source tensor
@@ -74,21 +74,24 @@ void main(void)
vec4 max_val = vec4_min;
// Calculate max of row
- uint width2 = width >> 2;
- for(int i = 0; i < int(width2); i++)
+ uint width3 = width >> 3;
+ for(int i = 0; i < int(width3); i++)
{
- vec4 data = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
- max_val = MAX_OP(data, max_val);
+ vec4 data[2];
+ data[0] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
+ data[1] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, (i << 3) + 4, 0));
+ max_val = MAX_OP(data[0], max_val);
+ max_val = MAX_OP(data[1], max_val);
}
-#ifdef NON_MULTIPLE_OF_4
- // Handle non multiple of 4
- for(int i = int(width2 << 2); i < int(width); i++)
+#ifdef NON_MULTIPLE_OF_8
+ // Handle non multiple of 8
+ for(int i = int(width3 << 3); i < int(width); i++)
{
float data = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
max_val.x = MAX_OP(data, max_val.x);
}
-#endif /* NON_MULTIPLE_OF_4 */
+#endif /* NON_MULTIPLE_OF_8 */
// Perform max reduction
max_val.xy = MAX_OP(max_val.xy, max_val.zw);
@@ -111,25 +114,29 @@ void main(void)
vec4 max_val = vec4_min;
// Calculate max of row
- uint width2 = width >> 2;
- for(int i = 0; i < int(width2); i++)
+ uint width3 = width >> 3;
+ for(int i = 0; i < int(width3); i++)
{
- vec4 data = VLOAD2_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
- max_val = MAX_OP(data, max_val);
+ vec4 data[2];
+ data = VLOAD4_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
+ max_val = MAX_OP(data[0], max_val);
+ max_val = MAX_OP(data[1], max_val);
}
-#ifdef NON_MULTIPLE_OF_4
- // Handle non multiple of 4
- for(int i = int(width2 << 2); i < int(width); i = i + 2)
+#ifdef NON_MULTIPLE_OF_8
+ // Handle non multiple of 8
+ uint width1 = width >> 1 << 1;
+ for(int i = int(width3 << 3); i < int(width1); i = i + 2)
{
- vec2 data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+ vec2 data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+ max_val.xy = MAX_OP(data, max_val.xy);
+ }
+ if(width != width1)
+ {
+ vec2 data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, width1, 0));
max_val.x = MAX_OP(data.x, max_val.x);
- if((i + 1) < int(width))
- {
- max_val.x = MAX_OP(data.y, max_val.x);
- }
}
-#endif /* NON_MULTIPLE_OF_4 */
+#endif /* NON_MULTIPLE_OF_8 */
// Perform max reduction
max_val.xy = MAX_OP(max_val.xy, max_val.zw);
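For FP16, values are packed two per 32-bit word, so the tail first consumes complete pairs up to the largest even index (width1) and, when the width is odd, loads one last pair and uses only its .x lane. A small C++ sketch of that tail logic, assuming a plain float array stands in for the unpacked half values (row, width8 and max_val are hypothetical stand-ins for the shader's state):

```cpp
#include <algorithm>
#include <cstddef>

// Tail handling for the FP16 path: consume pairs up to the largest even
// index, then (for odd widths) take only the first lane of the final pair.
float row_max_fp16_tail(const float *row, std::size_t width, std::size_t width8, float max_val)
{
    const std::size_t width1 = (width >> 1) << 1; // width rounded down to an even value
    for(std::size_t i = width8 << 3; i < width1; i += 2)
    {
        max_val = std::max(max_val, std::max(row[i], row[i + 1]));
    }
    if(width != width1)
    {
        max_val = std::max(max_val, row[width1]); // only the .x lane is valid
    }
    return max_val;
}
```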
@@ -146,7 +153,7 @@ void main(void)
* then exponentiates each element and sums all elements across each row.
*
* @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed.
+ * @note In case the input is not a multiple of 8, NON_MULTIPLE_OF_8 must be passed.
*
* @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
* @param[in] src_attrs The attributes of the source tensor
@@ -187,19 +194,25 @@ void main(void)
vec4 sum1D = vec4(0);
// Shift values, exp and sum
- uint width2 = width >> 2;
- for(int i = 0; i < int(width2); i++)
+ uint width3 = width >> 3;
+ for(int i = 0; i < int(width3); i++)
{
- vec4 data = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
- data = SUB_OP(data, max_val);
- data = EXP_OP(data);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, i << 2, 0), data);
- sum1D = ADD_OP(sum1D, data);
+ vec4 data[2];
+ data[0] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
+ data[1] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, (i << 3) + 4, 0));
+ data[0] = SUB_OP(data[0], max_val);
+ data[1] = SUB_OP(data[1], max_val);
+ data[0] = EXP_OP(data[0]);
+ data[1] = EXP_OP(data[1]);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, i << 3, 0), data[0]);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, (i << 3) + 4, 0), data[1]);
+ sum1D = ADD_OP(sum1D, data[0]);
+ sum1D = ADD_OP(sum1D, data[1]);
}
-#ifdef NON_MULTIPLE_OF_4
- // Handle non multiple of 4
- for(int i = int(width2 << 2); i < int(width); i++)
+#ifdef NON_MULTIPLE_OF_8
+ // Handle non multiple of 8
+ for(int i = int(width3 << 3); i < int(width); i++)
{
float data = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
data = SUB_OP(data, max_val.x);
@@ -207,7 +220,7 @@ void main(void)
STORE(dst_ptr, IMAGE_OFFSET(dst_iter, i, 0), data);
sum1D.x = ADD_OP(sum1D.x, data);
}
-#endif /* NON_MULTIPLE_OF_4 */
+#endif /* NON_MULTIPLE_OF_8 */
// Perform sum reduction
sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
@@ -238,44 +251,40 @@ void main(void)
vec4 sum1D = vec4(0.f);
// Shift values, exp and sum
- uint width2 = width >> 2;
- for(int i = 0; i < int(width2); i++)
+ uint width3 = width >> 3;
+ for(int i = 0; i < int(width3); i++)
{
- vec4 data = VLOAD2_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
- data = SUB_OP(data, max_val);
- data = EXP_OP(data);
- VSTORE2_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i << 2, 0), data);
- sum1D = ADD_OP(sum1D, data);
+ vec4 data[2];
+ data = VLOAD4_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
+ data[0] = SUB_OP(data[0], max_val);
+ data[1] = SUB_OP(data[1], max_val);
+ data[0] = EXP_OP(data[0]);
+ data[1] = EXP_OP(data[1]);
+ VSTORE4_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i << 3, 0), data);
+ sum1D = ADD_OP(sum1D, data[0]);
+ sum1D = ADD_OP(sum1D, data[1]);
}
-#ifdef NON_MULTIPLE_OF_4
- // Handle non multiple of 4
- for(int i = int(width2 << 2); i < int(width); i = i + 2)
+#ifdef NON_MULTIPLE_OF_8
+ // Handle non multiple of 8
+ uint width1 = width >> 1 << 1;
+ for(int i = int(width3 << 3); i < int(width1); i = i + 2)
{
- float data;
- vec2 datamiddle = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
- data = SUB_OP(datamiddle.x, max_val.x);
- data = EXP_OP(data);
- vec2 datares;
- if((i + 1) < int(width))
- {
- float data2;
- data2 = SUB_OP(datamiddle.y, max_val.x);
- data2 = EXP_OP(data2);
- datares = vec2(data, data2);
- data = ADD_OP(data2, data);
- }
- else
- {
- datares = vec2(data, 0.f);
- }
-
- STORE_PACK2_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i, 0), datares);
-
+ vec2 data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+ data = SUB_OP(data, max_val.xy);
+ data = EXP_OP(data);
+ STORE_PACK2_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i, 0), data);
+ sum1D.xy = ADD_OP(sum1D.xy, data);
+ }
+ if(width != width1)
+ {
+ float data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, width1, 0)).x;
+ data = SUB_OP(data, max_val.x);
+ data = EXP_OP(data);
+ STORE_PACK2_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, width1, 0), vec2(data, 0.0));
sum1D.x = ADD_OP(sum1D.x, data);
}
-#endif /* NON_MULTIPLE_OF_4 */
-
+#endif /* NON_MULTIPLE_OF_8 */
// Perform sum reduction
sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
sum1D.x = ADD_OP(sum1D.x, sum1D.y);
@@ -317,8 +326,12 @@ void main(void)
// Load max value of 1D logits vector (row)
vec4 sum_val = vec4(LOAD(sum_ptr, IMAGE_OFFSET(sum_iter, 0, gl_GlobalInvocationID.y)));
- vec4 data = VLOAD4_CURRENT_ITEM(vec4, src_ptr, src_iter);
- VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, DIV_OP(data, sum_val));
+
+ vec4 data[2];
+ data[0] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
+ data[1] = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, 4, 0));
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), DIV_OP(data[0], sum_val));
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 4, 0), DIV_OP(data[1], sum_val));
}
#elif defined(DATA_TYPE_FP16)
TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
@@ -332,8 +345,13 @@ void main(void)
// Load max value of 1D logits vector (row)
vec4 sum_val = vec4(LOAD_UNPACK2_HALF(sum_ptr, IMAGE_OFFSET(sum_iter, 0, gl_GlobalInvocationID.y)).x);
- vec4 data = VLOAD2_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
- VSTORE2_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, DIV_OP(data, sum_val));
+
+ vec4 data[2];
+ data = VLOAD4_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
+ vec4 ret[2];
+ ret[0] = DIV_OP(data[0], sum_val);
+ ret[1] = DIV_OP(data[1], sum_val);
+ VSTORE4_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), ret);
}
#else // DATA_TYPE_FP32
#error Data type not supported
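Taken together, the three kernels implement the standard numerically stable softmax: subtract the row maximum, exponentiate, accumulate the row sum, then divide. A compact scalar C++ reference of the same computation, for comparison only:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Reference softmax over one row: shift by the max, exponentiate, normalize.
std::vector<float> softmax_row(const std::vector<float> &row)
{
    const float        max_val = *std::max_element(row.begin(), row.end());
    std::vector<float> out(row.size());
    float              sum = 0.f;
    for(std::size_t i = 0; i < row.size(); ++i)
    {
        out[i] = std::exp(row[i] - max_val); // shift for numerical stability
        sum += out[i];
    }
    for(float &v : out)
    {
        v /= sum;
    }
    return out;
}
```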
diff --git a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
index 29a1385f87..040a66358f 100644
--- a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
@@ -66,10 +66,10 @@ void GCLogits1DMaxKernel::configure(const IGCTensor *input, IGCTensor *output)
build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
build_opts.insert("#define SOFTMAX_LAYER_MAX");
- // Tell the kernel that the width is not a multiple of 4
- if((input->info()->dimension(0) % 4) != 0)
+ // Tell the kernel that the width is not a multiple of 8
+ if((input->info()->dimension(0) % 8) != 0)
{
- build_opts.insert("#define NON_MULTIPLE_OF_4");
+ build_opts.insert("#define NON_MULTIPLE_OF_8");
}
// Create kernel
@@ -80,8 +80,8 @@ void GCLogits1DMaxKernel::configure(const IGCTensor *input, IGCTensor *output)
_kernel.set_argument(idx++, input->info()->dimension(0));
// Configure kernel window
- // The kernel loops over all elements in steps of 4
- const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 4);
+ // The kernel loops over all elements in steps of 8
+ const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 8);
unsigned int num_elems_written_per_iteration = 1;
if(input->info()->data_type() == DataType::F16)
{
@@ -131,10 +131,10 @@ void GCLogits1DShiftExpSumKernel::configure(const IGCTensor *input, const IGCTen
build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
build_opts.insert("#define SOFTMAX_LAYER_SHIFT_EXP_SUM");
- // Tell the kernel that the width is not a multiple of 4
- if((input->info()->dimension(0) % 4) != 0)
+ // Tell the kernel that the width is not a multiple of 8
+ if((input->info()->dimension(0) % 8) != 0)
{
- build_opts.insert("#define NON_MULTIPLE_OF_4");
+ build_opts.insert("#define NON_MULTIPLE_OF_8");
}
// Create kernel
@@ -145,8 +145,8 @@ void GCLogits1DShiftExpSumKernel::configure(const IGCTensor *input, const IGCTen
_kernel.set_argument(idx++, input->info()->dimension(0));
// Configure window
- // The kernel loops over all elements in steps of 4
- const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 4);
+ // The kernel loops over all elements in steps of 8
+ const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 8);
unsigned int num_elems_written_per_iteration = 1;
if(input->info()->data_type() == DataType::F16)
{
@@ -227,7 +227,7 @@ void GCLogits1DNormKernel::configure(const IGCTensor *input, const IGCTensor *su
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts));
// Configure window
- constexpr unsigned int num_elems_processed_per_iteration = 4;
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
unsigned int num_elems_written_per_iteration = 1;
if(input->info()->data_type() == DataType::F16)
{
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
index d7d47d2802..1db927c8ff 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
@@ -61,6 +61,8 @@ void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output)
void GCSoftmaxLayer::run()
{
GCScheduler::get().enqueue(_max_kernel, false);
+ GCScheduler::get().sync();
GCScheduler::get().enqueue(_shift_exp_sum_kernel, false);
+ GCScheduler::get().sync();
GCScheduler::get().enqueue(_norm_kernel);
}
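The added GCScheduler::get().sync() calls make each kernel's output visible before the next dependent dispatch reads it (the max feeds shift/exp/sum, which feeds norm). A minimal usage sketch of the function after this change, reusing only the types and signatures visible elsewhere in this patch:

```cpp
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"

// Hypothetical caller: configure once, then run. The syncs added above order
// the three dispatches: max -> sync -> shift/exp/sum -> sync -> norm.
void softmax_example(arm_compute::GCTensor &src, arm_compute::GCTensor &dst)
{
    arm_compute::GCSoftmaxLayer softmax;
    softmax.configure(&src, &dst); // signature shown in the hunk header above
    softmax.run();
}
```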
diff --git a/tests/benchmark/CL/SoftmaxLayer.cpp b/tests/benchmark/CL/SoftmaxLayer.cpp
new file mode 100644
index 0000000000..6f0918fd95
--- /dev/null
+++ b/tests/benchmark/CL/SoftmaxLayer.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTensorAllocator.h"
+#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/benchmark/fixtures/SoftmaxLayerFixture.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace
+{
+const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 });
+} // namespace
+
+using CLSoftmaxLayerFixture = SoftmaxLayerFixture<CLTensor, CLSoftmaxLayer, CLAccessor>;
+
+TEST_SUITE(CL)
+
+REGISTER_FIXTURE_DATA_TEST_CASE(SoftmaxLayer, CLSoftmaxLayerFixture, framework::DatasetMode::ALL,
+ framework::dataset::combine(datasets::SoftmaxLayerSmallShapes(), data_types));
+
+TEST_SUITE(NIGHTLY)
+
+REGISTER_FIXTURE_DATA_TEST_CASE(SoftmaxLayer, CLSoftmaxLayerFixture, framework::DatasetMode::NIGHTLY,
+ framework::dataset::combine(datasets::SoftmaxLayerLargeShapes(), data_types));
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace test
+} // namespace arm_compute
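The fixture is backend-agnostic, so other backends can instantiate it in the same way. A hypothetical NEON instantiation, assuming the usual Compute Library layout (NESoftmaxLayer, Tensor and Accessor are not part of this patch):

```cpp
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/Tensor.h"
#include "tests/NEON/Accessor.h"
#include "tests/benchmark/fixtures/SoftmaxLayerFixture.h"

namespace arm_compute
{
namespace test
{
// Same fixture, NEON backend: plain Tensor, NESoftmaxLayer and the NEON Accessor.
using NESoftmaxLayerFixture = SoftmaxLayerFixture<Tensor, NESoftmaxLayer, Accessor>;
} // namespace test
} // namespace arm_compute
```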
diff --git a/tests/benchmark/GLES_COMPUTE/SoftmaxLayer.cpp b/tests/benchmark/GLES_COMPUTE/SoftmaxLayer.cpp
new file mode 100644
index 0000000000..66123aa57f
--- /dev/null
+++ b/tests/benchmark/GLES_COMPUTE/SoftmaxLayer.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
+#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
+#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"
+#include "tests/GLES_COMPUTE/GCAccessor.h"
+#include "tests/benchmark/fixtures/SoftmaxLayerFixture.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace
+{
+const auto data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32 });
+} // namespace
+
+using GCSoftmaxLayerFixture = SoftmaxLayerFixture<GCTensor, GCSoftmaxLayer, GCAccessor>;
+
+TEST_SUITE(GC)
+
+REGISTER_FIXTURE_DATA_TEST_CASE(SoftmaxLayer, GCSoftmaxLayerFixture, framework::DatasetMode::ALL,
+ framework::dataset::combine(datasets::SoftmaxLayerSmallShapes(), data_types));
+
+TEST_SUITE(NIGHTLY)
+
+REGISTER_FIXTURE_DATA_TEST_CASE(SoftmaxLayer, GCSoftmaxLayerFixture, framework::DatasetMode::NIGHTLY,
+ framework::dataset::combine(datasets::SoftmaxLayerLargeShapes(), data_types));
+
+TEST_SUITE_END()
+TEST_SUITE_END()
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/fixtures/ConvolutionLayerFixture.h b/tests/benchmark/fixtures/ConvolutionLayerFixture.h
index 09e6cbfaf8..b526cc3870 100644
--- a/tests/benchmark/fixtures/ConvolutionLayerFixture.h
+++ b/tests/benchmark/fixtures/ConvolutionLayerFixture.h
@@ -81,8 +81,6 @@ public:
#ifdef ARM_COMPUTE_GC
if(opengles31_is_available() && std::is_same<typename std::decay<TensorType>::type, arm_compute::GCTensor>::value)
{
- GCScheduler::get().sync();
- force_sync_tensor(src);
force_sync_tensor(dst);
}
#endif /* ARM_COMPUTE_GC */
diff --git a/tests/benchmark/fixtures/SoftmaxLayerFixture.h b/tests/benchmark/fixtures/SoftmaxLayerFixture.h
new file mode 100644
index 0000000000..6e0472ce3a
--- /dev/null
+++ b/tests/benchmark/fixtures/SoftmaxLayerFixture.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_SOFTMAXLAYERFIXTURE
+#define ARM_COMPUTE_TEST_SOFTMAXLAYERFIXTURE
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "tests/Globals.h"
+#include "tests/Utils.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+
+#ifdef ARM_COMPUTE_GC
+#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
+#include "tests/GLES_COMPUTE/Helper.h"
+
+using namespace arm_compute::test::gles_compute;
+#endif /* ARM_COMPUTE_GC */
+
+namespace arm_compute
+{
+namespace test
+{
+/** Fixture that can be used for NEON, CL and OpenGL ES */
+template <typename TensorType, typename Function, typename Accessor>
+class SoftmaxLayerFixture : public framework::Fixture
+{
+public:
+ template <typename...>
+ void setup(TensorShape shape, DataType data_type)
+ {
+ // Set batched in source and destination shapes
+ const int fixed_point_position = is_data_type_fixed_point(data_type) ? 3 : 0;
+
+ // Create tensors
+ src = create_tensor<TensorType>(shape, data_type, 1, fixed_point_position);
+ dst = create_tensor<TensorType>(shape, data_type, 1, fixed_point_position);
+
+ ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+ ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+ // Create and configure function
+ smx_layer.configure(&src, &dst);
+
+ // Allocate tensors
+ src.allocator()->allocate();
+ dst.allocator()->allocate();
+
+ // Fill tensors
+ library->fill_tensor_uniform(Accessor(src), 0);
+ }
+
+ void run()
+ {
+ smx_layer.run();
+#ifdef ARM_COMPUTE_GC
+ if(opengles31_is_available() && std::is_same<typename std::decay<TensorType>::type, arm_compute::GCTensor>::value)
+ {
+ force_sync_tensor(dst);
+ }
+#endif /* ARM_COMPUTE_GC */
+ }
+
+ void teardown()
+ {
+ src.allocator()->free();
+ dst.allocator()->free();
+ }
+
+private:
+ TensorType src{};
+ TensorType dst{};
+ Function smx_layer{};
+};
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_SOFTMAXLAYERFIXTURE */
diff --git a/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp b/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp
index 888f87e9ef..a2114a9c37 100644
--- a/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/SoftmaxLayer.cpp
@@ -79,7 +79,7 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(concat(datase
validate(dst.info()->valid_region(), valid_region);
// Validate padding
- const PaddingSize padding = PaddingCalculator(shape.x(), 4).required_padding();
+ const PaddingSize padding = PaddingCalculator(shape.x(), 8).required_padding();
validate(src.info()->padding(), padding);
validate(dst.info()->padding(), padding);
}
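The expected padding now rounds the row width up to the next multiple of 8, matching the 8 elements processed per iteration. A short sketch of that arithmetic (the same idea as ceil_to_multiple; not the PaddingCalculator implementation):

```cpp
#include <cstdio>

// Padding needed so the row width becomes the next multiple of the processing step.
unsigned int required_padding(unsigned int width, unsigned int step = 8)
{
    const unsigned int rounded = ((width + step - 1) / step) * step; // ceil_to_multiple(width, step)
    return rounded - width;
}

int main()
{
    std::printf("width 13 -> padding %u\n", required_padding(13)); // prints 3
    return 0;
}
```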