author     Manuel Bottini <manuel.bottini@arm.com>  2021-02-16 15:15:19 +0000
committer  Georgios Pinitas <georgios.pinitas@arm.com>  2021-02-23 18:21:55 +0000
commit     ceaa0bfe219631b5a4e638613f90f9fa47a3defe (patch)
tree       3bb878645ae7509f7807197d320a02882ad84751 /src
parent     c40562d4467e3a68b0dac5e865570c8f38d1487e (diff)
download   ComputeLibrary-ceaa0bfe219631b5a4e638613f90f9fa47a3defe.tar.gz
Remove OpenGL ES support
Remove the following:
- Relevant backend kernels
- Relevant backend functions
- Relevant backend validation tests
- Relevant backend specific examples
- Remove backend support from Graph API
- Remove backend support from build system

Update documentation

Resolves: COMPMID-4149
Change-Id: Id0621d6ee35169754de458103907aaba4ef770c0
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5097
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Diffstat (limited to 'src')
-rw-r--r--  src/core/GLES_COMPUTE/GCCoreRuntimeContext.cpp | 42
-rw-r--r--  src/core/GLES_COMPUTE/GCHelpers.cpp | 50
-rw-r--r--  src/core/GLES_COMPUTE/GCKernelLibrary.cpp | 559
-rw-r--r--  src/core/GLES_COMPUTE/IGCKernel.cpp | 150
-rw-r--r--  src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp | 51
-rw-r--r--  src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp | 52
-rw-r--r--  src/core/GLES_COMPUTE/IGCSimpleKernel.cpp | 55
-rw-r--r--  src/core/GLES_COMPUTE/IGCTensor.cpp | 64
-rw-r--r--  src/core/GLES_COMPUTE/OpenGLES.cpp | 826
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/absdiff.cs | 60
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs | 150
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h | 130
-rwxr-xr-x  src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs | 65
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs | 266
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/concatenate.cs | 72
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs | 791
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs | 316
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs | 1057
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs | 1155
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs | 225
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/dropout.cs | 148
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/fill_border.cs | 498
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/gemm.cs | 1130
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h | 498
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs | 106
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/normalize_planar_yuv_layer.cs | 99
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs | 58
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs | 1052
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/scale.cs | 136
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs | 363
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs | 134
-rwxr-xr-x  src/core/GLES_COMPUTE/cs_shaders/transpose.cs | 234
-rw-r--r--  src/core/GLES_COMPUTE/egl_entries.in | 35
-rw-r--r--  src/core/GLES_COMPUTE/gl_entries.in | 64
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp | 108
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp | 136
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp | 161
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp | 247
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp | 126
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp | 114
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp | 252
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp | 450
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp | 109
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp | 168
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp | 127
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp | 131
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp | 103
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp | 338
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp | 127
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp | 304
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp | 123
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp | 169
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp | 125
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp | 374
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp | 173
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp | 278
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp | 107
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp | 145
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp | 143
-rw-r--r--  src/graph/backends/GLES/GCDeviceBackend.cpp | 163
-rw-r--r--  src/graph/backends/GLES/GCFunctionsFactory.cpp | 275
-rw-r--r--  src/graph/backends/GLES/GCNodeValidator.cpp | 147
-rw-r--r--  src/graph/backends/GLES/GCTensorHandle.cpp | 103
-rw-r--r--  src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp | 59
-rw-r--r--  src/runtime/GLES_COMPUTE/GCHelpers.cpp | 85
-rw-r--r--  src/runtime/GLES_COMPUTE/GCMemory.cpp | 81
-rw-r--r--  src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp | 96
-rw-r--r--  src/runtime/GLES_COMPUTE/GCRuntimeContext.cpp | 67
-rw-r--r--  src/runtime/GLES_COMPUTE/GCScheduler.cpp | 142
-rw-r--r--  src/runtime/GLES_COMPUTE/GCTensor.cpp | 83
-rw-r--r--  src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp | 104
-rw-r--r--  src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp | 47
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp | 39
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp | 44
-rwxr-xr-x  src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp | 44
-rwxr-xr-x  src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp | 49
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp | 81
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp | 240
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp | 73
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp | 84
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp | 50
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp | 39
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp | 193
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp | 211
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp | 35
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp | 37
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp | 66
-rwxr-xr-x  src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp | 54
-rwxr-xr-x  src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp | 38
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp | 64
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCScale.cpp | 40
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp | 81
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp | 39
-rw-r--r--  src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp | 37
94 files changed, 0 insertions, 18419 deletions
diff --git a/src/core/GLES_COMPUTE/GCCoreRuntimeContext.cpp b/src/core/GLES_COMPUTE/GCCoreRuntimeContext.cpp
deleted file mode 100644
index a374c59989..0000000000
--- a/src/core/GLES_COMPUTE/GCCoreRuntimeContext.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/GCCoreRuntimeContext.h"
-
-namespace arm_compute
-{
-GCCoreRuntimeContext::GCCoreRuntimeContext()
- : _kernel_lib(nullptr)
-{
-}
-
-GCCoreRuntimeContext::GCCoreRuntimeContext(GCKernelLibrary *kernel_lib)
- : _kernel_lib(kernel_lib)
-{
-}
-
-GCKernelLibrary *GCCoreRuntimeContext::kernel_library() const
-{
- return _kernel_lib;
-}
-} // namespace arm_compute
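
Note: GCCoreRuntimeContext was a thin, non-owning holder that paired the core kernels with a kernel library. A minimal sketch of that pattern, with KernelLibrary as a hypothetical stand-in for GCKernelLibrary:

    class KernelLibrary; // hypothetical stand-in for GCKernelLibrary

    class CoreRuntimeContext
    {
    public:
        CoreRuntimeContext() = default;
        explicit CoreRuntimeContext(KernelLibrary *kernel_lib) : _kernel_lib(kernel_lib) {}
        KernelLibrary *kernel_library() const { return _kernel_lib; } // non-owning accessor

    private:
        KernelLibrary *_kernel_lib{ nullptr }; // the context never owns the library
    };
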
diff --git a/src/core/GLES_COMPUTE/GCHelpers.cpp b/src/core/GLES_COMPUTE/GCHelpers.cpp
deleted file mode 100644
index 0c9ed8218c..0000000000
--- a/src/core/GLES_COMPUTE/GCHelpers.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-
-#include "arm_compute/core/GLES_COMPUTE/GCCoreRuntimeContext.h"
-
-namespace arm_compute
-{
-GPUTarget get_target_from_device()
-{
- const std::string device_name = reinterpret_cast<const char *>(glGetString(GL_RENDERER));
-
- return get_target_from_name(device_name);
-}
-
-GCKernel create_opengl_kernel(GCCoreRuntimeContext *ctx, const std::string &kernel_name, const std::set<std::string> &build_opts)
-{
- if(ctx && ctx->kernel_library())
- {
- // New api going through the core context
- return ctx->kernel_library()->create_kernel(kernel_name, build_opts);
- }
- else
- {
- // Legacy code through the singleton
- return GCKernelLibrary::get().create_kernel(kernel_name, build_opts);
- }
-}
-} // namespace arm_compute
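
Note: create_opengl_kernel above routed kernel creation through the caller's context when one was supplied and fell back to the process-wide singleton otherwise. A self-contained sketch of that fallback, with Kernel, Library and Context as hypothetical stand-ins for GCKernel, GCKernelLibrary and GCCoreRuntimeContext:

    #include <set>
    #include <string>

    struct Kernel {};
    struct Library
    {
        static Library &get() { static Library lib; return lib; } // legacy process-wide singleton
        Kernel create_kernel(const std::string &, const std::set<std::string> &) { return {}; }
    };
    struct Context
    {
        Library *kernel_library() const { return _lib; }
        Library *_lib{ nullptr };
    };

    Kernel create_kernel(Context *ctx, const std::string &name, const std::set<std::string> &opts)
    {
        // Prefer the library owned by the caller's context (the new API); fall back
        // to the singleton for legacy callers, mirroring the branch structure above.
        Library &lib = (ctx && ctx->kernel_library()) ? *ctx->kernel_library() : Library::get();
        return lib.create_kernel(name, opts);
    }
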
diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
deleted file mode 100644
index ffcc8c8088..0000000000
--- a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
+++ /dev/null
@@ -1,559 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Utils.h"
-
-#include <fstream>
-#include <iomanip>
-#include <regex>
-#include <utility>
-#include <vector>
-
-using namespace arm_compute;
-
-GCProgram::GCProgram()
- : _name(), _source()
-{
-}
-
-GCProgram::GCProgram(std::string name, std::string source)
- : _name(std::move(name)), _source(std::move(source))
-{
-}
-
-GLuint GCProgram::link_program(GLuint shader)
-{
- GLuint program = ARM_COMPUTE_GL_CHECK(glCreateProgram());
-
- GLint rvalue;
- GLsizei length;
-
- ARM_COMPUTE_GL_CHECK(glAttachShader(program, shader));
- ARM_COMPUTE_GL_CHECK(glLinkProgram(program));
- ARM_COMPUTE_GL_CHECK(glDetachShader(program, shader));
- ARM_COMPUTE_GL_CHECK(glDeleteShader(shader));
-
- // Check if there were some issues when linking the shader.
- ARM_COMPUTE_GL_CHECK(glGetProgramiv(program, GL_LINK_STATUS, &rvalue));
-
- if(rvalue == 0)
- {
- ARM_COMPUTE_GL_CHECK(glGetProgramiv(program, GL_INFO_LOG_LENGTH, &length));
-
- std::vector<GLchar> log(length);
- ARM_COMPUTE_GL_CHECK(glGetProgramInfoLog(program, length, nullptr, log.data()));
- ARM_COMPUTE_ERROR_VAR("Error: Linker log:\n%s\n", log.data());
-
- return 0;
- }
-
- ARM_COMPUTE_GL_CHECK(glUseProgram(program));
-
- return program;
-}
-
-GLuint GCProgram::compile_shader(const std::string &build_options)
-{
- GLuint shader = ARM_COMPUTE_GL_CHECK(glCreateShader(GL_COMPUTE_SHADER));
-
- const char *src[]
- {
- "#version 310 es\n",
- build_options.c_str(),
- _source.c_str()
- };
-
- ARM_COMPUTE_GL_CHECK(glShaderSource(shader, sizeof(src) / sizeof(src[0]), src, nullptr));
-
- ARM_COMPUTE_GL_CHECK(glCompileShader(shader));
-
- // Check if there were any issues when compiling the shader
- GLint rvalue;
- GLsizei length;
-
- ARM_COMPUTE_GL_CHECK(glGetShaderiv(shader, GL_COMPILE_STATUS, &rvalue));
-
- if(rvalue == 0)
- {
- ARM_COMPUTE_GL_CHECK(glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &length));
-
- std::vector<GLchar> log(length);
- ARM_COMPUTE_GL_CHECK(glGetShaderInfoLog(shader, length, nullptr, log.data()));
-
-#ifdef ARM_COMPUTE_DEBUG_ENABLED
- std::istringstream ss(_source);
- std::stringstream output_stream;
- std::string line;
- size_t line_num = 1;
-
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("GLES Shader build options:\n%s\n", build_options.c_str());
- while(std::getline(ss, line, '\n'))
- {
- output_stream << std::setw(6) << line_num << ": " << line << std::endl;
- line_num++;
- }
- ARM_COMPUTE_LOG_INFO_STREAM_CORE("GLES Shader source code:\n"
- << output_stream.rdbuf());
-#endif /* ARM_COMPUTE_DEBUG_ENABLED */
-
- ARM_COMPUTE_ERROR_VAR("Error: Compiler log:\n%s\n", log.data());
-
- return 0;
- }
-
- return shader;
-}
-
-GCKernel::GCKernel()
- : _name(), _program(), _shader_arguments(), _shader_params_ubo_name(), _shader_params_binding_point(), _shader_params_index(), _shader_params_size()
-{
-}
-
-// Add a default destructor in cpp file to workaround the free unallocated value issue on Android
-GCKernel::~GCKernel() // NOLINT
-{
-}
-
-GCKernel::GCKernel(std::string name, GLuint program)
- : _name(std::move(name)),
- _program(program),
- _shader_arguments(),
- _shader_params_ubo_name(0),
- _shader_params_binding_point(0),
- _shader_params_index(0),
- _shader_params_size(0)
-{
- _shader_arguments.clear();
-
- ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_shader_params_ubo_name));
-
- _shader_params_index = ARM_COMPUTE_GL_CHECK(glGetUniformBlockIndex(_program, _shader_params_name));
- ARM_COMPUTE_ERROR_ON_MSG_VAR(_shader_params_index == GL_INVALID_INDEX, "Failed to get index of %s", _shader_params_name);
- ARM_COMPUTE_GL_CHECK(glGetActiveUniformBlockiv(_program, _shader_params_index, GL_UNIFORM_BLOCK_DATA_SIZE, &_shader_params_size));
- ARM_COMPUTE_ERROR_ON_MSG_VAR(_shader_params_size == 0, "Failed to get size of %s", _shader_params_name);
-}
-
-void GCKernel::cleanup()
-{
- ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, &_shader_params_ubo_name));
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, 0));
- ARM_COMPUTE_GL_CHECK(glDeleteProgram(_program));
- ARM_COMPUTE_GL_CHECK(glUseProgram(0));
-}
-
-void GCKernel::use()
-{
- ARM_COMPUTE_GL_CHECK(glUseProgram(_program));
-}
-
-void GCKernel::unuse()
-{
- ARM_COMPUTE_GL_CHECK(glUseProgram(0));
-}
-
-void GCKernel::update_shader_params()
-{
- ARM_COMPUTE_ERROR_ON_MSG_VAR((_shader_params_size != (int)(_shader_arguments.size() * sizeof(_shader_arguments[0]))), "Arguments size (%zu) is not equal to shader params block size (%d)",
- _shader_arguments.size() * sizeof(_shader_arguments[0]), _shader_params_size);
-
- ARM_COMPUTE_GL_CHECK(glUniformBlockBinding(_program, _shader_params_index, _shader_params_binding_point));
- ARM_COMPUTE_GL_CHECK(glBindBufferBase(GL_UNIFORM_BUFFER, _shader_params_binding_point, _shader_params_ubo_name));
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, _shader_params_ubo_name));
- ARM_COMPUTE_GL_CHECK(glBufferData(GL_UNIFORM_BUFFER, _shader_params_size, _shader_arguments.data(), GL_DYNAMIC_DRAW));
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, 0));
-}
-
-const std::map<std::string, std::string> GCKernelLibrary::_shader_program_map =
-{
- { "absdiff", "absdiff.cs" },
- { "tensorshift", "tensor_shift.cs" },
- { "direct_convolution1x1", "direct_convolution1x1.cs" },
- { "direct_convolution3x3", "direct_convolution3x3.cs" },
- { "direct_convolution5x5", "direct_convolution5x5.cs" },
- { "pooling_layer_2", "pooling_layer.cs" },
- { "pooling_layer_3", "pooling_layer.cs" },
- { "pooling_layer_7", "pooling_layer.cs" },
- { "pooling_layer_3_optimized", "pooling_layer.cs" },
- { "pooling_layer_n", "pooling_layer.cs" },
- { "fill_image_borders_replicate", "fill_border.cs" },
- { "fill_image_borders_constant", "fill_border.cs" },
- { "gemm_accumulate_biases", "gemm.cs" },
- { "gemm_interleave4x4", "gemm.cs" },
- { "gemm_ma", "gemm.cs" },
- { "gemm_mm_interleaved_transposed", "gemm.cs" },
- { "gemm_mm_floating_point", "gemm.cs" },
- { "gemm_transpose1x4", "gemm.cs" },
- { "reshape_to_columns", "convolution_layer.cs" },
- { "im2col_kernel3x3_padx0_pady0", "convolution_layer.cs" },
- { "im2col_generic", "convolution_layer.cs" },
- { "im2col_reduced", "convolution_layer.cs" },
- { "col2im", "convolution_layer.cs" },
- { "transpose", "transpose.cs" },
- { "activation_layer", "activation_layer.cs" },
- { "softmax_layer_max", "softmax_layer.cs" },
- { "softmax_layer_shift_exp_sum", "softmax_layer.cs" },
- { "softmax_layer_norm", "softmax_layer.cs" },
- { "pixelwise_mul_float", "pixelwise_mul_float.cs" },
- { "normalization_layer", "normalization_layer.cs" },
- { "batchnormalization_layer", "batchnormalization_layer.cs" },
- { "concatenate_depth", "concatenate.cs" },
- { "dropout", "dropout.cs" },
- { "normalize_planar_yuv_layer", "normalize_planar_yuv_layer.cs" },
- { "scale_nearest_neighbour", "scale.cs" },
- { "arithmetic_add", "arithmetic_add.cs" },
- { "depthwise_convolution_3x3", "depthwise_convolution3x3.cs" },
-};
-
-const std::map<std::string, std::string> GCKernelLibrary::_program_source_map =
-{
-#ifdef EMBEDDED_KERNELS
- {
- "helpers_cs.h",
-#include "./cs_shaders/helpers_cs.hembed"
- },
- {
- "activation_layer_helpers_cs.h",
-#include "./cs_shaders/activation_layer_helpers_cs.hembed"
- },
- {
- "absdiff.cs",
-#include "./cs_shaders/absdiff.csembed"
- },
- {
- "tensor_shift.cs",
-#include "./cs_shaders/tensor_shift.csembed"
- },
- {
- "convolution_layer.cs",
-#include "./cs_shaders/convolution_layer.csembed"
- },
- {
- "direct_convolution1x1.cs",
-#include "./cs_shaders/direct_convolution1x1.csembed"
- },
- {
- "direct_convolution3x3.cs",
-#include "./cs_shaders/direct_convolution3x3.csembed"
- },
- {
- "direct_convolution5x5.cs",
-#include "./cs_shaders/direct_convolution5x5.csembed"
- },
- {
- "pooling_layer.cs",
-#include "./cs_shaders/pooling_layer.csembed"
- },
- {
- "fill_border.cs",
-#include "./cs_shaders/fill_border.csembed"
- },
- {
- "gemm.cs",
-#include "./cs_shaders/gemm.csembed"
- },
- {
- "transpose.cs",
-#include "./cs_shaders/transpose.csembed"
- },
- {
- "activation_layer.cs",
-#include "./cs_shaders/activation_layer.csembed"
- },
- {
- "softmax_layer.cs",
-#include "./cs_shaders/softmax_layer.csembed"
- },
- {
- "pixelwise_mul_float.cs",
-#include "./cs_shaders/pixelwise_mul_float.csembed"
- },
- {
- "normalization_layer.cs",
-#include "./cs_shaders/normalization_layer.csembed"
- },
- {
- "batchnormalization_layer.cs",
-#include "./cs_shaders/batchnormalization_layer.csembed"
- },
- {
- "concatenate.cs",
-#include "./cs_shaders/concatenate.csembed"
- },
- {
- "dropout.cs",
-#include "./cs_shaders/dropout.csembed"
- },
- {
- "normalize_planar_yuv_layer.cs",
-#include "./cs_shaders/normalize_planar_yuv_layer.csembed"
- },
- {
- "scale.cs",
-#include "./cs_shaders/scale.csembed"
- },
- {
- "arithmetic_add.cs",
-#include "./cs_shaders/arithmetic_add.csembed"
- },
- {
- "depthwise_convolution3x3.cs",
-#include "./cs_shaders/depthwise_convolution3x3.csembed"
- },
-#endif /* EMBEDDED_KERNELS */
-};
-
-GCKernelLibrary::GCKernelLibrary()
- : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT), _frame_buffer(0), _tex_rt(0), _shader_path("./"), _programs_map(), _built_programs_map()
-{
-}
-
-GCKernelLibrary &GCKernelLibrary::get()
-{
- static GCKernelLibrary _kernel_library;
- return _kernel_library;
-}
-
-void GCKernelLibrary::init(std::string shader_path, EGLDisplay dpy, EGLContext ctx)
-{
- //TODO: deal with old display and context.
- _shader_path = std::move(shader_path);
-
- _display = dpy;
- _context = ctx;
-
- eglMakeCurrent(_display, EGL_NO_SURFACE, EGL_NO_SURFACE, _context);
- setup_dummy_fbo();
-}
-
-void GCKernelLibrary::set_shader_path(const std::string &shader_path)
-{
- _shader_path = shader_path;
-}
-
-void GCKernelLibrary::set_context(EGLDisplay dpy, EGLContext ctx)
-{
- //TODO: deal with old display and context.
- _display = dpy;
- _context = ctx;
-
- eglMakeCurrent(dpy, EGL_NO_SURFACE, EGL_NO_SURFACE, ctx);
- setup_dummy_fbo();
-}
-
-GCKernel GCKernelLibrary::create_kernel(const std::string &shader_name, const StringSet &build_options_set) const
-{
- // Find which program contains the kernel
- auto shader_program_it = _shader_program_map.find(shader_name);
-
- if(_shader_program_map.end() == shader_program_it)
- {
- ARM_COMPUTE_ERROR_VAR("Shader %s not found in the GCKernelLibrary", shader_name.c_str());
- }
-
- // Check if the program has been built before with same build options.
- const std::string program_name = shader_program_it->second;
- const std::string build_options = stringify_set(build_options_set);
- const std::string built_program_name = program_name + "_" + build_options;
- auto built_program_it = _built_programs_map.find(built_program_name);
-
- GCKernel kernel;
-
- if(_built_programs_map.end() != built_program_it)
- {
- // If program has been built, retrieve to create kernel from it
- kernel = built_program_it->second;
- }
- else
- {
- GCProgram program = load_program(program_name);
-
- std::string source_name = _shader_path + shader_program_it->second;
-
- // load shader
- GLuint shader = program.compile_shader(build_options);
-
- // Build program
- GLuint gles_program = program.link_program(shader);
-
- // Create GCKernel
- kernel = GCKernel(shader_name, gles_program);
-
- // Add built program to internal map
- _built_programs_map.emplace(built_program_name, kernel);
- }
-
- kernel.use();
- kernel.clear_arguments();
- // set shader params binding point
- kernel.set_shader_params_binding_point(0);
-
- return kernel;
-}
-
-std::string GCKernelLibrary::preprocess_shader(const std::string &shader_source) const
-{
- enum class ParserStage
- {
- FIRST,
- SKIP_COMMENTS = FIRST,
- RESOLVE_INCLUDES,
- LAST
- };
-
- // Define a GLES compute shader parser function
- std::function<std::string(const std::string &, ParserStage, int)> cs_parser;
- cs_parser = [&](const std::string & src, ParserStage stage, int) -> std::string
- {
- std::string dst;
-
- if(stage == ParserStage::LAST || std::regex_match(src, std::regex(R"(\s*)")))
- {
- return src;
- }
- auto next_stage = static_cast<ParserStage>(static_cast<int>(stage) + 1);
-
- std::string search_pattern;
- switch(stage)
- {
- case ParserStage::SKIP_COMMENTS:
- search_pattern = R"((/\*([^*]|\n|(\*+([^*/]|\n)))*\*+/)|(//.*))";
- break;
- case ParserStage::RESOLVE_INCLUDES:
- search_pattern = R"rgx((?:^|\n)[ \t]*#include "(.*)")rgx";
- break;
- default:
- break;
- }
-
- std::regex search_regex(search_pattern);
- std::smatch match;
- ptrdiff_t parsed_pos = 0;
- if(std::regex_search(src, match, search_regex))
- {
- // Pass the content before the match to the next stage
- dst.append(cs_parser(src.substr(0, match.position()), next_stage, 0));
- parsed_pos = match.position() + match.length();
-
- // Deal with the matched content
- switch(stage)
- {
- case ParserStage::RESOLVE_INCLUDES:
- {
- // Replace with the included file contents
- // And parse the content from the first stage
- const std::string source_name = _shader_path + match.str(1);
- dst.append(cs_parser(read_file(source_name, false), ParserStage::FIRST, 0));
- break;
- }
- case ParserStage::SKIP_COMMENTS:
- default:
- dst.append(match.str());
- break;
- }
- next_stage = stage;
- }
- dst.append(cs_parser(src.substr(parsed_pos, src.length() - parsed_pos), next_stage, 0));
-
- return dst;
- };
-
- return cs_parser(shader_source, ParserStage::FIRST, 0);
-}
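
Note: preprocess_shader above strips comments and then splices #include "..." files in recursively via staged regex passes. A minimal single-stage sketch of just the include resolution, with load() as a hypothetical reader (the real code used read_file on _shader_path):

    #include <regex>
    #include <string>

    // Hypothetical file reader standing in for read_file(_shader_path + name, false).
    std::string load(const std::string &path)
    {
        (void)path;
        return "";
    }

    std::string resolve_includes(std::string src)
    {
        static const std::regex inc(R"rgx((?:^|\n)[ \t]*#include "(.*)")rgx");
        std::smatch m;
        std::string out;
        while(std::regex_search(src, m, inc))
        {
            out += m.prefix().str();                 // keep the text before the directive
            out += resolve_includes(load(m.str(1))); // splice in the included file, recursively
            src = m.suffix().str();                  // continue scanning after the match
        }
        return out + src;
    }
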
-
-const GCProgram &GCKernelLibrary::load_program(const std::string &program_name) const
-{
- const auto program_it = _programs_map.find(program_name);
-
- if(program_it != _programs_map.end())
- {
- return program_it->second;
- }
-
- GCProgram program;
-
-#ifdef EMBEDDED_KERNELS
- const auto program_source_it = _program_source_map.find(program_name);
-
- if(_program_source_map.end() == program_source_it)
- {
- ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str());
- }
-
- program = GCProgram(program_name, program_source_it->second);
-#else /* EMBEDDED_KERNELS */
- // Check for binary
- std::string source_name = _shader_path + program_name;
- if(std::ifstream(source_name).is_open())
- {
- program = GCProgram(program_name, preprocess_shader(read_file(source_name, false)));
- }
- else
- {
- ARM_COMPUTE_ERROR_VAR("Shader file %s does not exist.", source_name.c_str());
- }
-#endif /* EMBEDDED_KERNELS */
-
- // Insert program to program map
- const auto new_program = _programs_map.emplace(program_name, std::move(program));
-
- return new_program.first->second;
-}
-
-void GCKernelLibrary::setup_dummy_fbo()
-{
- ARM_COMPUTE_GL_CHECK(glGenFramebuffers(1, &_frame_buffer));
- ARM_COMPUTE_GL_CHECK(glBindFramebuffer(GL_FRAMEBUFFER, _frame_buffer));
- ARM_COMPUTE_GL_CHECK(glGenTextures(1, &_tex_rt));
- ARM_COMPUTE_GL_CHECK(glBindTexture(GL_TEXTURE_2D, _tex_rt));
- ARM_COMPUTE_GL_CHECK(glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, 1, 1, 0, GL_RGB, GL_UNSIGNED_BYTE, nullptr));
- ARM_COMPUTE_GL_CHECK(glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, _tex_rt, 0));
-}
-
-GCKernelLibrary::~GCKernelLibrary()
-{
- for(auto &program : _built_programs_map)
- {
- static_cast<GCKernel>(program.second).cleanup();
- }
-
- ARM_COMPUTE_GL_CHECK(glBindTexture(GL_TEXTURE_2D, 0));
- ARM_COMPUTE_GL_CHECK(glBindFramebuffer(GL_FRAMEBUFFER, 0));
- ARM_COMPUTE_GL_CHECK(glDeleteTextures(1, &_tex_rt));
- ARM_COMPUTE_GL_CHECK(glDeleteFramebuffers(1, &_frame_buffer));
-}
-
-std::string GCKernelLibrary::stringify_set(const StringSet &s) const
-{
- std::string concat_set;
-
- // Concatenate set
- for(const auto &el : s)
- {
- concat_set += el + "\n";
- }
-
- return concat_set;
-}
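
Note: create_kernel above caches one linked program per (source file, build options) pair; the cache key is the program name joined with the newline-concatenated option set from stringify_set. A small runnable sketch of that keying, where an int stands in for a compiled GCKernel and the build option shown is hypothetical:

    #include <iostream>
    #include <map>
    #include <set>
    #include <string>

    using StringSet = std::set<std::string>;

    // Mirrors GCKernelLibrary::stringify_set above: options are newline-joined.
    std::string stringify_set(const StringSet &s)
    {
        std::string concat;
        for(const auto &el : s)
        {
            concat += el + "\n";
        }
        return concat;
    }

    int main()
    {
        std::map<std::string, int> built_programs;        // int stands in for a compiled GCKernel
        const StringSet opts{ "#define DATA_TYPE_FP16" }; // hypothetical build option
        // One cache entry per (program, build options) pair, as in create_kernel above.
        const std::string key = std::string("gemm.cs") + "_" + stringify_set(opts);
        built_programs.emplace(key, 1);
        std::cout << (built_programs.count(key) != 0 ? "cache hit\n" : "cache miss\n");
    }
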
diff --git a/src/core/GLES_COMPUTE/IGCKernel.cpp b/src/core/GLES_COMPUTE/IGCKernel.cpp
deleted file mode 100644
index eb96f9150c..0000000000
--- a/src/core/GLES_COMPUTE/IGCKernel.cpp
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-#include <cstddef>
-#include <sstream>
-
-using namespace arm_compute;
-
-void arm_compute::enqueue(IGCKernel &kernel, const Window &window, const gles::NDRange &lws)
-{
- ARM_COMPUTE_UNUSED(kernel);
-
- if(kernel.kernel().get_program() == 0)
- {
- return;
- }
-
- ARM_COMPUTE_ERROR_ON((0 == (window.x().end() - window.x().start())) || (0 == (window.y().end() - window.y().start())));
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR((((window.x().end() - window.x().start()) % (window.x().step() * lws[0])) != 0),
- "window x end =%d, start=%d, step=%d, lws x=%zu", window.x().end(), window.x().start(), window.x().step(), lws[0]);
- ARM_COMPUTE_ERROR_ON_MSG_VAR((((window.y().end() - window.y().start()) % (window.y().step() * lws[1])) != 0),
- "window y end =%d, start=%d, step=%d, lws y=%zu", window.y().end(), window.y().start(), window.y().step(), lws[1]);
- ARM_COMPUTE_ERROR_ON_MSG_VAR((((window.z().end() - window.z().start()) % (window.z().step() * lws[2])) != 0),
- "window z end =%d, start=%d, step=%d, lws z=%zu", window.z().end(), window.z().start(), window.z().step(), lws[2]);
-
- ARM_COMPUTE_GL_CHECK(glDispatchCompute(((window.x().end() - window.x().start()) / window.x().step()) / lws[0],
- ((window.y().end() - window.y().start()) / window.y().step()) / lws[1],
- ((window.z().end() - window.z().start()) / window.z().step()) / lws[2]));
-}
-
-IGCKernel::IGCKernel()
- : _kernel(), _lws_hint(gles::NDRange(1U, 1U, 1U)), _target(GPUTarget::MIDGARD)
-{
-}
-
-GCKernel &IGCKernel::kernel()
-{
- return _kernel;
-}
-
-template <unsigned int dimension_size>
-unsigned int IGCKernel::num_arguments_per_tensor() const
-{
- // Rounding up the tensor attributes structure in compute shader to a multiple of a vec4
- return ceil_to_multiple(1 + 2 * dimension_size, 4);
-}
-
-template <unsigned int dimension_size>
-void IGCKernel::add_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
-{
- ARM_COMPUTE_ERROR_ON(tensor == nullptr);
-
- const ITensorInfo *info = tensor->info();
- const Strides &strides = info->strides_in_bytes();
-
- // Calculate offset to the start of the window
- unsigned int offset_first_element = info->offset_first_element_in_bytes();
-
- for(unsigned int n = 0; n < info->num_dimensions(); ++n)
- {
- offset_first_element += window[n].start() * strides[n];
- }
-
- unsigned int idx_start = idx;
-
- for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
- {
- _kernel.set_argument(idx++, strides[dimension]);
- _kernel.set_argument(idx++, strides[dimension] * window[dimension].step());
- }
-
- _kernel.set_argument(idx++, offset_first_element);
-
- // Rounding up the tensor attributes structure in compute shader to a multiple of a vec4
- unsigned int idx_end = ceil_to_multiple(idx, 4);
- for(unsigned int i = idx; i < idx_end; ++i)
- {
- _kernel.set_argument(i, 0);
- }
- idx = idx_end;
-
- ARM_COMPUTE_GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, binding_point, tensor->gc_buffer()));
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_tensor<dimension_size>() != idx,
- "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>());
- ARM_COMPUTE_UNUSED(idx_start);
-}
-
-void IGCKernel::add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
-{
- add_tensor_argument<1>(idx, tensor, binding_point, window);
-}
-
-void IGCKernel::add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
-{
- add_tensor_argument<2>(idx, tensor, binding_point, window);
-}
-
-void IGCKernel::add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
-{
- add_tensor_argument<3>(idx, tensor, binding_point, window);
-}
-
-unsigned int IGCKernel::num_arguments_per_1D_tensor() const
-{
- return num_arguments_per_tensor<1>();
-}
-
-unsigned int IGCKernel::num_arguments_per_2D_tensor() const
-{
- return num_arguments_per_tensor<2>();
-}
-
-unsigned int IGCKernel::num_arguments_per_3D_tensor() const
-{
- return num_arguments_per_tensor<3>();
-}
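
Note: num_arguments_per_tensor above packs, per dimension, a stride and a step-scaled stride plus one element offset, then pads the attribute struct to a vec4 boundary for the compute-shader UBO layout. The arithmetic, restated as a runnable sketch (ceil_to_multiple reimplements the Compute Library helper of the same name):

    #include <iostream>

    unsigned int ceil_to_multiple(unsigned int value, unsigned int divisor)
    {
        return ((value + divisor - 1) / divisor) * divisor;
    }

    unsigned int num_arguments_per_tensor(unsigned int dims)
    {
        // 1 offset + (stride, step-scaled stride) per dimension, padded to vec4
        return ceil_to_multiple(1 + 2 * dims, 4);
    }

    int main()
    {
        for(unsigned int d = 1; d <= 3; ++d)
        {
            std::cout << d << "D tensor -> " << num_arguments_per_tensor(d) << " uints\n";
        }
        // Prints: 1D -> 4, 2D -> 8, 3D -> 8
    }
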
diff --git a/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp b/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp
deleted file mode 100644
index 4c38412814..0000000000
--- a/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-using namespace arm_compute;
-
-void IGCSimple2DKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
-
- _kernel.use();
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, 1, slice);
- add_2D_tensor_argument(idx, _output, 2, slice);
- _kernel.update_shader_params();
- enqueue(*this, slice);
- }
- while(window.slide_window_slice_2D(slice));
-}
diff --git a/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp b/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp
deleted file mode 100644
index df852858e2..0000000000
--- a/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-using namespace arm_compute;
-
-void IGCSimple3DKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window slice = window.first_slice_window_3D();
-
- _kernel.use();
-
- do
- {
- unsigned int idx = 0;
- unsigned int binding = 1; // SSBO binding starts from 1.
- add_3D_tensor_argument(idx, _input, binding++, slice);
- add_3D_tensor_argument(idx, _output, binding++, slice);
- _kernel.update_shader_params();
- enqueue(*this, slice);
- }
- while(window.slide_window_slice_3D(slice));
-}
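
Note: both simple-kernel run() methods above follow the same loop: take the first 2D/3D slice of the execution window, bind the tensor arguments, upload the shader params, dispatch, then slide to the next slice until the window is exhausted. A sketch of just that control flow, with Window as a hypothetical stand-in:

    #include <iostream>

    struct Window
    {
        int slice  = 0;
        int slices = 3;                           // e.g. the Z extent collapsed into slices
        bool slide() { return ++slice < slices; } // stands in for slide_window_slice_3D
    };

    int main()
    {
        Window window;
        do
        {
            // In the real kernels: add_3D_tensor_argument(...), update_shader_params(),
            // then enqueue(*this, slice) -> one glDispatchCompute per slice.
            std::cout << "dispatch slice " << window.slice << "\n";
        }
        while(window.slide());
    }
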
diff --git a/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp b/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp
deleted file mode 100644
index fb31ac8377..0000000000
--- a/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-using namespace arm_compute;
-
-IGCSimpleKernel::IGCSimpleKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void IGCSimpleKernel::configure(const IGCTensor *input, IGCTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size)
-{
- _input = input;
- _output = output;
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
- output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size);
-
- IGCKernel::configure(win);
-}
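
Note: configure() above derives the execution window from the tensor shape and the per-iteration element count, rounding each dimension up to whole processing steps. A hedged sketch of that rounding (assuming calculate_max_window's per-dimension behaviour):

    unsigned int window_end(unsigned int extent, unsigned int step)
    {
        // ceil(extent / step) * step: the window must cover the whole extent
        // in whole steps of num_elems_processed_per_iteration.
        return ((extent + step - 1) / step) * step;
    }
    // e.g. a width of 13 processed 4 elements per iteration gives a window of [0, 16).
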
diff --git a/src/core/GLES_COMPUTE/IGCTensor.cpp b/src/core/GLES_COMPUTE/IGCTensor.cpp
deleted file mode 100644
index 0f310b839f..0000000000
--- a/src/core/GLES_COMPUTE/IGCTensor.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-
-using namespace arm_compute;
-
-IGCTensor::IGCTensor()
- : _mapping(nullptr), _needs_shifting(false)
-{
-}
-
-void IGCTensor::map(bool blocking)
-{
- _mapping = do_map(blocking);
-}
-
-void IGCTensor::unmap()
-{
- do_unmap();
- _mapping = nullptr;
-}
-
-void IGCTensor::clear()
-{
- this->map();
- std::memset(static_cast<void *>(_mapping), 0, this->info()->total_size());
- this->unmap();
-}
-
-uint8_t *IGCTensor::buffer() const
-{
- return _mapping;
-}
-
-bool IGCTensor::needs_shifting() const
-{
- return _needs_shifting;
-}
-
-void IGCTensor::set_needs_shifting(bool needs_shifting)
-{
- _needs_shifting = needs_shifting;
-}
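
Note: IGCTensor::clear() above is a map/fill/unmap round-trip over the tensor's GLES buffer. The same shape against a hypothetical mappable buffer:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct MappableBuffer
    {
        std::vector<std::uint8_t> storage;               // stands in for a GLES buffer object
        std::uint8_t *map() { return storage.data(); }   // real code: glMapBufferRange
        void unmap() {}                                  // real code: glUnmapBuffer
        std::size_t total_size() const { return storage.size(); }
    };

    void clear(MappableBuffer &buf)
    {
        std::uint8_t *mapping = buf.map();          // expose the buffer to the CPU
        std::memset(mapping, 0, buf.total_size());  // zero every byte
        buf.unmap();                                // hand it back to the device
    }
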
diff --git a/src/core/GLES_COMPUTE/OpenGLES.cpp b/src/core/GLES_COMPUTE/OpenGLES.cpp
deleted file mode 100644
index f56bcfaeb6..0000000000
--- a/src/core/GLES_COMPUTE/OpenGLES.cpp
+++ /dev/null
@@ -1,826 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-
-#include <dlfcn.h>
-#include <iostream>
-#include <vector>
-
-using eglGetProcAddress_func = __eglMustCastToProperFunctionPointerType EGLAPIENTRY (*)(const char *procname);
-using eglBindAPI_func = EGLBoolean EGLAPIENTRY (*)(EGLenum api);
-using eglChooseConfig_func = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, const EGLint *attrib_list, EGLConfig *configs, EGLint config_size, EGLint *num_config);
-using eglCreateContext_func = EGLContext EGLAPIENTRY (*)(EGLDisplay dpy, EGLConfig config, EGLContext share_context, const EGLint *attrib_list);
-using eglDestroyContext_func = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, EGLContext ctx);
-using eglGetDisplay_func = EGLDisplay EGLAPIENTRY (*)(EGLNativeDisplayType display_id);
-using eglInitialize_func = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, EGLint *major, EGLint *minor);
-using eglMakeCurrent_func = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, EGLSurface draw, EGLSurface read, EGLContext ctx);
-using eglTerminate_func = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy);
-using eglGetError_func = EGLint EGLAPIENTRY (*)();
-using eglQueryString_func = char const * EGLAPIENTRY (*)(EGLDisplay dpy, EGLint name);
-using glAttachShader_func = void GL_APIENTRY (*)(GLuint program, GLuint shader);
-using glCompileShader_func = void GL_APIENTRY (*)(GLuint shader);
-using glCreateProgram_func = GLuint GL_APIENTRY (*)();
-using glCreateShader_func = GLuint GL_APIENTRY (*)(GLenum type);
-using glDeleteProgram_func = void GL_APIENTRY (*)(GLuint program);
-using glDeleteShader_func = void GL_APIENTRY (*)(GLuint shader);
-using glDetachShader_func = void GL_APIENTRY (*)(GLuint program, GLuint shader);
-using glGetProgramInfoLog_func = void GL_APIENTRY (*)(GLuint program, GLsizei bufsize, GLsizei *length, GLchar *infolog);
-using glGetProgramiv_func = void GL_APIENTRY (*)(GLuint program, GLenum pname, GLint *params);
-using glGetShaderInfoLog_func = void GL_APIENTRY (*)(GLuint shader, GLsizei bufsize, GLsizei *length, GLchar *infolog);
-using glGetShaderiv_func = void GL_APIENTRY (*)(GLuint shader, GLenum pname, GLint *params);
-using glLinkProgram_func = void GL_APIENTRY (*)(GLuint program);
-using glShaderSource_func = void GL_APIENTRY (*)(GLuint shader, GLsizei count, const GLchar *const *string, const GLint *length);
-using glUseProgram_func = void GL_APIENTRY (*)(GLuint program);
-using glBindBuffer_func = void GL_APIENTRY (*)(GLenum target, GLuint buffer);
-using glBindBufferBase_func = void GL_APIENTRY (*)(GLenum target, GLuint index, GLuint buffer);
-using glBufferData_func = void GL_APIENTRY (*)(GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage);
-using glDeleteBuffers_func = void GL_APIENTRY (*)(GLsizei n, const GLuint *buffers);
-using glDispatchCompute_func = void GL_APIENTRY (*)(GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z);
-using glFlush_func = void GL_APIENTRY (*)();
-using glGenBuffers_func = void GL_APIENTRY (*)(GLsizei n, GLuint *buffers);
-using glGetProgramResourceIndex_func = GLuint GL_APIENTRY (*)(GLuint program, GLenum programInterface, const GLchar *name);
-using glGetUniformLocation_func = GLint GL_APIENTRY (*)(GLuint program, const GLchar *name);
-using glMapBufferRange_func = void *GL_APIENTRY (*)(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access);
-using glMemoryBarrier_func = void GL_APIENTRY (*)(GLbitfield barriers);
-using glUniform1ui_func = void GL_APIENTRY (*)(GLint location, GLuint v0);
-using glUnmapBuffer_func = GLboolean GL_APIENTRY (*)(GLenum target);
-using glGetError_func = GLenum GL_APIENTRY (*)();
-using glGetString_func = const GLubyte * GL_APIENTRY (*)(GLenum name);
-using glGetActiveUniformBlockiv_func = void GL_APIENTRY (*)(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params);
-using glUniformBlockBinding_func = void GL_APIENTRY (*)(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding);
-using glGetUniformBlockIndex_func = GLuint GL_APIENTRY (*)(GLuint program, const GLchar *uniformBlockName);
-using glGenTextures_func = void GL_APIENTRY (*)(GLsizei n, GLuint *textures);
-using glDeleteTextures_func = void GL_APIENTRY (*)(GLsizei n, const GLuint *textures);
-using glBindTexture_func = void GL_APIENTRY (*)(GLenum target, GLuint texture);
-using glTexImage2D_func = void GL_APIENTRY (*)(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type,
- const GLvoid *pixels);
-using glGenFramebuffers_func = void GL_APIENTRY (*)(GLsizei n, GLuint *framebuffers);
-using glDeleteFramebuffers_func = void GL_APIENTRY (*)(GLsizei n, const GLuint *framebuffers);
-using glBindFramebuffer_func = void GL_APIENTRY (*)(GLenum target, GLuint framebuffer);
-using glFramebufferTexture2D_func = void GL_APIENTRY (*)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
-
-class GLESSymbols
-{
-private:
- void init()
- {
- void *egl_handle = dlopen("libEGL.so", RTLD_LAZY | RTLD_LOCAL);
- void *glesv2_handle = dlopen("libGLESv2.so", RTLD_LAZY | RTLD_LOCAL);
- void *glesv3_handle = dlopen("libGLESv3.so", RTLD_LAZY | RTLD_LOCAL);
- if(egl_handle == nullptr)
- {
- std::cerr << "Can't load libEGL.so: " << dlerror() << std::endl;
- }
- else
- {
-#undef EGL_ENTRY
-#define EGL_ENTRY(_api) _api = reinterpret_cast<_api##_func>(dlsym(egl_handle, #_api));
-#include "./egl_entries.in"
-#undef EGL_ENTRY
-
- if(eglGetProcAddress != nullptr)
- {
-#undef EGL_ENTRY
-#define EGL_ENTRY(_api) \
- if((_api) == nullptr) \
- (_api) = reinterpret_cast<_api##_func>(eglGetProcAddress(#_api));
-#include "./egl_entries.in"
-#undef EGL_ENTRY
-
-#undef GL_ENTRY
-#define GL_ENTRY(_api) _api = reinterpret_cast<_api##_func>(eglGetProcAddress(#_api));
-#include "./gl_entries.in"
-#undef GL_ENTRY
- }
-
- std::vector<void *> handles = { glesv3_handle, glesv2_handle };
- for(auto &handle : handles)
- {
- if(handle != nullptr)
- {
-#undef GL_ENTRY
-#define GL_ENTRY(_api) \
- if((_api) == nullptr) \
- (_api) = reinterpret_cast<_api##_func>(dlsym(handle, #_api));
-#include "./gl_entries.in"
-#undef GL_ENTRY
- }
- }
-
- //Don't call dlclose(handle) or all the symbols will be unloaded !
- }
- }
- bool _initialized = false;
-
-public:
- static GLESSymbols &get()
- {
- static GLESSymbols symbols = GLESSymbols();
- if(!symbols._initialized)
- {
- symbols._initialized = true;
- symbols.init();
- }
-
- return symbols;
- }
-
-#undef EGL_ENTRY
-#undef GL_ENTRY
-#define EGL_ENTRY(_api) _api##_func _api = nullptr;
-#define GL_ENTRY(_api) EGL_ENTRY(_api)
-#include "./egl_entries.in"
-#include "./gl_entries.in"
-#undef EGL_ENTRY
-#undef GL_ENTRY
-};
-
-bool arm_compute::opengles31_is_available()
-{
- return GLESSymbols::get().glDispatchCompute != nullptr;
-}
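
Note: every EGL/GL wrapper below shares one shape: the symbol is resolved once via dlopen/dlsym in GLESSymbols::init above, then each call forwards if the pointer is non-null and returns a harmless default otherwise. A self-contained sketch of that dispatch for a single entry point:

    #include <dlfcn.h>
    #include <cstdio>

    using glFlush_fn = void (*)();

    int main()
    {
        void *handle = dlopen("libGLESv2.so", RTLD_LAZY | RTLD_LOCAL);
        auto fn = handle ? reinterpret_cast<glFlush_fn>(dlsym(handle, "glFlush")) : nullptr;
        if(fn != nullptr)
        {
            fn();                                    // forward to the real driver entry point
        }
        else
        {
            std::puts("GLES runtime not available"); // safe no-op fallback, as in the wrappers
        }
        // As the comment above warns, the handle is deliberately never dlclose()d
        // while resolved symbols are still in use.
    }
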
-
-__eglMustCastToProperFunctionPointerType EGLAPIENTRY eglGetProcAddress(const char *procname)
-{
- auto func = GLESSymbols::get().eglGetProcAddress;
- if(func != nullptr)
- {
- return func(procname);
- }
- else
- {
- return nullptr;
- }
-}
-
-EGLBoolean EGLAPIENTRY eglBindAPI(EGLenum api)
-{
- auto func = GLESSymbols::get().eglBindAPI;
- if(func != nullptr)
- {
- return func(api);
- }
- else
- {
- return EGL_FALSE;
- }
-}
-
-EGLBoolean EGLAPIENTRY eglChooseConfig(EGLDisplay dpy, const EGLint *attrib_list, EGLConfig *configs, EGLint config_size, EGLint *num_config)
-{
- auto func = GLESSymbols::get().eglChooseConfig;
- if(func != nullptr)
- {
- return func(dpy, attrib_list, configs, config_size, num_config);
- }
- else
- {
- return EGL_FALSE;
- }
-}
-
-EGLContext EGLAPIENTRY eglCreateContext(EGLDisplay dpy, EGLConfig config, EGLContext share_context, const EGLint *attrib_list)
-{
- auto func = GLESSymbols::get().eglCreateContext;
- if(func != nullptr)
- {
- return func(dpy, config, share_context, attrib_list);
- }
- else
- {
- return nullptr;
- }
-}
-
-EGLBoolean EGLAPIENTRY eglDestroyContext(EGLDisplay dpy, EGLContext ctx)
-{
- auto func = GLESSymbols::get().eglDestroyContext;
- if(func != nullptr)
- {
- return func(dpy, ctx);
- }
- else
- {
- return EGL_FALSE;
- }
-}
-
-EGLDisplay EGLAPIENTRY eglGetDisplay(EGLNativeDisplayType display_id)
-{
- auto func = GLESSymbols::get().eglGetDisplay;
- if(func != nullptr)
- {
- return func(display_id);
- }
- else
- {
- return nullptr;
- }
-}
-
-EGLBoolean EGLAPIENTRY eglInitialize(EGLDisplay dpy, EGLint *major, EGLint *minor)
-{
- auto func = GLESSymbols::get().eglInitialize;
- if(func != nullptr)
- {
- return func(dpy, major, minor);
- }
- else
- {
- return EGL_FALSE;
- }
-}
-
-EGLBoolean EGLAPIENTRY eglMakeCurrent(EGLDisplay dpy, EGLSurface draw, EGLSurface read, EGLContext ctx)
-{
- auto func = GLESSymbols::get().eglMakeCurrent;
- if(func != nullptr)
- {
- return func(dpy, draw, read, ctx);
- }
- else
- {
- return EGL_FALSE;
- }
-}
-
-EGLBoolean EGLAPIENTRY eglTerminate(EGLDisplay dpy)
-{
- auto func = GLESSymbols::get().eglTerminate;
- if(func != nullptr)
- {
- return func(dpy);
- }
- else
- {
- return EGL_FALSE;
- }
-}
-
-EGLint EGLAPIENTRY eglGetError()
-{
- auto func = GLESSymbols::get().eglGetError;
- if(func != nullptr)
- {
- return func();
- }
- else
- {
- return GL_NO_ERROR;
- }
-}
-
-char const *EGLAPIENTRY eglQueryString(EGLDisplay dpy, EGLint name)
-{
- auto func = GLESSymbols::get().eglQueryString;
- if(func != nullptr)
- {
- return func(dpy, name);
- }
- else
- {
- return nullptr;
- }
-}
-
-void GL_APIENTRY glAttachShader(GLuint program, GLuint shader)
-{
- auto func = GLESSymbols::get().glAttachShader;
- if(func != nullptr)
- {
- return func(program, shader);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glCompileShader(GLuint shader)
-{
- auto func = GLESSymbols::get().glCompileShader;
- if(func != nullptr)
- {
- return func(shader);
- }
- else
- {
- return;
- }
-}
-
-GLuint GL_APIENTRY glCreateProgram()
-{
- auto func = GLESSymbols::get().glCreateProgram;
- if(func != nullptr)
- {
- return func();
- }
- else
- {
- return 0;
- }
-}
-
-GLuint GL_APIENTRY glCreateShader(GLenum type)
-{
- auto func = GLESSymbols::get().glCreateShader;
- if(func != nullptr)
- {
- return func(type);
- }
- else
- {
- return 0;
- }
-}
-
-void GL_APIENTRY glDeleteProgram(GLuint program)
-{
- auto func = GLESSymbols::get().glDeleteProgram;
- if(func != nullptr)
- {
- return func(program);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glDeleteShader(GLuint shader)
-{
- auto func = GLESSymbols::get().glDeleteShader;
- if(func != nullptr)
- {
- return func(shader);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glDetachShader(GLuint program, GLuint shader)
-{
- auto func = GLESSymbols::get().glDetachShader;
- if(func != nullptr)
- {
- return func(program, shader);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glGetProgramInfoLog(GLuint program, GLsizei bufSize, GLsizei *length, GLchar *infoLog)
-{
- auto func = GLESSymbols::get().glGetProgramInfoLog;
- if(func != nullptr)
- {
- return func(program, bufSize, length, infoLog);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glGetProgramiv(GLuint program, GLenum pname, GLint *params)
-{
- auto func = GLESSymbols::get().glGetProgramiv;
- if(func != nullptr)
- {
- return func(program, pname, params);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glGetShaderInfoLog(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *infoLog)
-{
- auto func = GLESSymbols::get().glGetShaderInfoLog;
- if(func != nullptr)
- {
- return func(shader, bufSize, length, infoLog);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glGetShaderiv(GLuint shader, GLenum pname, GLint *params)
-{
- auto func = GLESSymbols::get().glGetShaderiv;
- if(func != nullptr)
- {
- return func(shader, pname, params);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glLinkProgram(GLuint program)
-{
- auto func = GLESSymbols::get().glLinkProgram;
- if(func != nullptr)
- {
- return func(program);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glShaderSource(GLuint shader, GLsizei count, const GLchar *const *string, const GLint *length)
-{
- auto func = GLESSymbols::get().glShaderSource;
- if(func != nullptr)
- {
- return func(shader, count, string, length);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glUseProgram(GLuint program)
-{
- auto func = GLESSymbols::get().glUseProgram;
- if(func != nullptr)
- {
- return func(program);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glBindBuffer(GLenum target, GLuint buffer)
-{
- auto func = GLESSymbols::get().glBindBuffer;
- if(func != nullptr)
- {
- return func(target, buffer);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glBindBufferBase(GLenum target, GLuint index, GLuint buffer)
-{
- auto func = GLESSymbols::get().glBindBufferBase;
- if(func != nullptr)
- {
- return func(target, index, buffer);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glBufferData(GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage)
-{
- auto func = GLESSymbols::get().glBufferData;
- if(func != nullptr)
- {
- return func(target, size, data, usage);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glDeleteBuffers(GLsizei n, const GLuint *buffers)
-{
- auto func = GLESSymbols::get().glDeleteBuffers;
- if(func != nullptr)
- {
- return func(n, buffers);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glDispatchCompute(GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z)
-{
- auto func = GLESSymbols::get().glDispatchCompute;
- if(func != nullptr)
- {
- return func(num_groups_x, num_groups_y, num_groups_z);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glFlush(void)
-{
- auto func = GLESSymbols::get().glFlush;
- if(func != nullptr)
- {
- return func();
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glGenBuffers(GLsizei n, GLuint *buffers)
-{
- auto func = GLESSymbols::get().glGenBuffers;
- if(func != nullptr)
- {
- return func(n, buffers);
- }
- else
- {
- return;
- }
-}
-
-GLuint GL_APIENTRY glGetProgramResourceIndex(GLuint program, GLenum programInterface, const GLchar *name)
-{
- auto func = GLESSymbols::get().glGetProgramResourceIndex;
- if(func != nullptr)
- {
- return func(program, programInterface, name);
- }
- else
- {
- return GL_INVALID_INDEX;
- }
-}
-
-GLint GL_APIENTRY glGetUniformLocation(GLuint program, const GLchar *name)
-{
- auto func = GLESSymbols::get().glGetUniformLocation;
- if(func != nullptr)
- {
- return func(program, name);
- }
- else
- {
- return -1;
- }
-}
-
-void *GL_APIENTRY glMapBufferRange(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access)
-{
- auto func = GLESSymbols::get().glMapBufferRange;
- if(func != nullptr)
- {
- return func(target, offset, length, access);
- }
- else
- {
- return nullptr;
- }
-}
-
-void GL_APIENTRY glMemoryBarrier(GLbitfield barriers)
-{
- auto func = GLESSymbols::get().glMemoryBarrier;
- if(func != nullptr)
- {
- return func(barriers);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glUniform1ui(GLint location, GLuint v0)
-{
- auto func = GLESSymbols::get().glUniform1ui;
- if(func != nullptr)
- {
- return func(location, v0);
- }
- else
- {
- return;
- }
-}
-
-GLboolean GL_APIENTRY glUnmapBuffer(GLenum target)
-{
- auto func = GLESSymbols::get().glUnmapBuffer;
- if(func != nullptr)
- {
- return func(target);
- }
- else
- {
- return GL_FALSE;
- }
-}
-
-GLenum GL_APIENTRY glGetError(void)
-{
- auto func = GLESSymbols::get().glGetError;
- if(func != nullptr)
- {
- return func();
- }
- else
- {
- return GL_NO_ERROR;
- }
-}
-
-const GLubyte *GL_APIENTRY glGetString(GLenum name)
-{
- auto func = GLESSymbols::get().glGetString;
- if(func != nullptr)
- {
- return func(name);
- }
- else
- {
- return nullptr;
- }
-}
-
-void GL_APIENTRY glGetActiveUniformBlockiv(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params)
-{
- auto func = GLESSymbols::get().glGetActiveUniformBlockiv;
- if(func != nullptr)
- {
- return func(program, uniformBlockIndex, pname, params);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glUniformBlockBinding(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding)
-{
- auto func = GLESSymbols::get().glUniformBlockBinding;
- if(func != nullptr)
- {
- return func(program, uniformBlockIndex, uniformBlockBinding);
- }
- else
- {
- return;
- }
-}
-
-GLuint GL_APIENTRY glGetUniformBlockIndex(GLuint program, const GLchar *uniformBlockName)
-{
- auto func = GLESSymbols::get().glGetUniformBlockIndex;
- if(func != nullptr)
- {
- return func(program, uniformBlockName);
- }
- else
- {
- return GL_INVALID_INDEX;
- }
-}
-
-void GL_APIENTRY glGenTextures(GLsizei n, GLuint *textures)
-{
- auto func = GLESSymbols::get().glGenTextures;
- if(func != nullptr)
- {
- return func(n, textures);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glDeleteTextures(GLsizei n, const GLuint *textures)
-{
- auto func = GLESSymbols::get().glDeleteTextures;
- if(func != nullptr)
- {
- return func(n, textures);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glBindTexture(GLenum target, GLuint texture)
-{
- auto func = GLESSymbols::get().glBindTexture;
- if(func != nullptr)
- {
- return func(target, texture);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glTexImage2D(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const GLvoid *pixels)
-{
- auto func = GLESSymbols::get().glTexImage2D;
- if(func != nullptr)
- {
- return func(target, level, internalformat, width, height, border, format, type, pixels);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glGenFramebuffers(GLsizei n, GLuint *framebuffers)
-{
- auto func = GLESSymbols::get().glGenFramebuffers;
- if(func != nullptr)
- {
- return func(n, framebuffers);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glDeleteFramebuffers(GLsizei n, const GLuint *framebuffers)
-{
- auto func = GLESSymbols::get().glDeleteFramebuffers;
- if(func != nullptr)
- {
- return func(n, framebuffers);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glBindFramebuffer(GLenum target, GLuint framebuffer)
-{
- auto func = GLESSymbols::get().glBindFramebuffer;
- if(func != nullptr)
- {
- return func(target, framebuffer);
- }
- else
- {
- return;
- }
-}
-
-void GL_APIENTRY glFramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level)
-{
- auto func = GLESSymbols::get().glFramebufferTexture2D;
- if(func != nullptr)
- {
- return func(target, attachment, textarget, texture, level);
- }
- else
- {
- return;
- }
-}
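
All of the wrappers deleted above follow one lazy-dispatch pattern: each entry point looks up the real driver symbol through GLESSymbols::get() and, when the library could not be loaded, falls back to a harmless default (EGL_FALSE, 0, nullptr, GL_INVALID_INDEX, or a no-op). A minimal sketch of that pattern, assuming a POSIX dlopen/dlsym loader; GLESSymbols mirrors the name used above, while the library path and the single wrapped symbol are illustrative:

    #include <dlfcn.h>

    struct GLESSymbols
    {
        using eglGetError_t = int (*)();
        eglGetError_t eglGetError = nullptr;

        static GLESSymbols &get()
        {
            static GLESSymbols s = []
            {
                GLESSymbols tmp;
                // Resolve symbols once; the pointers stay null if the driver is absent.
                if(void *handle = dlopen("libEGL.so", RTLD_LAZY | RTLD_LOCAL))
                {
                    tmp.eglGetError = reinterpret_cast<eglGetError_t>(dlsym(handle, "eglGetError"));
                }
                return tmp;
            }();
            return s;
        }
    };

    int eglGetError()
    {
        auto func = GLESSymbols::get().eglGetError;
        return (func != nullptr) ? func() : 0x3000; // EGL_SUCCESS when no driver is present
    }
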
diff --git a/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs b/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs
deleted file mode 100644
index c5196a14dc..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-/** Calculate the absolute difference of two input images.
- *
- * @param[in] src1_ptr Pointer to the first source image. Supported data types: U8
- * @param[in] src1_attrs The attributes of the first source image
- * @param[in]  src2_ptr   Pointer to the second source image. Supported data types: Same as @p src1_ptr
- * @param[in]  src2_attrs The attributes of the second source image
- * @param[out] dst_ptr    Pointer to the destination image. Supported data types: Same as @p src1_ptr
- * @param[in] dst_attrs The attributes of the destination image
- */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes src1_attrs;
- ImageAttributes src2_attrs;
- ImageAttributes dst_attrs;
-};
-
-TENSOR_DECLARATION(1, src1Buffer, uint, src1_ptr, src1_shift, 2, readonly);
-TENSOR_DECLARATION(2, src2Buffer, uint, src2_ptr, src2_shift, 2, readonly);
-TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
-
-void main(void)
-{
- ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR(src1_attrs, src1_shift);
- ImageIterator src2_iter = CONVERT_TO_IMAGE_ITERATOR(src2_attrs, src2_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
-
- lowp uvec4 tmp1 = LOAD_UNPACK4_CURRENT_ITEM_U8(src1_ptr, src1_iter);
- lowp uvec4 tmp2 = LOAD_UNPACK4_CURRENT_ITEM_U8(src2_ptr, src2_iter);
- lowp uvec4 diff = uvec4(abs(ivec4(tmp1 - tmp2)));
-
- STORE_PACK4_CURRENT_ITEM_U8(dst_ptr, dst_iter, diff);
-}
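
The deleted shader keeps four U8 pixels packed into a single uint and relies on the LOAD_UNPACK4/STORE_PACK4 helpers to widen and narrow them. The same per-lane computation on the CPU, as a minimal C++ sketch (the function name is illustrative):

    #include <cstdint>
    #include <cstdlib>

    // Unpack four U8 lanes from each word, take per-lane absolute
    // differences, and repack them into one 32-bit word.
    uint32_t absdiff_u8x4(uint32_t a, uint32_t b)
    {
        uint32_t out = 0;
        for(int lane = 0; lane < 4; ++lane)
        {
            const int va = int((a >> (8 * lane)) & 0xFFu);
            const int vb = int((b >> (8 * lane)) & 0xFFu);
            out |= uint32_t(std::abs(va - vb)) << (8 * lane);
        }
        return out;
    }
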
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
deleted file mode 100644
index 983b31deba..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "activation_layer_helpers_cs.h"
-#include "helpers_cs.h"
-
-/** This performs an activation function on floating point inputs.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @note The activation function should be given as a preprocessor argument using "#define act_name". e.g. "#define TANH"
- * @note The A and B variables required by some activation functions are set using A_VAL= and B_VAL= respectively.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes dst_attrs;
-};
-
-#ifdef DATA_TYPE_FP32
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- float data = LOAD_CURRENT_ITEM(src_ptr, src_iter);
- float data_out = 0.f;
- // Perform activation
-#ifdef LOGISTIC
- data_out = logistic_op(data);
-#elif defined(TANH)     /*TANH*/
- data_out = tanh_op(data);
-#elif defined(RELU) /*RELU*/
- data_out = relu_op(data);
-#elif defined(BRELU) /*BRELU*/
- data_out = brelu_op(data);
-#elif defined(LU_BRELU) /*LU_BRELU*/
- data_out = lu_brelu_op(data);
-#elif defined(LRELU) /*LRELU*/
- data_out = lrelu_op(data);
-#elif defined(SRELU) /*SRELU*/
- data_out = srelu_op(data);
-#elif defined(ELU) /*ELU*/
- data_out = elu_op(data);
-#elif defined(ABS) /*ABS*/
- data_out = abs_op(data);
-#elif defined(SQUARE) /*SQUARE*/
- data_out = square_op(data);
-#elif defined(SQRT) /*SQRT*/
- data_out = sqrt_op(data);
-#elif defined(LINEAR) /*LINEAR*/
- data_out = linear_op(data);
-#elif defined(IDENTITY) /*IDENTITY*/
- data_out = identity_op(data);
-#else /*LOGISTIC*/
-#error Activation function not provided
-#endif /*LOGISTIC*/
-
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, data_out);
-}
-
-#elif defined(DATA_TYPE_FP16)
-TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- vec2 data = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
- // Perform activation
- float a = data.x;
- float b = data.y;
- vec2 data_out;
-#ifdef LOGISTIC /*LOGISTIC*/
- data_out.x = logistic_op(a);
- data_out.y = logistic_op(b);
-#elif defined(TANH) /*TANH*/
- data_out.x = tanh_op(a);
- data_out.y = tanh_op(b);
-#elif defined(RELU) /*RELU*/
- data_out.x = relu_op(a);
- data_out.y = relu_op(b);
-#elif defined(BRELU) /*BRELU*/
- data_out.x = brelu_op(a);
- data_out.y = brelu_op(b);
-#elif defined(LU_BRELU) /*LU_BRELU*/
- data_out.x = lu_brelu_op(a);
- data_out.y = lu_brelu_op(b);
-#elif defined(LRELU) /*LRELU*/
- data_out.x = lrelu_op(a);
- data_out.y = lrelu_op(b);
-#elif defined(SRELU) /*SRELU*/
- data_out.x = srelu_op(a);
- data_out.y = srelu_op(b);
-#elif defined(ELU) /*ELU*/
- data_out.x = elu_op(a);
- data_out.y = elu_op(b);
-#elif defined(ABS) /*ABS*/
- data_out.x = abs_op(a);
- data_out.y = abs_op(b);
-#elif defined(SQUARE) /*SQUARE*/
- data_out.x = square_op(a);
- data_out.y = square_op(b);
-#elif defined(SQRT) /*SQRT*/
- data_out.x = sqrt_op(a);
- data_out.y = sqrt_op(b);
-#elif defined(LINEAR) /*LINEAR*/
- data_out.x = linear_op(a);
- data_out.y = linear_op(b);
-#elif defined(IDENTITY) /*IDENTITY*/
- data_out.x = identity_op(a);
- data_out.y = identity_op(b);
-#else /*LOGISTIC*/
-#error Activation function not provided
-#endif /*LOGISTIC*/
-
- STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data_out);
-}
-#endif /*DATA_TYPE_FP16*/
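
Both variants above are compiled once per activation, with the host selecting the function through a preprocessor define; the FP16 variant additionally processes two half values per invocation to match the packed storage. A sketch of the same compile-time selection in C++, under the assumption that exactly one activation macro is defined (only two of the branches are shown):

    #include <algorithm>
    #include <cmath>

    #define RELU // stand-in for the define the host injects at build time

    float activate(float x)
    {
    #if defined(LOGISTIC)
        return 1.0f / (1.0f + std::exp(-x));
    #elif defined(RELU)
        return std::max(0.0f, x);
    #else
    #error Activation function not provided
    #endif
    }
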
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h
deleted file mode 100644
index e353b744ea..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/activation_layer_helpers_cs.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef DATA_TYPE_FP32
-precision highp float;
-#elif defined(DATA_TYPE_FP16)
-#if defined(LOGISTIC) || defined(TANH) || defined(SRELU) || defined(SQRT)
-precision highp float;
-#else /*LOGISTIC_TANH_SRELU_SQRT*/
-precision mediump float;
-#endif /*LOGISTIC_TANH_SRELU_SQRT*/
-#endif /*DATA_TYPE_FP32*/
-
-#define ABS_OP(a) abs((a))
-#define ADD_OP(a, b) ((a) + (b))
-#define SUB_OP(a, b) ((a) - (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define MLA_OP(a, b, c) ((b) * (c) + (a))
-#define DIV_OP(a, b) ((a) / (b))
-#define EXP_OP(a) exp((a))
-#define LOG_OP(a) log((a))
-#define SQRT_OP(a) sqrt((a))
-#define CONST_ONE (1.f)
-
-// Logistic Activation
-float logistic_op(float x)
-{
- return DIV_OP(CONST_ONE, ADD_OP(CONST_ONE, EXP_OP(-x)));
-}
-vec4 logistic_op(vec4 x)
-{
- return DIV_OP(vec4(CONST_ONE), ADD_OP(CONST_ONE, EXP_OP(-x)));
-}
-// Hyperbolic Tangent Activation
-float tanh_op(float x)
-{
- float tmp = float(B_VAL) * x;
- if(tmp > 10.f)
- {
- return MUL_OP(float(A_VAL), 1.f);
- }
- else if(tmp < -10.f)
- {
- return MUL_OP(float(A_VAL), -1.f);
- }
- else
- {
- return MUL_OP(float(A_VAL), tanh(tmp + 0.000001f));
- }
-}
-// RELU Activation
-float relu_op(float x)
-{
- return max(0.f, x);
-}
-vec4 relu_op(vec4 x)
-{
- return max(vec4(0.f), x);
-}
-// Bounded RELU Activation
-float brelu_op(float x)
-{
- return min(float(A_VAL), max(float(0.0), x));
-}
-// Lower Upper Bounded RELU Activation
-float lu_brelu_op(float x)
-{
- return min(max(x, float(B_VAL)), float(A_VAL));
-}
-// Leaky RELU Activation
-float lrelu_op(float x)
-{
- return (x > float(0.0)) ? x : MUL_OP(float(A_VAL), x);
-}
-// Soft RELU Activation
-float srelu_op(float x)
-{
- return LOG_OP(ADD_OP(CONST_ONE, EXP_OP(x)));
-}
-// ELU Activation
-float elu_op(float x)
-{
- return (x >= float(0.0)) ? x : MUL_OP(float(A_VAL), SUB_OP(EXP_OP(x), CONST_ONE));
-}
-// Absolute Activation
-float abs_op(float x)
-{
- return ABS_OP(x);
-}
-// Square Activation
-float square_op(float x)
-{
- return MUL_OP(x, x);
-}
-// Square-root Activation
-float sqrt_op(float x)
-{
- return SQRT_OP(x);
-}
-// Linear Activation
-float linear_op(float x)
-{
- return MLA_OP(float(B_VAL), float(A_VAL), x);
-}
-
-// Identity Activation
-float identity_op(float x)
-{
- return x;
-}
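
One detail worth noting in the helpers above: tanh_op() clamps B_VAL * x to the interval [-10, 10] before calling tanh(), so the mediump arithmetic behind tanh cannot overflow, and then scales the result by A_VAL. A scalar C++ sketch of that logic, with a_val/b_val passed as parameters instead of compile-time defines:

    #include <cmath>

    // Bounded tanh activation: a * tanh(b * x), with the argument clamped
    // where tanh has already saturated, as in the deleted shader helper.
    float tanh_op(float x, float a_val, float b_val)
    {
        const float tmp = b_val * x;
        if(tmp > 10.0f)
        {
            return a_val; // tanh(tmp) is ~1 here
        }
        if(tmp < -10.0f)
        {
            return -a_val; // tanh(tmp) is ~-1 here
        }
        return a_val * std::tanh(tmp);
    }
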
diff --git a/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs b/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
deleted file mode 100755
index faaf204c62..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/arithmetic_add.cs
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2016-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-precision mediump float;
-#define ADD(x, y) ((x) + (y))
-
-/** This function adds two tensors.
- *
- * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F16
- * @param[in] src1_attrs The attributes of the first source tensor
- * @param[in] src2_ptr Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
- * @param[in] src2_attrs The attributes of the second source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src1_attrs;
- Tensor3DAttributes src2_attrs;
- Tensor3DAttributes dst_attrs;
-};
-
-TENSOR_DECLARATION(1, src1Buffer, uvec4, src1_ptr, src1_shift, 4, readonly);
-TENSOR_DECLARATION(2, src2Buffer, uvec4, src2_ptr, src2_shift, 4, readonly);
-TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-
-void main(void)
-{
- Tensor3DIterator src1_iter = CONVERT_TO_TENSOR3D_ITERATOR(src1_attrs, src1_shift);
- Tensor3DIterator src2_iter = CONVERT_TO_TENSOR3D_ITERATOR(src2_attrs, src2_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- vec4 tmp1[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter);
- vec4 tmp2[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src2_ptr, src2_iter);
- vec4 addition[2];
- addition[0] = ADD(tmp1[0], tmp2[0]);
- addition[1] = ADD(tmp1[1], tmp2[1]);
-
- STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, addition);
-}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
deleted file mode 100644
index f38a90b947..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-#if defined(DATA_TYPE_FP16)
-precision mediump float;
-#endif /*DATA_TYPE_FP16*/
-
-#define ADD_OP(a, b) ((a) + (b))
-#define SUB_OP(a, b) ((a) - (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define INVSQRT_OP(a) inversesqrt((a))
-#define SQCVT_SAT(a) (a)
-
-#if defined(LU_BRELU)
-#define ACTIVATION_FUNC(x) min(max(x, float(B_VAL)), float(A_VAL))
-#elif defined(BRELU)
-#define ACTIVATION_FUNC(x) min(max(x, float(0)), float(A_VAL))
-#elif defined(RELU)
-#define ACTIVATION_FUNC(x) max(x, float(0))
-#else /* defined(FUSED_ACT) */
-#define ACTIVATION_FUNC(x) (x)
-#endif /* defined(FUSED_ACT) */
-
-/** Apply batch normalization.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @note Epsilon parameter in the batch normalization equation should be given as a preprocessor argument using "#define EPSILON". e.g. "#define EPSILON 0.1"
- * @note Beta is optional with default value of 0. If not provided, the preprocessor argument "USE_DEFAULT_BETA" should be given
- * @note Gamma is optional with default value of 1. If not provided, the preprocessor argument "USE_DEFAULT_GAMMA" should be given
- *
- * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in] mean_attrs The attributes of the mean tensor
- * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p src_ptr
- * @param[in] var_attrs The attributes of the var tensor
- * @param[in] beta_ptr (Optional) Pointer to the beta source tensor. If not provided, default value of beta is 0. Supported data types: same as @p src_ptr
- * @param[in] beta_attrs (Optional) The attributes of the beta tensor
- * @param[in] gamma_ptr (Optional) Pointer to the gamma source tensor. If not provided, default value of gamma is 1. Supported data types: same as @p src_ptr
- * @param[in] gamma_attrs (Optional) The attributes of the gamma tensor
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes dst_attrs;
- VectorAttributes mean_attrs;
- VectorAttributes var_attrs;
-#ifndef USE_DEFAULT_BETA
- VectorAttributes beta_attrs;
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- VectorAttributes gamma_attrs;
-#endif /* USE_DEFAULT_GAMMA */
-};
-
-#ifdef DATA_TYPE_FP32
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-TENSOR_DECLARATION(3, meanBuffer, float, mean_ptr, mean_shift, 2, readonly);
-TENSOR_DECLARATION(4, varBuffer, float, var_ptr, var_shift, 2, readonly);
-#ifndef USE_DEFAULT_BETA
-TENSOR_DECLARATION(5, betaBuffer, float, beta_ptr, beta_shift, 2, readonly);
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
-#ifdef USE_DEFAULT_BETA
-TENSOR_DECLARATION(5, gammaBuffer, float, gamma_ptr, gamma_shift, 2, readonly);
-#else /* USE_DEFAULT_BETA */
-TENSOR_DECLARATION(6, gammaBuffer, float, gamma_ptr, gamma_shift, 2, readonly);
-#endif /* USE_DEFAULT_BETA */
-#endif /* USE_DEFAULT_GAMMA */
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- VectorIterator mean_iter = CONVERT_TO_VECTOR_ITERATOR(mean_attrs, mean_shift);
- VectorIterator var_iter = CONVERT_TO_VECTOR_ITERATOR(var_attrs, var_shift);
-#ifndef USE_DEFAULT_BETA
- VectorIterator beta_iter = CONVERT_TO_VECTOR_ITERATOR(beta_attrs, beta_shift);
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- VectorIterator gamma_iter = CONVERT_TO_VECTOR_ITERATOR(gamma_attrs, gamma_shift);
-#endif /* USE_DEFAULT_GAMMA */
-
- float input_value = 0.f;
- float denominator = 0.f;
- float numerator = 0.f;
- float x_bar = 0.f;
-
- uint current_slice = gl_GlobalInvocationID.z;
-
- input_value = LOAD_CURRENT_ITEM(src_ptr, src_iter);
- denominator = LOAD(var_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(var_iter, current_slice * var_attrs.stride_x));
-    denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(EPSILON))));
-
- // Calculate x bar and store results
- numerator = LOAD(mean_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(mean_iter, current_slice * mean_attrs.stride_x));
- numerator = SUB_OP(input_value, numerator);
- x_bar = MUL_OP(numerator, denominator);
-
-#ifndef USE_DEFAULT_GAMMA
- float gamma_param = LOAD(gamma_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(gamma_iter, current_slice * gamma_attrs.stride_x));
-
- x_bar = MUL_OP(gamma_param, x_bar);
-#endif /* USE_DEFAULT_GAMMA */
-#ifndef USE_DEFAULT_BETA
- float beta_param = LOAD(beta_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(beta_iter, current_slice * beta_attrs.stride_x));
-
- x_bar = ADD_OP(x_bar, beta_param);
-#endif /* USE_DEFAULT_BETA */
-
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, ACTIVATION_FUNC(x_bar));
-}
-
-#elif defined(DATA_TYPE_FP16)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-TENSOR_DECLARATION(3, meanBuffer, uvec2, mean_ptr, mean_shift, 3, readonly);
-TENSOR_DECLARATION(4, varBuffer, uvec2, var_ptr, var_shift, 3, readonly);
-#ifndef USE_DEFAULT_BETA
-TENSOR_DECLARATION(5, betaBuffer, uvec2, beta_ptr, beta_shift, 3, readonly);
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
-#ifdef USE_DEFAULT_BETA
-TENSOR_DECLARATION(5, gammaBuffer, uvec2, gamma_ptr, gamma_shift, 3, readonly);
-#else /* USE_DEFAULT_BETA */
-TENSOR_DECLARATION(6, gammaBuffer, uvec2, gamma_ptr, gamma_shift, 3, readonly);
-#endif /* USE_DEFAULT_BETA */
-#endif /* USE_DEFAULT_GAMMA */
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- VectorIterator mean_iter = CONVERT_TO_VECTOR_ITERATOR(mean_attrs, mean_shift);
- VectorIterator var_iter = CONVERT_TO_VECTOR_ITERATOR(var_attrs, var_shift);
-#ifndef USE_DEFAULT_BETA
- VectorIterator beta_iter = CONVERT_TO_VECTOR_ITERATOR(beta_attrs, beta_shift);
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- VectorIterator gamma_iter = CONVERT_TO_VECTOR_ITERATOR(gamma_attrs, gamma_shift);
-#endif /* USE_DEFAULT_GAMMA */
-
- vec4 unpacked_s[5];
- float denominator;
- float numerator;
- float gamma_param = 1.f;
- float beta_param = 0.f;
- vec4 x_bar;
- vec4 result;
-
- uint current_slice = gl_GlobalInvocationID.z;
- unpacked_s[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
- unpacked_s[1] = LOAD_UNPACK4_HALF(var_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(var_iter, current_slice * var_attrs.stride_x));
- unpacked_s[2] = LOAD_UNPACK4_HALF(mean_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(mean_iter, current_slice * mean_attrs.stride_x));
-#ifndef USE_DEFAULT_GAMMA
- unpacked_s[3] = LOAD_UNPACK4_HALF(gamma_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(gamma_iter, current_slice * gamma_attrs.stride_x));
-#endif /* USE_DEFAULT_GAMMA */
-#ifndef USE_DEFAULT_BETA
- unpacked_s[4] = LOAD_UNPACK4_HALF(beta_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(beta_iter, current_slice * beta_attrs.stride_x));
-#endif /* USE_DEFAULT_BETA */
-
- if((current_slice % uint(4)) == uint(0))
- {
- denominator = unpacked_s[1].x;
-        denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(EPSILON))));
-
- // Calculate x bar
- numerator = unpacked_s[2].x;
- x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
-
-#ifndef USE_DEFAULT_GAMMA
- gamma_param = unpacked_s[3].x;
-#endif /* USE_DEFAULT_GAMMA */
-#ifndef USE_DEFAULT_BETA
- beta_param = unpacked_s[4].x;
-#endif /* USE_DEFAULT_BETA */
- }
- else if((current_slice % uint(4)) == uint(1))
- {
- denominator = unpacked_s[1].y;
-        denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(EPSILON))));
-
- // Calculate x bar
- numerator = unpacked_s[2].y;
- x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
-
-#ifndef USE_DEFAULT_GAMMA
- gamma_param = unpacked_s[3].y;
-#endif /* USE_DEFAULT_GAMMA */
-#ifndef USE_DEFAULT_BETA
- beta_param = unpacked_s[4].y;
-#endif /* USE_DEFAULT_BETA */
- }
- else if((current_slice % uint(4)) == uint(2))
- {
- denominator = unpacked_s[1].z;
-        denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(EPSILON))));
-
- // Calculate x bar
- numerator = unpacked_s[2].z;
- x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
-
-#ifndef USE_DEFAULT_GAMMA
- gamma_param = unpacked_s[3].z;
-#endif /* USE_DEFAULT_GAMMA */
-#ifndef USE_DEFAULT_BETA
- beta_param = unpacked_s[4].z;
-#endif /* USE_DEFAULT_BETA */
- }
- else
- {
- denominator = unpacked_s[1].w;
-        denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(EPSILON))));
-
- // Calculate x bar
- numerator = unpacked_s[2].w;
- x_bar = MUL_OP(SUB_OP(unpacked_s[0], numerator), denominator);
-
-#ifndef USE_DEFAULT_GAMMA
- gamma_param = unpacked_s[3].w;
-#endif /* USE_DEFAULT_GAMMA */
-#ifndef USE_DEFAULT_BETA
- beta_param = unpacked_s[4].w;
-#endif /* USE_DEFAULT_BETA */
- }
-
-#ifndef USE_DEFAULT_GAMMA
- x_bar = MUL_OP(gamma_param, x_bar);
-#endif /* USE_DEFAULT_GAMMA */
-#ifndef USE_DEFAULT_BETA
- x_bar = ADD_OP(x_bar, beta_param);
-#endif /* USE_DEFAULT_BETA */
-
- result = ACTIVATION_FUNC(x_bar);
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
-}
-#endif /*DATA_TYPE_FP16*/
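
Whatever the data type, the kernel above evaluates the same per-element expression, x_bar = gamma * (x - mean) * inversesqrt(var + epsilon) + beta, followed by the optionally fused activation; the FP16 path only adds lane selection on the packed mean/var/beta/gamma vectors. A scalar C++ sketch of that expression (the fused activation is left to the caller):

    #include <cmath>

    // Per-element batch normalization, mirroring the shader's arithmetic.
    float batch_normalize(float x, float mean, float var,
                          float gamma, float beta, float epsilon)
    {
        const float denominator = 1.0f / std::sqrt(var + epsilon);
        const float x_bar       = (x - mean) * denominator;
        return gamma * x_bar + beta;
    }
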
diff --git a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
deleted file mode 100644
index d1d1a8632f..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-#if defined(DATA_TYPE_FP16)
-precision mediump float;
-#endif /*DATA_TYPE_FP16*/
-
-/** This kernel concatenates the input tensor into the output tensor along the third dimension
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes dst_attrs;
-};
-
-#ifdef DATA_TYPE_FP32
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- float tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
-}
-
-#elif defined(DATA_TYPE_FP16)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- uvec2 tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
-}
-#endif /*DATA_TYPE_FP16*/
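
The concatenation kernel is a plain element copy: the host points the destination iterator at the right depth offset before launch, so the shader never computes the concatenation axis itself. On the CPU the whole kernel collapses to an offset copy, sketched here for contiguous float tensors (concat_depth is a hypothetical helper):

    #include <cstddef>
    #include <cstring>

    // Copy one input tensor into the output at its depth offset.
    // 'dst_offset' is the element offset of the destination slice,
    // computed by the caller exactly as the host side would.
    void concat_depth(const float *src, float *dst, std::size_t dst_offset, std::size_t elements)
    {
        std::memcpy(dst + dst_offset, src, elements * sizeof(float));
    }
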
diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
deleted file mode 100644
index d40cbbbaf0..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
+++ /dev/null
@@ -1,791 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-#if defined(DATA_TYPE_FP16)
-precision mediump float;
-#endif // DATA_TYPE_FP16
-
-#ifdef RESHAPE_TO_COLUMNS
-
-/** This kernel reshapes the input tensor into a matrix used to perform convolution via GEMM.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @note If biases are to be added to the convolution, "#define HAS_BIAS" has to be passed so that a 1 is appended to each row of the final matrix.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- * @param[in]  biases_ptr    Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_attrs The attributes of the biases tensor
- * @param[in] width The width of the input tensor
- * @param[in] height The height of the input tensor
- * @param[in] depth The depth of the input tensor
- * @param[in] total_filters Total number of filters. 4th dimension of the weights matrix
- */
-
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- ImageAttributes dst_attrs;
-#ifdef HAS_BIAS
- VectorAttributes biases_attrs;
-#endif /* HAS_BIAS */
- uint width;
- uint height;
- uint depth;
- uint total_filters;
-};
-
-#if defined(DATA_TYPE_FP32)
-
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-#ifdef HAS_BIAS
-TENSOR_DECLARATION(3, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
-#endif /* HAS_BIAS */
-
-void main()
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
-#ifdef HAS_BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* HAS_BIAS */
-
- bool is_last_thread = (((int(gl_GlobalInvocationID.x)) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1)) && ((int(gl_GlobalInvocationID.y)) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1))
- && ((int(gl_GlobalInvocationID.z)) == (int(gl_NumWorkGroups.z * gl_WorkGroupSize.z) - 1)));
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, ((uint(gl_GlobalInvocationID.x) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.y) * uint(width) * uint(dst_attrs.stride_y)) + (uint(
- gl_GlobalInvocationID.z)
- * uint(width) * uint(height) * uint(dst_attrs.stride_y))));
- // Linearize convolution elements
- if(is_last_thread)
- {
- for(uint i = 0u; i < uint(total_filters); ++i)
- {
- float s0 = LOAD_CURRENT_ITEM(src_ptr, src_iter);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, s0);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
-#ifdef HAS_BIAS
- float b = LOAD_CURRENT_ITEM(biases_ptr, biases_iter);
- STORE(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_y), b);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(biases_iter, biases_attrs.stride_x);
-#endif /* HAS_BIAS */
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_x);
- }
- }
- else
- {
- for(uint i = 0u; i < uint(total_filters); ++i)
- {
- float s0 = LOAD_CURRENT_ITEM(src_ptr, src_iter);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, s0);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_x);
- }
- }
-}
-
-#elif defined(DATA_TYPE_FP16)
-
-TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
-#ifdef HAS_BIAS
-TENSOR_DECLARATION(3, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* HAS_BIAS */
-
-void main()
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
-#ifdef HAS_BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* HAS_BIAS */
-
- bool is_last_thread = (((int(gl_GlobalInvocationID.x)) == (int(gl_NumWorkGroups.x * gl_WorkGroupSize.x) - 1)) && ((int(gl_GlobalInvocationID.y)) == (int(gl_NumWorkGroups.y * gl_WorkGroupSize.y) - 1))
- && ((int(gl_GlobalInvocationID.z)) == (int(gl_NumWorkGroups.z * gl_WorkGroupSize.z) - 1)));
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, ((uint(gl_GlobalInvocationID.x) * uint(dst_attrs.stride_y)) + (uint(gl_GlobalInvocationID.y) * uint(width) * uint(dst_attrs.stride_y)) + (uint(
- gl_GlobalInvocationID.z)
- * uint(width) * uint(height) * uint(dst_attrs.stride_y))));
- // Linearize convolution elements
- if(is_last_thread)
- {
- for(uint i = 0u; i < uint(total_filters); i = i + 2u)
- {
- vec2 s0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
- vec2 s;
- if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
- {
- s.x = s0.x;
- }
- else
- {
- s.x = s0.y;
- }
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
-
- vec2 s1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
- if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
- {
- s.y = s1.x;
- }
- else
- {
- s.y = s1.y;
- }
- STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
-#ifdef HAS_BIAS
- vec2 b = LOAD_UNPACK2_CURRENT_ITEM_HALF(biases_ptr, biases_iter);
- STORE_PACK2_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_y), b);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(biases_iter, (2u * biases_attrs.stride_x));
-#endif /* HAS_BIAS */
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (2u * dst_attrs.stride_x));
- }
- }
- else
- {
- for(uint i = 0u; i < uint(total_filters); i = i + 2u)
- {
- vec2 s0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
- vec2 s;
- if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
- {
- s.x = s0.x;
- }
- else
- {
- s.x = s0.y;
- }
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
-
- vec2 s1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
- if(int(CURRENT_ITEM_OFFSET_IN_BYTES(src_iter) >> 1u) % 2 == 0)
- {
- s.y = s1.x;
- }
- else
- {
- s.y = s1.y;
- }
- STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (depth * src_attrs.stride_z));
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (2u * dst_attrs.stride_x));
- }
- }
-}
-
-#endif /* DATA_TYPE_FP32 */
-#endif // RESHAPE_TO_COLUMNS
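
In the RESHAPE_TO_COLUMNS path above, each work item owns one weight coordinate (x, y, z) and gathers that element from every filter into a single destination row, with the last work item also appending the bias row. A condensed C++ sketch of that gather, assuming contiguous per-filter WHD weight storage (layout and names are illustrative):

    // One weight element per call: gather it across all filters, and let
    // the last work item append the bias row consumed by the GEMM.
    void reshape_weights_to_columns(const float *weights, const float *biases, float *dst,
                                    int w, int h, int d, int total_filters,
                                    int x, int y, int z, bool is_last_thread)
    {
        const int row         = x + y * w + z * w * h; // destination row index
        const int filter_size = w * h * d;

        for(int f = 0; f < total_filters; ++f)
        {
            dst[row * total_filters + f] = weights[f * filter_size + row];
            if(is_last_thread && (biases != nullptr))
            {
                dst[filter_size * total_filters + f] = biases[f]; // last row: bias of filter f
            }
        }
    }
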
-
-#ifdef IM2COL_GENERIC
-
-/** This kernel reshapes the input tensor into a matrix used to perform convolution via GEMM.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
- * @note PAD_LEFT/PAD_RIGHT/PAD_TOP/PAD_BOTTOM must be passed for padding info, e.g. "#define PAD_LEFT xxx"
- * @note KERNEL_WIDTH/KERNEL_HEIGHT/KERNEL_DEPTH must be passed for kernel dimension, e.g. "#define KERNEL_WIDTH xxx"
- * @note STRIDE_X/STRIDE_Y must be passed for stride info, e.g. "#define STRIDE_X xxx"
- * @note CONVOLVED_WIDTH/CONVOLVED_HEIGHT must be passed for convolved dimension, e.g. "#define CONVOLVED_WIDTH xxx"
- * @note SRC_WIDTH/SRC_HEIGHT must be passed for input dimension, e.g. "#define SRC_WIDTH xxx"
- * @note DILATION_X/DILATION_Y must be passed for dilation sizes, e.g. "#define DILATION_X xxx"
- * @note If biases are to be added to the convolution, "#define HAS_BIAS" has to be passed so that a 1 is appended to each row of the final matrix.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- ImageAttributes dst_attrs;
- uint src_stride_w;
- uint dst_stride_w;
-};
-
-#ifdef DATA_TYPE_FP32
-
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict);
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
-
- int xc = int(gl_GlobalInvocationID.x); // x coordinate in the convolved tensor
- int yc = int(gl_GlobalInvocationID.y); // y coordinate in the convolved tensor
- int ch = int(gl_GlobalInvocationID.z) % KERNEL_DEPTH; // input feature map
- int batch = int(gl_GlobalInvocationID.z) / KERNEL_DEPTH; // the batch
-
-    // Calculate input indices
- int xi = xc * STRIDE_X - PAD_LEFT;
- int yi = yc * STRIDE_Y - PAD_TOP;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, (ch * int(src_attrs.stride_z)) + (batch * int(src_stride_w)));
-
-    // Calculate output indices
- int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
- int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
- // sizeof is not available in GLES, so we'll use stride_x
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, (yo * int(dst_attrs.stride_y)) + (batch * int(dst_stride_w)) + xo * int(dst_attrs.stride_x));
-
- uint src_pos = 0u;
-
- // Linearize convolution elements
- for(int y = yi, y_e = yi + KERNEL_HEIGHT * DILATION_Y; y < y_e; y += DILATION_Y)
- {
- for(int x = xi, x_e = xi + KERNEL_WIDTH * DILATION_X; x < x_e; x += DILATION_X, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, int(dst_attrs.stride_x)))
- {
-#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
- src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * int(src_attrs.stride_x) + y * int(src_attrs.stride_y));
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos));
-#else /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */
- if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
- {
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, 0.0f);
- }
- else
- {
- src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * int(src_attrs.stride_x) + y * int(src_attrs.stride_y));
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, src_pos));
- }
-#endif /* PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 */
- }
- }
-
-#ifdef HAS_BIAS
- if(ch == (KERNEL_DEPTH - 1))
- {
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, 1.0f);
- }
-#endif /* HAS_BIAS */
-}
-
-#elif defined(DATA_TYPE_FP16)
-
-TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
-
-#ifdef KERNEL_1x1
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
-
- uint xc = gl_GlobalInvocationID.x;
- uint yc = gl_GlobalInvocationID.y;
- uint zc = gl_GlobalInvocationID.z;
- uint ch = zc % uint(KERNEL_DEPTH); // input feature map
- uint batch = zc / uint(KERNEL_DEPTH); // the batch
-
-    // Calculate input indices
- uint xi = xc;
- uint yi = yc;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, batch * src_stride_w + ch * src_attrs.step_z);
-
-    // Calculate output indices
- uint dst_element_count = dst_attrs.step_x / dst_attrs.stride_x;
- uint xo = ch * dst_element_count;
- uint yo = xc + yc * uint(CONVOLVED_WIDTH);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, batch * dst_stride_w + yo * dst_attrs.stride_y + xo);
-
- bool x_start_even = ((xc % 2u) == 0u);
- bool z_depth_even = ((uint(KERNEL_DEPTH) % 2u) == 0u);
- uint input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.stride_x + yi * src_attrs.stride_y);
- uint tmp_left = 0u;
- uint tmp_right = 0u;
-
- if(ch % 2u != 0u)
- {
- return;
- }
-
- if(z_depth_even || (!z_depth_even && (int(ch) < (KERNEL_DEPTH - 1))))
- {
- tmp_left = LOAD(src_ptr, input_pos);
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.stride_x + yi * src_attrs.stride_y + src_attrs.stride_z);
- tmp_right = LOAD(src_ptr, input_pos);
- if(x_start_even)
- {
- tmp_right = (tmp_left & 0xffffu) + (tmp_right << 16u);
- }
- else
- {
- tmp_right = (tmp_left >> 16u) + (tmp_right & 0xffff0000u);
- }
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x);
-
-#ifdef HAS_BIAS
- if(ch == (uint(KERNEL_DEPTH) - 2u))
- {
- mediump vec2 bias_vec = vec2(1.f, 0.f);
- uint bias_u = packHalf2x16(bias_vec);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, bias_u);
- }
-#endif /* HAS_BIAS */
- }
- else
- {
- tmp_left = LOAD(src_ptr, input_pos);
- if(x_start_even)
- {
- tmp_right = (tmp_left & 0xffffu);
- }
- else
- {
- tmp_right = (tmp_left >> 16u);
- }
-
-#ifdef HAS_BIAS
- mediump vec2 bias_vec = vec2(0.f, 1.f);
- uint bias_u = packHalf2x16(bias_vec);
- tmp_right += (bias_u & 0xffff0000u);
-#endif /* HAS_BIAS */
-
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
- }
-}
-
-#else /* KERNEL_1x1 */
-
-void main(void)
-{
- uint xc = gl_GlobalInvocationID.x;
- uint yc = gl_GlobalInvocationID.y;
- uint zc = gl_GlobalInvocationID.z;
- uint ch = zc % uint(KERNEL_DEPTH); // input feature map
- uint batch = zc / uint(KERNEL_DEPTH); // the batch
-
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
- Tensor3DIterator src_iter_b = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
-
-    // Calculate input indices
- uint src_element_count = src_attrs.step_x / src_attrs.stride_x;
- uint xi = (xc * uint(STRIDE_X)) / src_element_count;
- uint yi = yc * uint(STRIDE_Y);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, batch * src_stride_w + ch * src_attrs.stride_z);
-
-    // Calculate output indices
- uint dst_element_count = dst_attrs.step_x / dst_attrs.stride_x;
- uint xo = (ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT)) * dst_element_count;
- uint yo = xc + yc * uint(CONVOLVED_WIDTH);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, batch * dst_stride_w + yo * dst_attrs.stride_y + xo);
-
- bool x_start_even = ((xc * uint(STRIDE_X)) % 2u == 0u);
- bool z_start_even = ((ch % 2u) == 0u);
- uint input_pos = 0u;
- uint tmp = 0u;
- uint tmp_left = 0u;
- uint tmp_right = 0u;
-
- // Linearize convolution elements
- for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y)
- {
- uint xstart = 0u;
- uint xend = 0u;
-
- // even col, even row
- if(x_start_even)
- {
- if(((y - yi + ch) % 2u) == 0u)
- {
- for(uint x = xi, x_e = xi + (uint(KERNEL_WIDTH) / 2u); x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x))
- {
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, input_pos));
- }
- }
- else
- {
- // 1st pair
- if(!z_start_even && (y == yi))
- {
- // cross 2d feature map
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter_b, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (yi + uint(KERNEL_HEIGHT) - 1u) * src_attrs.stride_y + batch * src_stride_w +
- (ch - 1u) * src_attrs.stride_z);
- }
- else
- {
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter,
- (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (y - 1u) * src_attrs.stride_y);
- }
- tmp_right = LOAD(src_ptr, input_pos);
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y);
- tmp_left = LOAD(src_ptr, input_pos);
- tmp_right = (tmp_right & 0xffffu) + (tmp_left << 16u);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x);
-
- // remaining
- for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u) + 1u; x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x))
- {
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (x - 1u) * src_attrs.step_x + y * src_attrs.stride_y);
- tmp_left = LOAD(src_ptr, input_pos);
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
- tmp_right = LOAD(src_ptr, input_pos);
- tmp_right = (tmp_left >> 16u) + (tmp_right << 16u);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
- }
- }
- }
- else
- {
- if((((y - yi) % 2u) == 0u && !z_start_even) || (((y - yi) % 2u) != 0u && z_start_even))
- {
- // 1st pair
- if(y == yi)
- {
- // cross 2d feature map
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter_b, (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (yi + uint(KERNEL_HEIGHT) - 1u) * src_attrs.stride_y + batch * src_stride_w +
- (ch - 1u) * src_attrs.stride_z);
- }
- else
- {
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter,
- (xi + (uint(KERNEL_WIDTH) / 2u)) * src_attrs.step_x + (y - 1u) * src_attrs.stride_y);
- }
-
- tmp_right = LOAD(src_ptr, input_pos);
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y);
- tmp_left = LOAD(src_ptr, input_pos);
- tmp_right = (tmp_right >> 16u) + (tmp_left & 0xffff0000u);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x);
-
- // remaining
- for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u) + 1u; x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x))
- {
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD(src_ptr, input_pos));
- }
- }
- else if((((y - yi) % 2u) == 0u && z_start_even) || (((y - yi) % 2u) != 0u && !z_start_even))
- {
- // 1st pair
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, xi * src_attrs.step_x + y * src_attrs.stride_y);
- tmp_right = LOAD(src_ptr, input_pos);
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (xi + 1u) * src_attrs.step_x + y * src_attrs.stride_y);
- tmp_left = LOAD(src_ptr, input_pos);
- tmp_right = (tmp_right >> 16u) + (tmp_left << 16u);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x);
-
- // remaining
- for(uint x = xi + 1u, x_e = xi + (uint(KERNEL_WIDTH) / 2u); x < x_e; ++x, TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.step_x))
- {
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
- tmp_right = LOAD(src_ptr, input_pos);
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (x + 1u) * src_attrs.step_x + y * src_attrs.stride_y);
- tmp_left = LOAD(src_ptr, input_pos);
- tmp_right = (tmp_right >> 16u) + (tmp_left << 16u);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp_right);
- }
- }
- }
- }
-
- // NOTE: must handle last element manually instead of in loops
- // to avoid write conflict across 2d boundary
- if(ch == uint(KERNEL_DEPTH) - 1u)
- {
- uint x = xi + (uint(KERNEL_WIDTH) / 2u);
- uint y = yi + uint(KERNEL_HEIGHT) - 1u;
- input_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.step_x + y * src_attrs.stride_y);
- tmp = LOAD(src_ptr, input_pos);
- if(!x_start_even)
- {
- tmp = (tmp >> 16u) + (tmp << 16u);
- }
-
-#ifdef HAS_BIAS
- mediump vec2 bias_vec = vec2(1.f, 1.f);
- uint bias_u = packHalf2x16(bias_vec);
- if(z_start_even)
- {
- tmp = (tmp & 0xffffu) + (bias_u & 0xffff0000u);
- }
- else
- {
- tmp = (bias_u & 0xffffu);
- }
-#endif /* HAS_BIAS */
-
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, tmp);
- }
-}
-
-#endif /* KERNEL_1x1 */
-#else /* DATA_TYPE_FP32 */
-#error Data type not supported
-#endif /* DATA_TYPE_FP32 */
-#endif /* IM2COL_GENERIC */
-
-#ifdef IM2COL_REDUCED
-
-/** This kernel reshapes the lowest three dimensions of the tensor into a single row for the GEMM operation
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME", e.g. "#define DATA_TYPE_FP16"
- * @note In case biases are added at a later stage, "#define HAS_BIAS" has to be passed to append a 1 to each row of the final matrix.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- * @param[in] width The width of the input tensor
- * @param[in] height The height of the input tensor
- */
-
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- VectorAttributes dst_attrs;
- uint width;
- uint height;
-};
-
-#ifdef DATA_TYPE_FP32
-
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict);
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- VectorIterator dst_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift);
-
- uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
- uvec3 size = uvec3(gl_WorkGroupSize.xyz);
- uint image_size = width * height;
- uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x + pos.y * width + pos.z * image_size);
-
- STORE(dst_ptr, tmp_out_offset, LOAD_CURRENT_ITEM(src_ptr, src_iter));
-
-#ifdef HAS_BIAS
- // If it is the last thread in the 3 dimensional workgroup
- if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1))
- {
- tmp_out_offset += (dst_attrs.stride_x >> uint(2));
- STORE(dst_ptr, tmp_out_offset, 1.f);
- }
-#endif // HAS_BIAS
-}
-
-#elif defined(DATA_TYPE_FP16)
-
-#if defined(IM2COL_REDUCED_8X)
-TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, restrict);
-#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, restrict);
-#else /* IM2COL_REDUCED_8X */
-TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, restrict);
-#endif /* IM2COL_REDUCED_8X */
-
-#if defined(IM2COL_REDUCED_GENERIC)
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator src_nostep_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
- VectorIterator dst_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift);
-
- uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
- uvec3 size = uvec3(gl_WorkGroupSize.xyz);
- uint image_size = width * height;
- uint element_count = src_attrs.step_x / src_attrs.stride_x;
- uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * element_count + pos.y * width + pos.z * image_size);
- uint width_fp16 = (width + uint(1)) >> uint(1);
- uint tmp;
-
- // odd width
- if(width % uint(2) != uint(0))
- {
- // even row
- if((pos.y + pos.z * height) % uint(2) == uint(0))
- {
- // skip the last element of each line to avoid a write conflict, except on the last line
- if((pos.x < (width / element_count)) || ((pos.y == gl_NumWorkGroups.y - 1u) && (pos.z == gl_NumWorkGroups.z - 1u)))
- {
- tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
- STORE(dst_ptr, tmp_out_offset, tmp);
- }
- }
- else
- {
- // odd row: stitch packed fp16 halves across neighbouring elements
- uint tmp_left = uint(0);
- uint tmp_right = uint(0);
- tmp_right = LOAD_CURRENT_ITEM(src_ptr, src_iter); //right half
- if(pos.x == uint(0))
- {
- tmp_left = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, int(width), int(pos.y) - 1, int(pos.z))); //left half
- tmp_right = (tmp_left & uint(0xffff)) + (tmp_right << uint(16));
- }
- else
- {
- tmp_left = LOAD(src_ptr, TENSOR3D_OFFSET(src_nostep_iter, (int(pos.x) - 1) * int(element_count), int(pos.y), int(pos.z)));
- tmp_right = ((tmp_left >> uint(16)) + (tmp_right << uint(16)));
- }
- STORE(dst_ptr, tmp_out_offset, tmp_right);
- }
- }
- else
- {
- tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
- STORE(dst_ptr, tmp_out_offset, tmp);
- }
-
-#ifdef HAS_BIAS
- // If it is the last thread in the 3 dimensional workgroup
- if(pos.x == (size.x - 1u) && pos.y == (size.y - 1u) && pos.z == (size.z - 1u))
- {
- tmp_out_offset += (dst_attrs.stride_x >> dst_shift);
-
- // FIXME: need odd/even detection for tmp_out_offset?
- mediump vec2 bias_vec = vec2(1.0f, 1.0f);
- STORE_PACK2_HALF(dst_ptr, tmp_out_offset, bias_vec);
- }
-#endif // HAS_BIAS
-}
-
-#else /* IM2COL_REDUCED_GENERIC */
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- VectorIterator dst_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(dst_attrs, dst_shift);
-
- uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
-#if defined(IM2COL_REDUCED_8X)
- uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(8) + pos.y * width + pos.z * uint(IMAGE_SIZE));
- uvec4 tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
- STORE(dst_ptr, tmp_out_offset, tmp);
-#elif defined(IM2COL_REDUCED_4X) /* IM2COL_REDUCED_8X */
- uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(4) + pos.y * width + pos.z * uint(IMAGE_SIZE));
- uvec2 tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
- STORE(dst_ptr, tmp_out_offset, tmp);
-#else /* IM2COL_REDUCED_8X */
- uint tmp_out_offset = VECTOR_OFFSET(dst_iter, pos.x * uint(2) + pos.y * width + pos.z * uint(IMAGE_SIZE));
- uint tmp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
- STORE(dst_ptr, tmp_out_offset, tmp);
-#endif /* IM2COL_REDUCED_8X */
-}
-
-#endif /* IM2COL_REDUCED_GENERIC */
-#else /* DATA_TYPE_FP32 */
-#error Data type not supported
-#endif /* DATA_TYPE_FP32 */
-#endif /* IM2COL_REDUCED */
-
-#ifdef COL2IM
-#ifdef WIDTH_OUTPUT
-
-/** This kernel performs a reshaping of the output of the convolution layer.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME", e.g. "#define DATA_TYPE_FP32"
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- * @param[in] dst_depth The length of the destination tensor in Z dimension
- * @param[in] dst_strideZ The actual stride of the destination tensor in Z dimension
- */
-
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes dst_attrs;
- uint dst_depth;
- uint dst_strideZ;
-};
-
-#ifdef DATA_TYPE_FP32
-
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict);
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, pos.x * src_attrs.step_y + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ);
-
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter));
-}
-
-#elif defined(DATA_TYPE_FP16)
-
-TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, restrict);
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- uvec3 pos = uvec3(gl_GlobalInvocationID.xyz);
-
- if((pos.z % dst_depth) % 2u == 0u)
- {
- uint common_offset_in_bytes = pos.x * src_attrs.step_y * 2u + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ;
- uint tmp1_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes);
- uint tmp2_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes + src_attrs.step_y);
- vec2 tmp1 = LOAD_UNPACK2_HALF(src_ptr, tmp1_in_offset);
- vec2 tmp2 = LOAD_UNPACK2_HALF(src_ptr, tmp2_in_offset);
- vec2 result = vec2(tmp1.x, tmp2.x);
- STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
- }
- else
- {
- uint common_offset_in_bytes = pos.x * src_attrs.step_y * 2u + pos.y * uint(WIDTH_OUTPUT) * src_attrs.step_y + (pos.z % dst_depth) * src_attrs.stride_x + (pos.z / dst_depth) * dst_strideZ - 2u;
- uint tmp1_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes);
- uint tmp2_in_offset = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, common_offset_in_bytes + src_attrs.step_y);
- vec2 tmp1 = LOAD_UNPACK2_HALF(src_ptr, tmp1_in_offset);
- vec2 tmp2 = LOAD_UNPACK2_HALF(src_ptr, tmp2_in_offset);
- vec2 result = vec2(tmp1.y, tmp2.y);
- STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
- }
-}
-
-#else /* DATA_TYPE_FP32 */
-#error Data type not supported
-#endif /* DATA_TYPE_FP32 */
-#endif /* WIDTH_OUTPUT */
-#endif /* COL2IM */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
deleted file mode 100644
index 3e7e1fd351..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/depthwise_convolution3x3.cs
+++ /dev/null
@@ -1,316 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-#if defined(DATA_TYPE_FP16)
-precision mediump float;
-#endif // DATA_TYPE_FP16
-
-/** This kernel performs a depthwise convolution.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME", e.g. "#define DATA_TYPE_FP16"
- * @note This kernel has multiple optimized depthwise convolution options for FP16.
- * The depthwise convolution option must be passed at compile time using "#define PROCESS_nX_nY_nZ" e.g. "#define PROCESS_4X_1Y_1Z"
- * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
- * @note In case biases are added to the convolution, "#define BIAS" has to be passed at compile time.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_attrs The attributes of the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_attrs The attributes of the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_attrs The attributes of the biases tensor
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes dst_attrs;
- Tensor3DAttributes weights_attrs;
-#ifdef BIAS
- VectorAttributes biases_attrs;
-#endif /* BIAS */
-};
-
-#if defined(DATA_TYPE_FP16)
-#if defined(PROCESS_4X_3Y_1Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uvec2, weights_ptr, weights_shift, 3, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#define LOAD_UNPACK_SWIZZLE(offset) load_unpack_swizzle_stride1(offset)
-
-vec4 convolve1x3(vec4 s[3], vec4 w)
-{
- vec4 r;
-
- r = s[0] * w[0] + s[1] * w[1] + s[2] * w[2];
-
- return r;
-}
-
-vec4[3] load_unpack_swizzle_stride1(uint offset)
-{
- vec4 s[2];
- s = VLOAD2_UNPACK8_HALF(src_ptr, offset);
-
- vec4 r[3];
- r[0] = s[0];
- r[1] = vec4(s[0].yzw, s[1].x);
- r[2] = vec4(s[0].zw, s[1].xy);
-
- return r;
-}
-
-void main()
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels[3];
- for(int i = 0; i < 3; i++)
- {
- pixels[i] = vec4(0);
- }
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_attrs.stride_z);
-
- src_iter.current_offset_in_bytes -= int((z_index - z_index / uint(DEPTH_MULTIPLIER)) * src_attrs.step_z);
-
- vec4 w[3];
- w[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
- w[1] = LOAD_UNPACK4_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
- w[2] = LOAD_UNPACK4_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
-
- vec4 s[3];
- vec4 r;
- // first line
- s = LOAD_UNPACK_SWIZZLE(CURRENT_ITEM_OFFSET(src_iter));
-
- r = convolve1x3(s, w[0]);
- pixels[0] += r;
-
- // second line
- s = LOAD_UNPACK_SWIZZLE(TENSOR3D_OFFSET(src_iter, 0, 1, 0));
-
- r = convolve1x3(s, w[1]);
- pixels[0] += r;
- r = convolve1x3(s, w[0]);
- pixels[1] += r;
-
- // third line
- s = LOAD_UNPACK_SWIZZLE(TENSOR3D_OFFSET(src_iter, 0, 2, 0));
-
- r = convolve1x3(s, w[2]);
- pixels[0] += r;
- r = convolve1x3(s, w[1]);
- pixels[1] += r;
- r = convolve1x3(s, w[0]);
- pixels[2] += r;
-
- // fourth line
- s = LOAD_UNPACK_SWIZZLE(TENSOR3D_OFFSET(src_iter, 0, 3, 0));
-
- r = convolve1x3(s, w[2]);
- pixels[1] += r;
- r = convolve1x3(s, w[1]);
- pixels[2] += r;
-
- // fifth line
- s = LOAD_UNPACK_SWIZZLE(TENSOR3D_OFFSET(src_iter, 0, 4, 0));
-
- r = convolve1x3(s, w[2]);
- pixels[2] += r;
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
-
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
- if(z_index % uint(2) == uint(0))
- {
- b = vec2_b.x;
- }
- else
- {
- b = vec2_b.y;
- }
-
- for(int i = 0; i < 3; i++)
- {
- pixels[i] += vec4(b);
- }
-#endif /* BIAS */
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
-}
-#elif defined(PROCESS_4X_1Y_1Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uvec2, weights_ptr, weights_shift, 3, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 3
-#define LOAD_UNPACK_SWIZZLE(offset) load_unpack_swizzle_stride3(offset)
-#elif STRIDE_X == 2
-#define LOAD_UNPACK_SWIZZLE(offset) load_unpack_swizzle_stride2(offset)
-#elif STRIDE_X == 1 /* STRIDE_X == 1 */
-#define LOAD_UNPACK_SWIZZLE(offset) load_unpack_swizzle_stride1(offset)
-#else /* STRIDE_X not equal to 1, 2 or 3 */
-#error STRIDE_X other than 1, 2 or 3 is not supported
-#endif /* STRIDE_X == 3 */
-
-vec4 convolve1x3(vec4 s[3], vec4 w)
-{
- vec4 r;
-
- r = s[0] * w[0] + s[1] * w[1] + s[2] * w[2];
-
- return r;
-}
-
-vec4[3] load_unpack_swizzle_stride1(uint offset)
-{
- vec4 s[2];
- s = VLOAD2_UNPACK8_HALF(src_ptr, offset);
-
- vec4 r[3];
- r[0] = s[0];
- r[1] = vec4(s[0].yzw, s[1].x);
- r[2] = vec4(s[0].zw, s[1].xy);
-
- return r;
-}
-
-vec4[3] load_unpack_swizzle_stride2(uint offset)
-{
- vec4 s[3];
- s[0] = LOAD_UNPACK4_HALF(src_ptr, offset);
- s[1] = LOAD_UNPACK4_HALF(src_ptr, offset + uint(1));
- s[2] = LOAD_UNPACK4_HALF(src_ptr, offset + uint(2));
-
- vec4 r[3];
- r[0] = vec4(s[0].xz, s[1].xz);
- r[1] = vec4(s[0].yw, s[1].yw);
- r[2] = vec4(s[0].z, s[1].xz, s[2].x);
-
- return r;
-}
-
-vec4[3] load_unpack_swizzle_stride3(uint offset)
-{
- vec4 s[3];
- s[0] = LOAD_UNPACK4_HALF(src_ptr, offset);
- s[1] = LOAD_UNPACK4_HALF(src_ptr, offset + uint(1));
- s[2] = LOAD_UNPACK4_HALF(src_ptr, offset + uint(2));
-
- vec4 r[3];
- r[0] = vec4(s[0].xw, s[1].z, s[2].y);
- r[1] = vec4(s[0].y, s[1].xw, s[2].z);
- r[2] = vec4(s[0].z, s[1].y, s[2].xw);
-
- return r;
-}
-
-void main()
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels = vec4(0.f);
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_attrs.stride_z);
-
- src_iter.current_offset_in_bytes -= int((z_index - z_index / uint(DEPTH_MULTIPLIER)) * src_attrs.step_z);
-
- vec4 w[3];
- w[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
- w[1] = LOAD_UNPACK4_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
- w[2] = LOAD_UNPACK4_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
-
- vec4 s[3];
- vec4 r;
- // first line
- s = LOAD_UNPACK_SWIZZLE(CURRENT_ITEM_OFFSET(src_iter));
-
- r = convolve1x3(s, w[0]);
- pixels += r;
-
- // second line
- s = LOAD_UNPACK_SWIZZLE(TENSOR3D_OFFSET(src_iter, 0, 1, 0));
-
- r = convolve1x3(s, w[1]);
- pixels += r;
-
- // third line
- s = LOAD_UNPACK_SWIZZLE(TENSOR3D_OFFSET(src_iter, 0, 2, 0));
-
- r = convolve1x3(s, w[2]);
- pixels += r;
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
-
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
- if(z_index % uint(2) == uint(0))
- {
- b = vec2_b.x;
- }
- else
- {
- b = vec2_b.y;
- }
-
- pixels += vec4(b);
-#endif /* BIAS */
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
-}
-#endif /* PROCESS_4X_3Y_1Z */
-#endif /* DATA_TYPE_FP16 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
deleted file mode 100644
index c455489468..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
+++ /dev/null
@@ -1,1057 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-#ifdef FUSED_ACTIVATION
-#include "activation_layer_helpers_cs.h"
-#endif /* FUSED_ACTIVATION */
-
-#if defined(DATA_TYPE_FP16)
-precision mediump float;
-#endif // DATA_TYPE_FP16
-
-/** This kernel performs a direct convolution to convolve the lowest three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @note This kernel has multiple optimized direct convolution options for FP16.
- * The direct convolution option must be passed at compile time using "#define PROCESS_nX_nY_nZ" e.g. "#define PROCESS_8X_1Y_1Z"
- * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
- * @note In case biases are added to the convolution, "#define BIAS" has to be passed at compile time.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_attrs The attributes of the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_attrs The attributes of the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes dst_attrs;
- Tensor3DAttributes weights_attrs;
-#ifdef BIAS
- VectorAttributes biases_attrs;
-#endif /* BIAS */
- uint weights_stride_w;
- uint weights_depth;
-};
-
-#if defined(DATA_TYPE_FP32)
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- float pixels = 0.f;
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
- float temp;
- float temp_weight;
- for(int d = 0; d < int(weights_depth); ++d)
- {
- temp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
- temp_weight = LOAD_CURRENT_ITEM(weights_ptr, weights_iter);
- pixels += temp * temp_weight;
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-
-#ifdef BIAS
- pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels = ACT_OP(pixels);
-#endif /* FUSED_ACTIVATION */
-
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
-}
-
-#elif defined(DATA_TYPE_FP16)
-#if defined(PROCESS_4X_1Y_1Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 2
-#define CONVOLVE(s, w) convolve_stride2(s, w)
-#elif STRIDE_X == 1 /* STRIDE_X == 1 */
-#define CONVOLVE(s, w) convolve_stride1(s, w)
-#else /* STRIDE_X not equals 1 or 2 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 2 */
-
-vec4 convolve_stride1(ImageIterator src_iter, float w)
-{
- vec4 s;
- s = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
-
- s *= w;
-
- return s;
-}
-
-vec4 convolve_stride2(ImageIterator src_iter, float w)
-{
- vec4 s[2];
- vec4 r;
-
- s[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
- s[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, 0));
- r = vec4(s[0].xz, s[1].xz);
-
- r *= w;
-
- return r;
-}
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels = vec4(0.f);
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
-#ifdef WEIGHTS_OPTIMIZATION
- float w1, w2;
- int nums = (int(weights_depth)) / 2;
- for(int d = 0; d < nums; ++d)
- {
- vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
-
- w1 = vec2_w.x;
- vec4 r1 = CONVOLVE(src_iter, w1);
- pixels += r1;
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
-
- w2 = vec2_w.y;
- vec4 r2 = CONVOLVE(src_iter, w2);
- pixels += r2;
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
- }
-#else /* WEIGHTS_OPTIMIZATION */
- float w;
- for(int d = 0; d < int(weights_depth); ++d)
- {
- w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
-
- vec4 r = CONVOLVE(src_iter, w);
- pixels += r;
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-#endif /* WEIGHTS_OPTIMIZATION */
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
-
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
- if(z_index % uint(2) == uint(0))
- {
- b = vec2_b.x;
- }
- else
- {
- b = vec2_b.y;
- }
-
- pixels += b;
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels = ACT_OP(pixels);
-#endif /* FUSED_ACTIVATION */
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
-}
-#elif defined(PROCESS_4X_2Y_1Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 2
-#define CONVOLVE(s, w) convolve_stride2(s, w)
-#elif STRIDE_X == 1 /* STRIDE_X == 1 */
-#define CONVOLVE(s, w) convolve_stride1(s, w)
-#else /* STRIDE_X not equals 1 or 2 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 2 */
-
-vec4[2] convolve_stride1(ImageIterator src_iter, float w)
-{
- vec4 s[2];
- s[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
- s[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
-
- s[0] *= w;
- s[1] *= w;
-
- return s;
-}
-
-vec4[2] convolve_stride2(ImageIterator src_iter, float w)
-{
- vec4 s1[2];
- vec4 s2[2];
- vec4 r[2];
-
- s1[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
- s1[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, 0));
- r[0] = vec4(s1[0].xz, s1[1].xz);
-
- s2[0] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
- s2[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, int(STRIDE_Y)));
- r[1] = vec4(s2[0].xz, s2[1].xz);
-
- r[0] *= w;
- r[1] *= w;
-
- return r;
-}
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels[2];
- pixels[0] = vec4(0.f);
- pixels[1] = vec4(0.f);
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
-#ifdef WEIGHTS_OPTIMIZATION
- float w1, w2;
- int nums = (int(weights_depth)) / 2;
- for(int d = 0; d < nums; ++d)
- {
- vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
-
- w1 = vec2_w.x;
- vec4 r1[2] = CONVOLVE(src_iter, w1);
- pixels[0] += r1[0];
- pixels[1] += r1[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
-
- w2 = vec2_w.y;
- vec4 r2[2] = CONVOLVE(src_iter, w2);
- pixels[0] += r2[0];
- pixels[1] += r2[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
- }
-#else /* WEIGHTS_OPTIMIZATION */
- float w;
- for(int d = 0; d < int(weights_depth); ++d)
- {
- w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
-
- vec4 r[2] = CONVOLVE(src_iter, w);
- pixels[0] += r[0];
- pixels[1] += r[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-#endif /* WEIGHTS_OPTIMIZATION */
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
-
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
- if(z_index % uint(2) == uint(0))
- {
- b = vec2_b.x;
- }
- else
- {
- b = vec2_b.y;
- }
-
- pixels[0] += b;
- pixels[1] += b;
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels[0] = ACT_OP(pixels[0]);
- pixels[1] = ACT_OP(pixels[1]);
-#endif /* FUSED_ACTIVATION */
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
-}
-#elif defined(PROCESS_4X_3Y_1Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 2
-#define CONVOLVE(s, w) convolve_stride2(s, w)
-#elif STRIDE_X == 1 /* STRIDE_X == 1 */
-#define CONVOLVE(s, w) convolve_stride1(s, w)
-#else /* STRIDE_X not equals 1 or 2 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 2 */
-
-vec4[3] convolve_stride1(ImageIterator src_iter, float w)
-{
- vec4 s[3];
- s[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
- s[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
- s[2] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, (2 * int(STRIDE_Y))));
-
- s[0] *= w;
- s[1] *= w;
- s[2] *= w;
-
- return s;
-}
-
-vec4[3] convolve_stride2(ImageIterator src_iter, float w)
-{
- vec4 s1[2];
- vec4 s2[2];
- vec4 s3[2];
- vec4 r[3];
-
- s1[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
- s1[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, 0));
- r[0] = vec4(s1[0].xz, s1[1].xz);
-
- s2[0] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
- s2[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, int(STRIDE_Y)));
- r[1] = vec4(s2[0].xz, s2[1].xz);
-
- s3[0] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, (2 * int(STRIDE_Y))));
- s3[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, (2 * int(STRIDE_Y))));
- r[2] = vec4(s3[0].xz, s3[1].xz);
-
- r[0] *= w;
- r[1] *= w;
- r[2] *= w;
-
- return r;
-}
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels[3];
- pixels[0] = vec4(0.f);
- pixels[1] = vec4(0.f);
- pixels[2] = vec4(0.f);
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
-#ifdef WEIGHTS_OPTIMIZATION
- float w1, w2;
- int nums = (int(weights_depth)) / 2;
- for(int d = 0; d < nums; ++d)
- {
- vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
-
- w1 = vec2_w.x;
- vec4 r1[3] = CONVOLVE(src_iter, w1);
- pixels[0] += r1[0];
- pixels[1] += r1[1];
- pixels[2] += r1[2];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
-
- w2 = vec2_w.y;
- vec4 r2[3] = CONVOLVE(src_iter, w2);
- pixels[0] += r2[0];
- pixels[1] += r2[1];
- pixels[2] += r2[2];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
- }
-#else /* WEIGHTS_OPTIMIZATION */
- float w;
- for(int d = 0; d < int(weights_depth); ++d)
- {
- w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
-
- vec4 r[3] = CONVOLVE(src_iter, w);
- pixels[0] += r[0];
- pixels[1] += r[1];
- pixels[2] += r[2];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-#endif /* WEIGHTS_OPTIMIZATION */
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
-
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
- if(z_index % uint(2) == uint(0))
- {
- b = vec2_b.x;
- }
- else
- {
- b = vec2_b.y;
- }
-
- pixels[0] += b;
- pixels[1] += b;
- pixels[2] += b;
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels[0] = ACT_OP(pixels[0]);
- pixels[1] = ACT_OP(pixels[1]);
- pixels[2] = ACT_OP(pixels[2]);
-#endif /* FUSED_ACTIVATION */
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
-}
-#elif defined(PROCESS_4X_4Y_1Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 2
-#define CONVOLVE(s, w, x1, y1) convolve_stride2(s, w, x1, y1)
-#elif STRIDE_X == 1 /* STRIDE_X == 1 */
-#define CONVOLVE(s, w, x1, y1) convolve_stride1(s, w, x1, y1)
-#else /* STRIDE_X not equals 1 or 2 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 2 */
-
-vec4[2] convolve_stride1(ImageIterator src_iter, float w, int x1, int y1)
-{
- vec4 s[2];
- s[0] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, y1));
- s[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, (y1 + int(STRIDE_Y))));
-
- s[0] *= w;
- s[1] *= w;
-
- return s;
-}
-
-vec4[2] convolve_stride2(ImageIterator src_iter, float w, int x1, int y1)
-{
- vec4 s1[2];
- vec4 s2[2];
- vec4 r[2];
-
- s1[0] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, y1));
- s1[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, (4 + x1), y1));
- r[0] = vec4(s1[0].xz, s1[1].xz);
-
- s2[0] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, (y1 + int(STRIDE_Y))));
- s2[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, (4 + x1), (y1 + int(STRIDE_Y))));
- r[1] = vec4(s2[0].xz, s2[1].xz);
-
- r[0] *= w;
- r[1] *= w;
-
- return r;
-}
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels[2];
- vec4 pixels1[2];
- pixels[0] = vec4(0.f);
- pixels[1] = vec4(0.f);
- pixels1[0] = vec4(0.f);
- pixels1[1] = vec4(0.f);
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
-#ifdef WEIGHTS_OPTIMIZATION
- float w1, w2;
- int nums = (int(weights_depth)) / 2;
- for(int d = 0; d < nums; ++d)
- {
- vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
-
- w1 = vec2_w.x;
- vec4 r1[2] = CONVOLVE(src_iter, w1, 0, 0);
- vec4 r2[2] = CONVOLVE(src_iter, w1, 0, (2 * int(STRIDE_Y)));
- pixels[0] += r1[0];
- pixels[1] += r1[1];
- pixels1[0] += r2[0];
- pixels1[1] += r2[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
-
- w2 = vec2_w.y;
- vec4 r3[2] = CONVOLVE(src_iter, w2, 0, 0);
- vec4 r4[2] = CONVOLVE(src_iter, w2, 0, (2 * int(STRIDE_Y)));
- pixels[0] += r3[0];
- pixels[1] += r3[1];
- pixels1[0] += r4[0];
- pixels1[1] += r4[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
- }
-#else /* WEIGHTS_OPTIMIZATION */
- float w;
- for(int d = 0; d < int(weights_depth); ++d)
- {
- w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
-
- vec4 r1[2] = CONVOLVE(src_iter, w, 0, 0);
- vec4 r2[2] = CONVOLVE(src_iter, w, 0, (2 * int(STRIDE_Y)));
- pixels[0] += r1[0];
- pixels[1] += r1[1];
- pixels1[0] += r2[0];
- pixels1[1] += r2[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-#endif /* WEIGHTS_OPTIMIZATION */
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
-
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
- if(z_index % uint(2) == uint(0))
- {
- b = vec2_b.x;
- }
- else
- {
- b = vec2_b.y;
- }
-
- pixels[0] += b;
- pixels[1] += b;
- pixels1[0] += b;
- pixels1[1] += b;
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels[0] = ACT_OP(pixels[0]);
- pixels[1] = ACT_OP(pixels[1]);
- pixels1[0] = ACT_OP(pixels1[0]);
- pixels1[1] = ACT_OP(pixels1[1]);
-#endif /* FUSED_ACTIVATION */
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels1[0]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 3, 0), pixels1[1]);
-}
-#elif defined(PROCESS_4X_2Y_2Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 2
-#define CONVOLVE(s, w) convolve_stride2(s, w)
-#elif STRIDE_X == 1 /* STRIDE_X == 1 */
-#define CONVOLVE(s, w) convolve_stride1(s, w)
-#else /* STRIDE_X not equals 1 or 2 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 2 */
-
-vec4[2] convolve_stride1(ImageIterator src_iter, float w)
-{
- vec4 s[2];
- s[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
- s[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
-
- s[0] *= w;
- s[1] *= w;
-
- return s;
-}
-
-vec4[2] convolve_stride2(ImageIterator src_iter, float w)
-{
- vec4 s1[2];
- vec4 s2[2];
- vec4 r[2];
-
- s1[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
- s1[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, 0));
- r[0] = vec4(s1[0].xz, s1[1].xz);
-
- s2[0] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, int(STRIDE_Y)));
- s2[1] = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 4, int(STRIDE_Y)));
- r[1] = vec4(s2[0].xz, s2[1].xz);
-
- r[0] *= w;
- r[1] *= w;
-
- return r;
-}
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- uint z_base_index = uint(gl_GlobalInvocationID.z) << uint(1);
-
- // store the original src offset in bytes
- int s_offset_in_bytes = src_iter.current_offset_in_bytes;
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_base_index * weights_stride_w);
-
- for(int z = 0; z < 2; ++z)
- {
- uint z_index = z_base_index + uint(z);
-
- src_iter.current_offset_in_bytes = s_offset_in_bytes;
-
- vec4 pixels[2];
- pixels[0] = vec4(0.f);
- pixels[1] = vec4(0.f);
-
-#ifdef WEIGHTS_OPTIMIZATION
- float w1, w2;
- int nums = (int(weights_depth)) / 2;
- for(int d = 0; d < nums; ++d)
- {
- vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
-
- w1 = vec2_w.x;
- vec4 r1[2] = CONVOLVE(src_iter, w1);
- pixels[0] += r1[0];
- pixels[1] += r1[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
-
- w2 = vec2_w.y;
- vec4 r2[2] = CONVOLVE(src_iter, w2);
- pixels[0] += r2[0];
- pixels[1] += r2[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
- }
-#else /* WEIGHTS_OPTIMIZATION */
- float w;
- for(int d = 0; d < int(weights_depth); ++d)
- {
- w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
-
- vec4 r[2] = CONVOLVE(src_iter, w);
- pixels[0] += r[0];
- pixels[1] += r[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-#endif /* WEIGHTS_OPTIMIZATION */
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
-
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
- if(z_index % uint(2) == uint(0))
- {
- b = vec2_b.x;
- }
- else
- {
- b = vec2_b.y;
- }
-
- pixels[0] += b;
- pixels[1] += b;
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels[0] = ACT_OP(pixels[0]);
- pixels[1] = ACT_OP(pixels[1]);
-#endif /* FUSED_ACTIVATION */
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_z);
- }
-}
-#elif defined(PROCESS_8X_1Y_1Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 2
-#define CONVOLVE(s, w) convolve_stride2(s, w)
-#elif STRIDE_X == 1 /* STRIDE_X == 1 */
-#define CONVOLVE(s, w) convolve_stride1(s, w)
-#else /* STRIDE_X not equals 1 or 2 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 2 */
-
-vec4[2] convolve_stride1(ImageIterator src_iter, float w)
-{
- vec4 s[2];
- s = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
-
- s[0] *= w;
- s[1] *= w;
-
- return s;
-}
-
-vec4[2] convolve_stride2(ImageIterator src_iter, float w)
-{
- vec4 s1[2];
- vec4 s2[2];
- vec4 r[2];
-
- s1 = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
- r[0] = vec4(s1[0].xz, s1[1].xz);
- s2 = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 8, 0));
- r[1] = vec4(s2[0].xz, s2[1].xz);
-
- r[0] *= w;
- r[1] *= w;
-
- return r;
-}
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels[2];
- pixels[0] = vec4(0.f);
- pixels[1] = vec4(0.f);
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
-#ifdef WEIGHTS_OPTIMIZATION
- float w1, w2;
- int nums = (int(weights_depth)) / 2;
- for(int d = 0; d < nums; ++d)
- {
- vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
-
- w1 = vec2_w.x;
- vec4 r1[2] = CONVOLVE(src_iter, w1);
- pixels[0] += r1[0];
- pixels[1] += r1[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
-
- w2 = vec2_w.y;
- vec4 r2[2] = CONVOLVE(src_iter, w2);
- pixels[0] += r2[0];
- pixels[1] += r2[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
- }
-#else /* WEIGHTS_OPTIMIZATION */
- float w;
- for(int d = 0; d < int(weights_depth); ++d)
- {
- w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
-
- vec4 r[2] = CONVOLVE(src_iter, w);
- pixels[0] += r[0];
- pixels[1] += r[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-#endif /* WEIGHTS_OPTIMIZATION */
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
-
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
- if(z_index % uint(2) == uint(0))
- {
- b = vec2_b.x;
- }
- else
- {
- b = vec2_b.y;
- }
-
- pixels[0] += b;
- pixels[1] += b;
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels[0] = ACT_OP(pixels[0]);
- pixels[1] = ACT_OP(pixels[1]);
-#endif /* FUSED_ACTIVATION */
-
- STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
-}
-#elif defined(PROCESS_8X_2Y_1Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 2
-#define CONVOLVE(s, w, x1, y1) convolve_stride2(s, w, x1, y1)
-#elif STRIDE_X == 1 /* STRIDE_X == 1 */
-#define CONVOLVE(s, w, x1, y1) convolve_stride1(s, w, x1, y1)
-#else /* STRIDE_X not equals 1 or 2 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 2 */
-
-vec4[2] convolve_stride1(ImageIterator src_iter, float w, int x1, int y1)
-{
- vec4 s[2];
- s = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, y1));
-
- s[0] *= w;
- s[1] *= w;
-
- return s;
-}
-
-vec4[2] convolve_stride2(ImageIterator src_iter, float w, int x1, int y1)
-{
- vec4 s1[2];
- vec4 s2[2];
- vec4 r[2];
-
- s1 = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, x1, y1));
- r[0] = vec4(s1[0].xz, s1[1].xz);
- s2 = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, (8 + x1), y1));
- r[1] = vec4(s2[0].xz, s2[1].xz);
-
- r[0] *= w;
- r[1] *= w;
-
- return r;
-}
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels[2];
- vec4 pixels1[2];
- pixels[0] = vec4(0.f);
- pixels[1] = vec4(0.f);
- pixels1[0] = vec4(0.f);
- pixels1[1] = vec4(0.f);
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
-#ifdef WEIGHTS_OPTIMIZATION
- float w1, w2;
- int nums = (int(weights_depth)) / 2;
- for(int d = 0; d < nums; ++d)
- {
- vec2 vec2_w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter);
-
- w1 = vec2_w.x;
- vec4 r1[2] = CONVOLVE(src_iter, w1, 0, 0);
- vec4 r2[2] = CONVOLVE(src_iter, w1, 0, (int(STRIDE_Y)));
- pixels[0] += r1[0];
- pixels[1] += r1[1];
- pixels1[0] += r2[0];
- pixels1[1] += r2[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
-
- w2 = vec2_w.y;
- vec4 r3[2] = CONVOLVE(src_iter, w2, 0, 0);
- vec4 r4[2] = CONVOLVE(src_iter, w2, 0, (int(STRIDE_Y)));
- pixels[0] += r3[0];
- pixels[1] += r3[1];
- pixels1[0] += r4[0];
- pixels1[1] += r4[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z * uint(2));
- }
-#else /* WEIGHTS_OPTIMIZATION */
- float w;
- for(int d = 0; d < int(weights_depth); ++d)
- {
- w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
-
- vec4 r1[2] = CONVOLVE(src_iter, w, 0, 0);
- vec4 r2[2] = CONVOLVE(src_iter, w, 0, (int(STRIDE_Y)));
- pixels[0] += r1[0];
- pixels[1] += r1[1];
- pixels1[0] += r2[0];
- pixels1[1] += r2[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-#endif /* WEIGHTS_OPTIMIZATION */
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
-
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
- if(z_index % uint(2) == uint(0))
- {
- b = vec2_b.x;
- }
- else
- {
- b = vec2_b.y;
- }
-
- pixels[0] += b;
- pixels[1] += b;
- pixels1[0] += b;
- pixels1[1] += b;
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels[0] = ACT_OP(pixels[0]);
- pixels[1] = ACT_OP(pixels[1]);
- pixels1[0] = ACT_OP(pixels1[0]);
- pixels1[1] = ACT_OP(pixels1[1]);
-#endif /* FUSED_ACTIVATION */
-
- STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
- STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels1);
-}
-#endif /* PROCESS_4X_1Y_1Z */
-#else /* DATA_TYPE_FP32 */
-#error Data type not supported
-#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
deleted file mode 100644
index c9a2121a88..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
+++ /dev/null
@@ -1,1155 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-#ifdef FUSED_ACTIVATION
-#include "activation_layer_helpers_cs.h"
-#endif /* FUSED_ACTIVATION */
-
-#if defined(DATA_TYPE_FP16)
-precision mediump float;
-#endif // DATA_TYPE_FP16
-
-/** This kernel performs a direct convolution to convolve the lowest three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @note This kernel has multiple optimized direct convolution options for FP16.
- * The direct convolution option must be passed at compile time using "#define PROCESS_nX_nY_nZ" e.g. "#define PROCESS_8X_1Y_1Z"
- * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
- * This OpenGL ES shader works with stride_x = 1 and 2
- * @note In case biases are added to the convolution, "#define BIAS" has to be passed at compile time.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_attrs The attributes of the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in] biases_attrs The attributes of the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes dst_attrs;
- Tensor3DAttributes weights_attrs;
-#ifdef BIAS
- VectorAttributes biases_attrs;
-#endif /* BIAS */
- uint weights_stride_w;
- uint weights_depth;
-};
-
-#if defined(DATA_TYPE_FP32)
-#if defined(PROCESS_1X_1Y_1Z)
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- float pixels = 0.f;
-
- uint z_index = gl_GlobalInvocationID.z;
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- vec3 temp;
- vec3 w;
-
- temp = VLOAD3(vec3, src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
- w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
-
- pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
-
- temp = VLOAD3(vec3, src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
- w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
-
- pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
-
- temp = VLOAD3(vec3, src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
- w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
-
- pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-
-#ifdef BIAS
- pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels = ACT_OP(pixels);
-#endif /* FUSED_ACTIVATION */
-
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
-}
-
-#elif defined(PROCESS_8X_1Y_1Z)
-
-TENSOR_DECLARATION(1, srcBuffer, vec4, src_ptr, src_shift, 4, readonly);
-TENSOR_DECLARATION(2, dstBuffer, vec4, dst_ptr, dst_shift, 4, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 2
-#define CONVOLVE1x3(offset, w) convolve1x3_stride2(offset, w)
-#elif STRIDE_X == 1 /* STRIDE_X == 1 */
-#define CONVOLVE1x3(offset, w) convolve1x3_stride1(offset, w)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 2 */
-
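-// Stride-1 path: three vec4 loads provide 12 consecutive inputs for 8 outputs; the
-// shifted "middle" and "right" windows are built with swizzles instead of extra loads.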
-vec4[2] convolve1x3_stride1(uint offset, vec3 w)
-{
- vec4 middle;
- vec4 right;
- vec4 tmp[3];
- vec4 r[2];
-
- tmp = VLOAD3(vec4[3], src_ptr, offset);
-
- middle = vec4(tmp[0].yzw, tmp[1].x);
- right = vec4(tmp[0].zw, tmp[1].xy);
-
- r[0] = tmp[0] * w[0] + middle * w[1] + right * w[2];
-
- middle = vec4(tmp[1].yzw, tmp[2].x);
- right = vec4(tmp[1].zw, tmp[2].xy);
-
- r[1] = tmp[1] * w[0] + middle * w[1] + right * w[2];
-
- return r;
-}
-
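-// Stride-2 path: even- and odd-indexed samples are regathered from the loaded vec4s
-// with swizzles, so each of the three filter taps stays a single vec4 multiply.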
-vec4[2] convolve1x3_stride2(uint offset, vec3 w)
-{
- vec4 left;
- vec4 middle;
- vec4 right;
- vec4 tmp1[3];
- vec4 tmp2[2];
- vec4 r[2];
-
- tmp1 = VLOAD3(vec4[3], src_ptr, offset);
-
- left = vec4(tmp1[0].xz, tmp1[1].xz);
- middle = vec4(tmp1[0].yw, tmp1[1].yw);
- right = vec4(tmp1[0].z, tmp1[1].xz, tmp1[2].x);
-
- r[0] = left * w[0] + middle * w[1] + right * w[2];
-
- tmp2 = VLOAD2(vec4[2], src_ptr, offset + uint(3));
-
- left = vec4(tmp1[2].xz, tmp2[0].xz);
- middle = vec4(tmp1[2].yw, tmp2[0].yw);
- right = vec4(tmp1[2].z, tmp2[0].xz, tmp2[1].x);
-
- r[1] = left * w[0] + middle * w[1] + right * w[2];
-
- return r;
-}
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels[2];
- pixels[0] = vec4(0);
- pixels[1] = vec4(0);
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- // load 3 weights once
- vec3 w;
- vec4 r[2];
-
- // first line
- w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
-
- r = CONVOLVE1x3(CURRENT_ITEM_OFFSET(src_iter), w);
- pixels[0] += r[0];
- pixels[1] += r[1];
-
- // second line
- w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
-
- r = CONVOLVE1x3(IMAGE_OFFSET(src_iter, 0, 1), w);
- pixels[0] += r[0];
- pixels[1] += r[1];
-
- // third line
- w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
-
- r = CONVOLVE1x3(IMAGE_OFFSET(src_iter, 0, 2), w);
- pixels[0] += r[0];
- pixels[1] += r[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-
-#ifdef BIAS
- float b = LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
- pixels[0] += vec4(b);
- pixels[1] += vec4(b);
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels[0] = ACT_OP(pixels[0]);
- pixels[1] = ACT_OP(pixels[1]);
-#endif /* FUSED_ACTIVATION */
-
- VSTORE2_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
-}
-
-#elif defined(PROCESS_4X_1Y_1Z)
-
-TENSOR_DECLARATION(1, srcBuffer, vec4, src_ptr, src_shift, 4, readonly);
-TENSOR_DECLARATION(2, dstBuffer, vec4, dst_ptr, dst_shift, 4, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 2
-#define CONVOLVE1x3(offset, w) convolve1x3_stride2(offset, w)
-#elif STRIDE_X == 1 /* STRIDE_X == 1 */
-#define CONVOLVE1x3(offset, w) convolve1x3_stride1(offset, w)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 2 */
-
-vec4 convolve1x3_stride1(uint offset, vec3 w)
-{
- vec4 tmp[2];
- vec4 middle;
- vec4 right;
-
- tmp = VLOAD2(vec4[2], src_ptr, offset);
-
- middle = vec4(tmp[0].yzw, tmp[1].x);
- right = vec4(tmp[0].zw, tmp[1].xy);
-
- tmp[1] = tmp[0] * w[0] + middle * w[1] + right * w[2];
-
- return tmp[1];
-}
-
-vec4 convolve1x3_stride2(uint offset, vec3 w)
-{
- vec4 left;
- vec4 middle;
- vec4 right;
-
- vec4 tmp[3];
-
- tmp = VLOAD3(vec4[3], src_ptr, offset);
-
- left = vec4(tmp[0].xz, tmp[1].xz);
- middle = vec4(tmp[0].yw, tmp[1].yw);
- right = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
-
- tmp[0] = left * w[0] + middle * w[1] + right * w[2];
-
- return tmp[0];
-}
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels;
- pixels = vec4(0.f);
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- // load 3 weights once
- vec3 w;
-
- // first line
- w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
- pixels += CONVOLVE1x3(CURRENT_ITEM_OFFSET(src_iter), w);
-
- // second line
- w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
- pixels += CONVOLVE1x3(IMAGE_OFFSET(src_iter, 0, 1), w);
-
- // third line
- w = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
- pixels += CONVOLVE1x3(IMAGE_OFFSET(src_iter, 0, 2), w);
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-
-#ifdef BIAS
- float b = LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
- pixels += b;
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels = ACT_OP(pixels);
-#endif /* FUSED_ACTIVATION */
-
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
-}
-
-#elif defined(PROCESS_4X_3Y_1Z)
-
-TENSOR_DECLARATION(1, srcBuffer, vec4, src_ptr, src_shift, 4, readonly);
-TENSOR_DECLARATION(2, dstBuffer, vec4, dst_ptr, dst_shift, 4, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#define CONVOLVE1x3(left, middle, right, w) convolve1x3_stride1(left, middle, right, w)
-
-vec4 convolve1x3_stride1(vec4 left, vec4 middle, vec4 right, vec3 w)
-{
- vec4 r;
-
- r = left * w[0] + middle * w[1] + right * w[2];
-
- return r;
-}
-
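-// Computing 3 output rows per invocation lets the 5 input rows be loaded only once
-// each: input row n contributes to output rows n-2..n through weights w[2]..w[0].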
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels[3];
- pixels[0] = vec4(0);
- pixels[1] = vec4(0);
- pixels[2] = vec4(0);
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- // load 3 weights once
- vec3 w[3];
-
- w[0] = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
- w[1] = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
- w[2] = VLOAD3(vec3, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
-
- vec4 s[2];
- vec4 middle;
- vec4 right;
- // first line
- s = VLOAD2_CURRENT_ITEM(vec4[2], src_ptr, src_iter);
- middle = vec4(s[0].yzw, s[1].x);
- right = vec4(s[0].zw, s[1].xy);
- pixels[0] += CONVOLVE1x3(s[0], middle, right, w[0]);
-
- // second line
- s = VLOAD2(vec4[2], src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
- middle = vec4(s[0].yzw, s[1].x);
- right = vec4(s[0].zw, s[1].xy);
- pixels[0] += CONVOLVE1x3(s[0], middle, right, w[1]);
- pixels[1] += CONVOLVE1x3(s[0], middle, right, w[0]);
-
- // third line
- s = VLOAD2(vec4[2], src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
- middle = vec4(s[0].yzw, s[1].x);
- right = vec4(s[0].zw, s[1].xy);
- pixels[0] += CONVOLVE1x3(s[0], middle, right, w[2]);
- pixels[1] += CONVOLVE1x3(s[0], middle, right, w[1]);
- pixels[2] += CONVOLVE1x3(s[0], middle, right, w[0]);
-
-        // fourth line
- s = VLOAD2(vec4[2], src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
- middle = vec4(s[0].yzw, s[1].x);
- right = vec4(s[0].zw, s[1].xy);
- pixels[1] += CONVOLVE1x3(s[0], middle, right, w[2]);
- pixels[2] += CONVOLVE1x3(s[0], middle, right, w[1]);
-
- // fifth line
- s = VLOAD2(vec4[2], src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
- middle = vec4(s[0].yzw, s[1].x);
- right = vec4(s[0].zw, s[1].xy);
- pixels[2] += CONVOLVE1x3(s[0], middle, right, w[2]);
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-
-#ifdef BIAS
- float b = LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
- pixels[0] += vec4(b);
- pixels[1] += vec4(b);
- pixels[2] += vec4(b);
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels[0] = ACT_OP(pixels[0]);
- pixels[1] = ACT_OP(pixels[1]);
- pixels[2] = ACT_OP(pixels[2]);
-#endif /* FUSED_ACTIVATION */
-
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels[0]);
- STORE(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
- STORE(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
-}
-
-#endif // PROCESS_nX_nY_nZ
-
-#elif defined(DATA_TYPE_FP16)
-
-#if defined(PROCESS_8X_3Y_1Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
-
-vec4[2] convolve1x3_stride1(vec4 tmp[3], vec3 w)
-{
- vec4 middle;
- vec4 right;
- vec4 r[2];
-
- middle = vec4(tmp[0].yzw, tmp[1].x);
- right = vec4(tmp[0].zw, tmp[1].xy);
-
- r[0] = tmp[0] * w[0] + middle * w[1] + right * w[2];
-
- middle = vec4(tmp[1].yzw, tmp[2].x);
- right = vec4(tmp[1].zw, tmp[2].xy);
-
- r[1] = tmp[1] * w[0] + middle * w[1] + right * w[2];
-
- return r;
-}
-
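-// FP16 values are packed two per uint, so a uvec4 carries 8 of them. This helper
-// loads two uvec4s and unpacks the first 12 values, covering the 10 inputs needed
-// by 8 stride-1 outputs of a 1x3 window.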
-vec4[3] vload2_src_unpack12_half(uint offset)
-{
- uvec4 packed_s[2];
- vec4 s[3];
-
- packed_s = VLOAD2(uvec4[2], src_ptr, offset);
-
- s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y));
- s[1] = vec4(unpackHalf2x16(packed_s[0].z), unpackHalf2x16(packed_s[0].w));
- s[2] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y));
-
- return s;
-}
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels[3][2];
- int i, j;
- for(i = 0; i < 3; i++)
- {
- for(j = 0; j < 2; j++)
- {
- pixels[i][j] = vec4(0);
- }
- }
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- // load 3 weights once
- uvec2 packed_w[3];
-
- packed_w[0] = VLOAD2_CURRENT_ITEM(uvec2, weights_ptr, weights_iter);
- packed_w[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
- packed_w[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
-
- vec3 w[3];
- w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
- w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
- w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
-
- uvec4 packed_s[2];
- vec4 s[3];
- vec4 r[2];
-
- // first line
- s = vload2_src_unpack12_half(CURRENT_ITEM_OFFSET(src_iter));
-
- r = CONVOLVE1x3(s, w[0]);
- pixels[0][0] += r[0];
- pixels[0][1] += r[1];
-
- // second line
- s = vload2_src_unpack12_half(IMAGE_OFFSET(src_iter, 0, 1));
-
- r = CONVOLVE1x3(s, w[1]);
- pixels[0][0] += r[0];
- pixels[0][1] += r[1];
- r = CONVOLVE1x3(s, w[0]);
- pixels[1][0] += r[0];
- pixels[1][1] += r[1];
-
- // third line
- s = vload2_src_unpack12_half(IMAGE_OFFSET(src_iter, 0, 2));
-
- r = CONVOLVE1x3(s, w[2]);
- pixels[0][0] += r[0];
- pixels[0][1] += r[1];
- r = CONVOLVE1x3(s, w[1]);
- pixels[1][0] += r[0];
- pixels[1][1] += r[1];
- r = CONVOLVE1x3(s, w[0]);
- pixels[2][0] += r[0];
- pixels[2][1] += r[1];
-
-        // fourth line
- s = vload2_src_unpack12_half(IMAGE_OFFSET(src_iter, 0, 3));
-
- r = CONVOLVE1x3(s, w[2]);
- pixels[1][0] += r[0];
- pixels[1][1] += r[1];
- r = CONVOLVE1x3(s, w[1]);
- pixels[2][0] += r[0];
- pixels[2][1] += r[1];
-
- // fifth line
- s = vload2_src_unpack12_half(IMAGE_OFFSET(src_iter, 0, 4));
-
- r = CONVOLVE1x3(s, w[2]);
- pixels[2][0] += r[0];
- pixels[2][1] += r[1];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
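-    // Biases are packed two FP16 values per uint; the parity of z_index selects
-    // which half of the loaded pair holds this feature map's bias.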
- if(z_index % uint(2) == uint(0))
- {
- b = vec2_b.x;
- }
- else
- {
- b = vec2_b.y;
- }
-
- for(i = 0; i < 3; i++)
- {
- for(j = 0; j < 2; j++)
- {
- pixels[i][j] += vec4(b);
- }
- }
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels[0] = ACT_OP(pixels[0]);
- pixels[1] = ACT_OP(pixels[1]);
- pixels[2] = ACT_OP(pixels[2]);
-#endif /* FUSED_ACTIVATION */
-
- STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
- STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
- STORE_PACK8_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
-}
-
-#elif defined(PROCESS_4X_1Y_1Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#if STRIDE_X == 2
-#define CONVOLVE1x3(s, w) convolve1x3_stride2(s, w)
-#define LOAD_AND_UNPACK(offset) VLOAD3_UNPACK12_HALF(src_ptr, offset)
-#elif STRIDE_X == 1 /* STRIDE_X == 1 */
-#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
-#define LOAD_AND_UNPACK(offset) VLOAD2_UNPACK8_HALF(src_ptr, offset)
-#else /* STRIDE_X not equal to 1 or 2 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 2 */
-
-vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
-{
- vec4 middle;
- vec4 right;
- vec4 r;
-
- middle = vec4(tmp[0].yzw, tmp[1].x);
- right = vec4(tmp[0].zw, tmp[1].xy);
-
- r = tmp[0] * w[0] + middle * w[1] + right * w[2];
-
- return r;
-}
-
-vec4 convolve1x3_stride2(vec4 tmp[3], vec3 w)
-{
- vec4 left;
- vec4 middle;
- vec4 right;
- vec4 r;
-
- left = vec4(tmp[0].xz, tmp[1].xz);
- middle = vec4(tmp[0].yw, tmp[1].yw);
- right = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
-
- r = left * w[0] + middle * w[1] + right * w[2];
-
- return r;
-}
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- uvec2 packed_d;
-
- vec4 pixels = vec4(0);
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- // load 3 weights once
- uvec2 packed_w[3];
-
- packed_w[0] = VLOAD2_CURRENT_ITEM(uvec2, weights_ptr, weights_iter);
- packed_w[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
- packed_w[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
-
- vec3 w[3];
- w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
- w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
- w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
-
-#if STRIDE_X == 2
- vec4 s[3];
-#elif STRIDE_X == 1 /* STRIDE_X == 1 */
- vec4 s[2];
-#else /* STRIDE_X not equal to 1 or 2 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 2 */
- vec4 r;
-
- // first line
- s = LOAD_AND_UNPACK(CURRENT_ITEM_OFFSET(src_iter));
- pixels += CONVOLVE1x3(s, w[0]);
-
- // second line
- s = LOAD_AND_UNPACK(IMAGE_OFFSET(src_iter, 0, 1));
- pixels += CONVOLVE1x3(s, w[1]);
-
- // third line
- s = LOAD_AND_UNPACK(IMAGE_OFFSET(src_iter, 0, 2));
- pixels += CONVOLVE1x3(s, w[2]);
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
-
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
- if(z_index % uint(2) == uint(0))
- {
- b = vec2_b.x;
- }
- else
- {
- b = vec2_b.y;
- }
-
- pixels += vec4(b);
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels = ACT_OP(pixels);
-#endif /* FUSED_ACTIVATION */
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
-}
-
-#elif defined(PROCESS_4X_3Y_1Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
-
-vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
-{
- vec4 middle;
- vec4 right;
- vec4 r;
-
- middle = vec4(tmp[0].yzw, tmp[1].x);
- right = vec4(tmp[0].zw, tmp[1].xy);
-
- r = tmp[0] * w[0] + middle * w[1] + right * w[2];
-
- return r;
-}
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels[3];
- int i;
-
- for(i = 0; i < 3; i++)
- {
- pixels[i] = vec4(0);
- }
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- // load 3 weights once
- uvec2 packed_w[3];
-
- packed_w[0] = VLOAD2_CURRENT_ITEM(uvec2, weights_ptr, weights_iter);
- packed_w[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
- packed_w[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
-
- vec3 w[3];
- w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
- w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
- w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
-
- vec4 s[2];
- vec4 r;
-
- // first line
- s = VLOAD2_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
- pixels[0] += CONVOLVE1x3(s, w[0]);
-
- // second line
- s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
- pixels[0] += CONVOLVE1x3(s, w[1]);
- pixels[1] += CONVOLVE1x3(s, w[0]);
-
- // third line
- s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
- pixels[0] += CONVOLVE1x3(s, w[2]);
- pixels[1] += CONVOLVE1x3(s, w[1]);
- pixels[2] += CONVOLVE1x3(s, w[0]);
-
-        // fourth line
- s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
- pixels[1] += CONVOLVE1x3(s, w[2]);
- pixels[2] += CONVOLVE1x3(s, w[1]);
-
- // fifth line
- s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
- pixels[2] += CONVOLVE1x3(s, w[2]);
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
- if(z_index % uint(2) == uint(0))
- {
- b = vec2_b.x;
- }
- else
- {
- b = vec2_b.y;
- }
-
- for(i = 0; i < 3; i++)
- {
- pixels[i] += vec4(b);
- }
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels[0] = ACT_OP(pixels[0]);
- pixels[1] = ACT_OP(pixels[1]);
- pixels[2] = ACT_OP(pixels[2]);
-#endif /* FUSED_ACTIVATION */
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
-}
-
-#elif defined(PROCESS_4X_4Y_1Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
-
-vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
-{
- vec4 middle;
- vec4 right;
- vec4 r;
-
- middle = vec4(tmp[0].yzw, tmp[1].x);
- right = vec4(tmp[0].zw, tmp[1].xy);
-
- r = tmp[0] * w[0] + middle * w[1] + right * w[2];
-
- return r;
-}
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels[4];
- int i;
-
- for(i = 0; i < 4; i++)
- {
- pixels[i] = vec4(0);
- }
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- // load 3 weights once
- uvec2 packed_w[3];
-
- packed_w[0] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
- packed_w[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
- packed_w[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
-
- vec3 w[3];
- w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
- w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
- w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
-
- vec4 s[2];
- vec4 r;
-
- // first line
- s = VLOAD2_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
- pixels[0] += CONVOLVE1x3(s, w[0]);
-
- // second line
- s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
- pixels[0] += CONVOLVE1x3(s, w[1]);
- pixels[1] += CONVOLVE1x3(s, w[0]);
-
- // third line
- s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
- pixels[0] += CONVOLVE1x3(s, w[2]);
- pixels[1] += CONVOLVE1x3(s, w[1]);
- pixels[2] += CONVOLVE1x3(s, w[0]);
-
-        // fourth line
- s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
- pixels[1] += CONVOLVE1x3(s, w[2]);
- pixels[2] += CONVOLVE1x3(s, w[1]);
- pixels[3] += CONVOLVE1x3(s, w[0]);
-
- // fifth line
- s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
- pixels[2] += CONVOLVE1x3(s, w[2]);
- pixels[3] += CONVOLVE1x3(s, w[1]);
-
- // sixth line
- s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 5));
- pixels[3] += CONVOLVE1x3(s, w[2]);
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
- if(z_index % uint(2) == uint(0))
- {
- b = vec2_b.x;
- }
- else
- {
- b = vec2_b.y;
- }
-
- for(i = 0; i < 4; i++)
- {
- pixels[i] += vec4(b);
- }
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels[0] = ACT_OP(pixels[0]);
- pixels[1] = ACT_OP(pixels[1]);
- pixels[2] = ACT_OP(pixels[2]);
- pixels[3] = ACT_OP(pixels[3]);
-#endif /* FUSED_ACTIVATION */
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 3, 0), pixels[3]);
-}
-#elif defined(PROCESS_4X_3Y_2Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w)
-
-vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w)
-{
- vec4 middle;
- vec4 right;
- vec4 r;
-
- middle = vec4(tmp[0].yzw, tmp[1].x);
- right = vec4(tmp[0].zw, tmp[1].xy);
-
- r = tmp[0] * w[0] + middle * w[1] + right * w[2];
-
- return r;
-}
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 pixels[3];
- int i;
-
- uint z_base_index = gl_GlobalInvocationID.z << 1;
-
-    // store original src current offset
-    uint s_offset_in_bytes = CURRENT_ITEM_OFFSET_IN_BYTES(src_iter);
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_base_index * weights_stride_w);
-
- for(int z = 0; z < 2; ++z)
- {
- uint z_index = z_base_index + uint(z);
-
- SET_TENSOR_ITERATOR_OFFSET_IN_BYTES(src_iter, s_offset_in_bytes);
-
- for(i = 0; i < 3; i++)
- {
- pixels[i] = vec4(0);
- }
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- // load 3 weights once
- uvec2 packed_w[3];
-
- packed_w[0] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
- packed_w[1] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
- packed_w[2] = VLOAD2(uvec2, weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
-
- vec3 w[3];
- w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x);
- w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x);
- w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x);
-
- vec4 s[2];
- vec4 r;
-
- // first line
- s = VLOAD2_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
- pixels[0] += CONVOLVE1x3(s, w[0]);
-
- // second line
- s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
- pixels[0] += CONVOLVE1x3(s, w[1]);
- pixels[1] += CONVOLVE1x3(s, w[0]);
-
- // third line
- s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
- pixels[0] += CONVOLVE1x3(s, w[2]);
- pixels[1] += CONVOLVE1x3(s, w[1]);
- pixels[2] += CONVOLVE1x3(s, w[0]);
-
-            // fourth line
- s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
- pixels[1] += CONVOLVE1x3(s, w[2]);
- pixels[2] += CONVOLVE1x3(s, w[1]);
-
- // fifth line
- s = VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
- pixels[2] += CONVOLVE1x3(s, w[2]);
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-
- if(z_index % uint(2) == uint(0))
- {
- b = vec2_b.x;
- }
- else
- {
- b = vec2_b.y;
- }
-
- for(i = 0; i < 3; i++)
- {
- pixels[i] += vec4(b);
- }
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
-        pixels[0] = ACT_OP(pixels[0]);
-        pixels[1] = ACT_OP(pixels[1]);
-        pixels[2] = ACT_OP(pixels[2]);
-#endif /* FUSED_ACTIVATION */
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels[0]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 1, 0), pixels[1]);
- STORE_PACK4_HALF(dst_ptr, TENSOR3D_OFFSET(dst_iter, 0, 2, 0), pixels[2]);
-
-        TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, dst_attrs.stride_z);
- }
-}
-
-#endif /* PROCESS_nX_nY_nZ */
-
-#else /* DATA_TYPE_FP32 */
-#error Data type not supported
-#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
deleted file mode 100644
index e47db549c9..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-#ifdef FUSED_ACTIVATION
-#include "activation_layer_helpers_cs.h"
-#endif /* FUSED_ACTIVATION */
-
-#if defined(DATA_TYPE_FP16)
-precision mediump float;
-#endif // DATA_TYPE_FP16
-
-/** This kernel performs a direct convolution to convolve the low three dimensions
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @note This kernel has multiple optimized direct convolution options for FP16.
- *       The direct convolution option must be passed at compile time using "#define PROCESS_nX_nY_nZ" e.g. "#define PROCESS_4X_1Y_1Z"
- * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
- * This OpenGL ES shader works with stride_x = 1 and 2
- * @note If biases are used then "#define BIAS" has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- * @param[in]  weights_ptr       Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in]  weights_attrs     The attributes of the weights tensor
- * @param[in]  biases_ptr        Pointer to the biases tensor. Supported data types: same as @p src_ptr
- * @param[in]  biases_attrs      The attributes of the biases tensor
- * @param[in]  weights_stride_w  Stride of the weights tensor in the 4th dimension
- * @param[in]  weights_depth     The third dimension of the weights tensor
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes dst_attrs;
- Tensor3DAttributes weights_attrs;
-#ifdef BIAS
- VectorAttributes biases_attrs;
-#endif /* BIAS */
- uint weights_stride_w;
- uint weights_depth;
-};
-
-#ifdef DATA_TYPE_FP32
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- float pixels = 0.f;
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
- float temp[5];
- float temp_weight[5];
- for(int d = 0; d < int(weights_depth); ++d)
- {
- temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
- temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 0, 0));
- pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
-
- temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
- temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 1, 0));
- pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
-
- temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
- temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 2, 0));
- pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
-
- temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
- temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 3, 0));
- pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
-
- temp = VLOAD5(float[5], src_ptr, IMAGE_OFFSET(src_iter, 0, 4));
- temp_weight = VLOAD5(float[5], weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, 4, 0));
- pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4];
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-
-#ifdef BIAS
- pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- pixels = ACT_OP(pixels);
-#endif /* FUSED_ACTIVATION */
-
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
-}
-#elif defined(DATA_TYPE_FP16)
-
-// Common definitions for DATA_TYPE_FP16
-#if STRIDE_X == 1
-#define LOAD_SRC_AT_ROW(row) VLOAD2_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, row))
-#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define LOAD_SRC_AT_ROW(row) VLOAD3_UNPACK12_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, row))
-#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight)
-#else /* STRIDE_X == 1 */
-#error STRIDE_X larger than 2 is not supported
-#endif /* STRIDE_X == 1 */
-
-#define LOAD_WEIGHT_AT_ROW(row) VLOAD3_UNPACK6_HALF(weights_ptr, TENSOR3D_OFFSET(weights_iter, 0, row, 0))
-
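-// A 5-tap weight row is 5 halves; VLOAD3_UNPACK6_HALF reads 3 uints (6 halves) and
-// leaves the sixth value unused. As in the 3x3 shaders, the shifted input windows
-// are assembled by swizzling the vec4s already loaded instead of re-reading the buffer.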
-vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3])
-{
- vec4 src0 = tmp[0];
- vec4 src1 = vec4(tmp[0].yzw, tmp[1].x);
- vec4 src2 = vec4(tmp[0].zw, tmp[1].xy);
- vec4 src3 = vec4(tmp[0].w, tmp[1].xyz);
- vec4 src4 = tmp[1];
- vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
- return ret;
-}
-
-vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
-{
- vec4 src0 = vec4(tmp[0].xz, tmp[1].xz);
- vec4 src1 = vec4(tmp[0].yw, tmp[1].yw);
- vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x);
- vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y);
- vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz);
- vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x;
-
- return ret;
-}
-
-#if defined(PROCESS_4X_1Y_1Z)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
-#ifdef BIAS
-TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
-#endif /* BIAS */
-
-void main()
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
-#ifdef BIAS
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
-#endif /* BIAS */
-
- vec4 res = vec4(0);
- vec2 w[3];
- vec4 s[STRIDE_X + 1];
-
- uint z_index = gl_GlobalInvocationID.z;
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
-
- for(int d = 0; d < int(weights_depth); ++d)
- {
- for(int row = 0; row < 5; row++)
- {
- w = LOAD_WEIGHT_AT_ROW(row);
- s = LOAD_SRC_AT_ROW(row);
- res += CONVOLVE1x5(s, w);
- }
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
- }
-
-#ifdef BIAS
- vec2 vec2_b;
- float b;
-
- vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
- b = (z_index % uint(2) == uint(0)) ? vec2_b.x : vec2_b.y;
- res += vec4(b);
-#endif /* BIAS */
-
-#ifdef FUSED_ACTIVATION
- res = ACT_OP(res);
-#endif /* FUSED_ACTIVATION */
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, res);
-}
-
-#endif /* PROCESS_nX_nY_nZ */
-#else /* DATA_TYPE_FP32 */
-#error Data type not supported
-#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/dropout.cs b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
deleted file mode 100644
index 89ac8fea2e..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/dropout.cs
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-#if defined(DATA_TYPE_FP16)
-precision mediump float;
-#endif /*DATA_TYPE_FP16*/
-
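-// Pseudo-random helpers: hash() is a small integer scrambler; float_construct()
-// plants the hashed bits into the mantissa of 1.0, giving a value in [1, 2) that
-// maps to a uniform float in [0, 1); rand() derives one from a position and SEED.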
-uint hash(uint x)
-{
- x += (x << 10u);
- x ^= (x >> 6u);
- x += (x << 3u);
- x ^= (x >> 11u);
- x += (x << 15u);
- return x;
-}
-
-uint hash(uvec3 v)
-{
- return hash(v.x ^ hash(v.y) ^ hash(v.z));
-}
-
-float float_construct(uint m)
-{
- const uint ieee_mantissa = 0x007FFFFFu;
- const uint ieee_one = 0x3F800000u;
-
- m &= ieee_mantissa;
- m |= ieee_one;
-
- float f = uintBitsToFloat(m);
- return f - 1.0;
-}
-
-float rand(vec3 v, float seed)
-{
- return float_construct(hash(floatBitsToUint(v + seed)));
-}
-
-/** Dropout is used to reduce over-fitting in neural networks.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[in,out] mask_ptr  Pointer to the mask tensor. Supported data types: same as @p src_ptr
- * @param[in] mask_attrs The attributes of the mask tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- */
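-// Compile-time defines used below but not documented above: RATIO (a value is kept
-// when its random draw exceeds this threshold), SCALE (output rescaling factor),
-// SEED (RNG seed) and FORWARD (generate and store the mask; otherwise the stored
-// mask is replayed for the backward pass).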
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes mask_attrs;
- Tensor3DAttributes dst_attrs;
-};
-
-#ifdef DATA_TYPE_FP32
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, maskBuffer, float, mask_ptr, mask_shift, 2, restrict);
-TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator mask_iter = CONVERT_TO_TENSOR3D_ITERATOR(mask_attrs, mask_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- float random = 0.f;
- float inputv = 0.f;
- float maskv = 0.f;
- float outputv = 0.f;
-
-#ifdef FORWARD
- random = rand(vec3(gl_GlobalInvocationID.xyz), SEED);
- maskv = (random > RATIO) ? 1.f : 0.f;
- STORE_CURRENT_ITEM(mask_ptr, mask_iter, maskv);
-#else /* FORWARD */
- maskv = LOAD_CURRENT_ITEM(mask_ptr, mask_iter);
-#endif /* FORWARD */
-
- inputv = LOAD_CURRENT_ITEM(src_ptr, src_iter);
- outputv = maskv * inputv * float(SCALE);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, outputv);
-}
-
-#elif defined(DATA_TYPE_FP16)
-TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, maskBuffer, uint, mask_ptr, mask_shift, 2, restrict);
-TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator mask_iter = CONVERT_TO_TENSOR3D_ITERATOR(mask_attrs, mask_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- float random1 = 0.f;
- float random2 = 0.f;
- vec2 input_vec = vec2(0, 0);
- vec2 output_vec = vec2(0, 0);
- vec2 mask_vec = vec2(0, 0);
-
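-    // Each invocation handles two packed FP16 elements, so two random draws are
-    // needed; the second offsets x by 0.5 so it differs from the first.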
-#ifdef FORWARD
- random1 = rand(vec3(gl_GlobalInvocationID.xyz), SEED);
- random2 = rand(vec3(float(gl_GlobalInvocationID.x) + 0.5f, gl_GlobalInvocationID.yz), SEED);
- mask_vec.x = (random1 > RATIO) ? 1.f : 0.f;
- mask_vec.y = (random2 > RATIO) ? 1.f : 0.f;
-
- STORE_PACK2_CURRENT_ITEM_HALF(mask_ptr, mask_iter, mask_vec);
-#else /* FORWARD */
- mask_vec = LOAD_UNPACK2_CURRENT_ITEM_HALF(mask_ptr, mask_iter);
-#endif /* FORWARD */
-
- input_vec = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
- output_vec = mask_vec * input_vec * float(SCALE);
-
- STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, output_vec);
-}
-
-#else /* DATA_TYPE_FP32 */
-#error Data type not supported
-#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs b/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs
deleted file mode 100644
index 4e96a5ec74..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-#if defined(DATA_TYPE_FP16)
-precision mediump float;
-#endif // DATA_TYPE_FP16
-
-#ifdef FILL_IMAGE_BORDERS_REPLICATE
-
-/** Fill N pixels of the padding edge of a single channel image by replicating the closest valid pixel.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @attention The border size for top, bottom, left, right needs to be passed at compile time.
- * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2
- *
- * @param[in,out] buf_ptr Pointer to the source image. Supported data types: F16/F32
- * @param[in] buf_attrs The attributes of the source image
- * @param[in] width Width of the valid region of the image
- * @param[in] height Height of the valid region of the image
- * @param[in] start_pos_x X coordinate indicating the start point of the valid region
- * @param[in] start_pos_y Y coordinate indicating the start point of the valid region
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes buf_attrs;
- uint width;
- uint height;
- int start_pos_x;
- int start_pos_y;
-};
-
-#if defined(DATA_TYPE_FP32)
-
-TENSOR_DECLARATION(1, bufBuffer, float, buf_ptr, buf_shift, 2, restrict);
-
-void main()
-{
- ImageIterator buf_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(buf_attrs, buf_shift);
-
- // Update pointer to point to the starting point of the valid region
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(buf_iter, start_pos_y * int(buf_attrs.stride_y) + start_pos_x * int(buf_attrs.stride_x));
-
- int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
- int gid0 = int(gl_GlobalInvocationID.x);
- int gidH = gid0 - total_width;
- int gidW = gid0 - BORDER_SIZE_LEFT;
-
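-    // Work partition: invocations with gidH < 0 fill the top/bottom borders of
-    // column gidW; the remaining ones fill the left/right borders of row gidH.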
- if(gidH >= 0)
- {
- // Handle left border
- float left_val = LOAD(buf_ptr, IMAGE_OFFSET(buf_iter, 0, gidH));
- for(int i = 0; i < BORDER_SIZE_LEFT; ++i)
- {
- STORE(buf_ptr, IMAGE_OFFSET(buf_iter, -(i + 1), gidH), left_val);
- }
- // Handle right border
- float right_val = LOAD(buf_ptr, IMAGE_OFFSET(buf_iter, int(width) - 1, gidH));
- for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
- {
- STORE(buf_ptr, IMAGE_OFFSET(buf_iter, int(width) + i, gidH), right_val);
- }
- }
- else
- {
- // Get value for corners
- int val_idx = gidW;
- if(gidW < 0 || gidW > (int(width) - 1))
- {
- val_idx = gidW < 0 ? 0 : int(width) - 1;
- }
-
- // Handle top border
- float top_val = LOAD(buf_ptr, IMAGE_OFFSET(buf_iter, val_idx, 0));
- for(int i = 0; i < BORDER_SIZE_TOP; ++i)
- {
- STORE(buf_ptr, IMAGE_OFFSET(buf_iter, gidW, -(i + 1)), top_val);
- }
- // Handle bottom border
- float bottom_val = LOAD(buf_ptr, IMAGE_OFFSET(buf_iter, val_idx, int(height) - 1));
- for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
- {
- STORE(buf_ptr, IMAGE_OFFSET(buf_iter, gidW, int(height) + i), bottom_val);
- }
- }
-}
-#elif defined(DATA_TYPE_FP16)
-
-TENSOR_DECLARATION(1, bufBuffer, uint, buf_ptr, buf_shift, 2, restrict);
-
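-// Two FP16 values share one uint, so patching a single border pixel is a
-// read-modify-write: load the packed pair, overwrite one lane, store it back.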
-void set_replicate(uint offset, int pos, vec2 replicate_value)
-{
- vec2 b = LOAD_UNPACK2_HALF(buf_ptr, offset);
-
- if(pos % 2 == 0)
- {
- b.x = replicate_value.y;
- }
- else
- {
- b.y = replicate_value.x;
- }
-
- STORE_PACK2_HALF(buf_ptr, offset, b);
-}
-
-void main()
-{
- ImageIterator buf_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(buf_attrs, buf_shift);
-
- // Update pointer to point to the starting point of the valid region
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(buf_iter, start_pos_y * int(buf_attrs.stride_y) + start_pos_x * int(buf_attrs.stride_x));
-
- int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
- int gid0 = int(gl_GlobalInvocationID.x);
- int gidH = gid0 - total_width;
- int gidW = gid0 - BORDER_SIZE_LEFT;
-
- if(gidH >= 0)
- {
- // Handle left border
- vec2 left_val = LOAD_UNPACK2_HALF(buf_ptr, IMAGE_OFFSET(buf_iter, 0, gidH));
- for(int i = 0; i < BORDER_SIZE_LEFT; ++i)
- {
- uint offset = IMAGE_OFFSET(buf_iter, -(i + 1), gidH);
- int pos = BORDER_SIZE_LEFT - i - 1;
- if(i == 0)
- {
- if(pos % 2 == 0)
- {
- set_replicate(offset, pos, left_val);
- }
- }
- else
- {
- if(pos % 2 == 0)
- {
- if(BORDER_SIZE_LEFT % 2 == 0)
- {
- STORE_PACK2_HALF(buf_ptr, offset, left_val.xx);
- }
- else
- {
- STORE_PACK2_HALF(buf_ptr, offset, left_val.yy);
- }
- i++;
- }
- }
- }
- // Handle right border
- vec2 right_val_origin = LOAD_UNPACK2_HALF(buf_ptr, IMAGE_OFFSET(buf_iter, int(width) - 1, gidH));
- vec2 right_val;
- if((((BORDER_SIZE_LEFT + int(width)) % 2)) == 1)
- {
- right_val = vec2(right_val_origin.x, right_val_origin.x);
- }
- else
- {
- right_val = vec2(right_val_origin.y, right_val_origin.y);
- }
- for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
- {
- uint offset = IMAGE_OFFSET(buf_iter, int(width) + i, gidH);
- int pos = i + BORDER_SIZE_LEFT + int(width);
-
- if(i == 0)
- {
- if(pos % 2 == 0)
- {
- STORE_PACK2_HALF(buf_ptr, offset, right_val);
- i++;
- }
- else
- {
- set_replicate(offset, pos, right_val);
- }
- }
- else
- {
- if(pos % 2 == 0)
- {
- STORE_PACK2_HALF(buf_ptr, offset, right_val);
- i++;
- }
- }
- }
- }
- else
- {
- // Get value for corners
- int val_idx = gidW;
- if(gidW < 0 || (gidW > (int(width) - 1)))
- {
- val_idx = gidW < 0 ? 0 : (int(width) - 1);
- }
-
- // Handle top border
- vec2 top_val = LOAD_UNPACK2_HALF(buf_ptr, IMAGE_OFFSET(buf_iter, val_idx, 0));
- for(int i = 0; i < BORDER_SIZE_TOP; ++i)
- {
- uint offset = IMAGE_OFFSET(buf_iter, gidW, -(i + 1));
-
- if(gid0 % 2 == 0)
- {
- if(gidW == (int(width) - 1))
- {
- if(((BORDER_SIZE_LEFT + int(width)) % 2 == 1))
- {
- STORE_PACK2_HALF(buf_ptr, offset, top_val.xx);
- }
- else
- {
- STORE_PACK2_HALF(buf_ptr, offset, top_val.yy);
- }
- }
- else
- {
- if(gidW < 0)
- {
- if(BORDER_SIZE_LEFT % 2 == 0)
- {
- STORE_PACK2_HALF(buf_ptr, offset, top_val.xx);
- }
- else
- {
- STORE_PACK2_HALF(buf_ptr, offset, top_val.yy);
- }
- }
- else if(gidW >= int(width))
- {
- if((BORDER_SIZE_LEFT + int(width)) % 2 == 0)
- {
- STORE_PACK2_HALF(buf_ptr, offset, top_val.yy);
- }
- else
- {
- STORE_PACK2_HALF(buf_ptr, offset, top_val.xx);
- }
- }
- else
- {
- STORE_PACK2_HALF(buf_ptr, offset, top_val);
- }
- }
- }
- }
- // Handle bottom border
- vec2 bottom_val = LOAD_UNPACK2_HALF(buf_ptr, IMAGE_OFFSET(buf_iter, val_idx, int(height) - 1));
- for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
- {
- uint offset = IMAGE_OFFSET(buf_iter, gidW, int(height) + i);
-
- if(gid0 % 2 == 0)
- {
- if(gidW == (int(width) - 1))
- {
- STORE_PACK2_HALF(buf_ptr, offset, bottom_val.xx);
- }
- else
- {
- if(gidW < 0)
- {
- if(BORDER_SIZE_LEFT % 2 == 0)
- {
- STORE_PACK2_HALF(buf_ptr, offset, bottom_val.xx);
- }
- else
- {
- STORE_PACK2_HALF(buf_ptr, offset, bottom_val.yy);
- }
- }
- else if(gidW >= int(width))
- {
- if((BORDER_SIZE_LEFT + int(width)) % 2 == 0)
- {
- STORE_PACK2_HALF(buf_ptr, offset, bottom_val.yy);
- }
- else
- {
- STORE_PACK2_HALF(buf_ptr, offset, bottom_val.xx);
- }
- }
- else
- {
- STORE_PACK2_HALF(buf_ptr, offset, bottom_val);
- }
- }
- }
- }
- }
-}
-
-#endif /* DATA_TYPE_FP32 */
-
-#endif /* FILL_IMAGE_BORDERS_REPLICATE */
-
-#ifdef FILL_IMAGE_BORDERS_CONSTANT
-
-/** Fill N pixels of the padding edge of a single channel image with a constant value.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @attention The border size for top, bottom, left, right needs to be passed at compile time.
- * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2
- *
- * @param[out] buf_ptr Pointer to the source image. Supported data types: F16/F32
- * @param[in] buf_attrs The attributes of the source image
- * @param[in] width Width of the valid region of the image
- * @param[in] height Height of the valid region of the image
- * @param[in] start_pos_x X coordinate indicating the start point of the valid region
- * @param[in] start_pos_y Y coordinate indicating the start point of the valid region
- * @param[in] constant_value Constant value to use to fill the edges
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes buf_attrs;
- uint width;
- uint height;
- int start_pos_x;
- int start_pos_y;
- float constant_value;
-};
-
-#if defined(DATA_TYPE_FP32)
-TENSOR_DECLARATION(1, bufBuffer, float, buf_ptr, buf_shift, 2, writeonly);
-
-void main()
-{
- ImageIterator buf_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(buf_attrs, buf_shift);
-
- // Update pointer to point to the starting point of the valid region
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(buf_iter, start_pos_y * int(buf_attrs.stride_y) + start_pos_x * int(buf_attrs.stride_x));
-
- int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
- int gid0 = int(gl_GlobalInvocationID.x);
- int gidH = gid0 - total_width;
- int gidW = gid0 - BORDER_SIZE_LEFT;
-
- if(gidH >= 0)
- {
- // Handle left border
- for(int i = 0; i < BORDER_SIZE_LEFT; ++i)
- {
- STORE(buf_ptr, IMAGE_OFFSET(buf_iter, -(i + 1), gidH), constant_value);
- }
- // Handle right border
- for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
- {
- STORE(buf_ptr, IMAGE_OFFSET(buf_iter, int(width) + i, gidH), constant_value);
- }
- }
- else
- {
- // Handle top border
- for(int i = 0; i < BORDER_SIZE_TOP; ++i)
- {
- STORE(buf_ptr, IMAGE_OFFSET(buf_iter, gidW, -(i + 1)), constant_value);
- }
- // Handle bottom border
- for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
- {
- STORE(buf_ptr, IMAGE_OFFSET(buf_iter, gidW, int(height) + i), constant_value);
- }
- }
-}
-
-#elif defined(DATA_TYPE_FP16)
-TENSOR_DECLARATION(1, bufBuffer, uint, buf_ptr, buf_shift, 2, restrict);
-
-void set_constant(uint offset, int pos)
-{
- vec2 b = LOAD_UNPACK2_HALF(buf_ptr, offset);
-
- if(pos % 2 == 0)
- {
- b.x = constant_value;
- }
- else
- {
- b.y = constant_value;
- }
-
- STORE_PACK2_HALF(buf_ptr, offset, b);
-}
-
-void main()
-{
- ImageIterator buf_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(buf_attrs, buf_shift);
-
- int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT;
- int gid0 = int(gl_GlobalInvocationID.x);
- int gidH = gid0 - total_width;
- int gidW = gid0 - BORDER_SIZE_LEFT;
-
- // Update pointer to point to the starting point of the valid region
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(buf_iter, start_pos_y * int(buf_attrs.stride_y) + start_pos_x * int(buf_attrs.stride_x));
-
- vec2 b = vec2(constant_value, constant_value);
-
- if(gidH >= 0)
- {
- // Handle left border
- for(int i = 0; i < BORDER_SIZE_LEFT; ++i)
- {
- uint offset = IMAGE_OFFSET(buf_iter, -(i + 1), gidH);
- int pos = BORDER_SIZE_LEFT - i - 1;
-
- if(i == 0)
- {
- if(pos % 2 == 0)
- {
- set_constant(offset, pos);
- }
- }
- else
- {
- if(pos % 2 == 0)
- {
- STORE_PACK2_HALF(buf_ptr, offset, b);
- }
- }
- }
- // Handle right border
- for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
- {
- uint offset = IMAGE_OFFSET(buf_iter, int(width) + i, gidH);
- int pos = i + BORDER_SIZE_LEFT + int(width);
-
- if(i == 0)
- {
- if(pos % 2 == 0)
- {
- STORE_PACK2_HALF(buf_ptr, offset, b);
- }
- else
- {
- set_constant(offset, pos);
- }
- }
- else
- {
- if(pos % 2 == 0)
- {
- STORE_PACK2_HALF(buf_ptr, offset, b);
- }
- }
- }
- }
- else
- {
- // Handle top border
- for(int i = 0; i < BORDER_SIZE_TOP; ++i)
- {
- uint offset = IMAGE_OFFSET(buf_iter, gidW, -(i + 1));
-
- if(gid0 % 2 == 0)
- {
- STORE_PACK2_HALF(buf_ptr, offset, b);
- }
- }
- // Handle bottom border
- for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
- {
- uint offset = IMAGE_OFFSET(buf_iter, gidW, int(height) + i);
-
- if(gid0 % 2 == 0)
- {
- STORE_PACK2_HALF(buf_ptr, offset, b);
- }
- }
- }
-}
-
-#endif /* DATA_TYPE_FP32 */
-
-#endif /* FILL_IMAGE_BORDERS_CONSTANT */
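A note on the FP16 path above: the buffer packs two half-precision values per 32-bit word, so a border write that touches only one of the two lanes must not clobber its valid neighbour, which is what set_constant's read-modify-write guarantees. A minimal sketch of that operation, assuming the LOAD_UNPACK2_HALF/STORE_PACK2_HALF helpers wrap the standard unpackHalf2x16/packHalf2x16 built-ins:

// Update a single fp16 lane of a packed word, preserving the other lane.
void write_half_lane(inout uint word, int lane, float value)
{
    vec2 pair = unpackHalf2x16(word); // pair.x = low 16 bits, pair.y = high 16 bits
    if(lane == 0)
    {
        pair.x = value;
    }
    else
    {
        pair.y = value;
    }
    word = packHalf2x16(pair);
}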
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
deleted file mode 100644
index d41b48c2a7..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
+++ /dev/null
@@ -1,1130 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers_cs.h"
-
-#if defined(DATA_TYPE_FP16)
-precision mediump float;
-#endif // DATA_TYPE_FP16
-
-#if defined(DATA_TYPE_FP32)
-#ifdef GEMM_TRANSPOSE1xW
-/** This OpenGL ES kernel computes the "vector" 1x4 transposition of the input matrix
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src_attrs The attributes of the source matrix
- * @param[out] dst_ptr   Pointer to the destination matrix. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination matrix
- */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes src_attrs;
- ImageAttributes dst_attrs;
-};
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-
-void main(void)
-{
- /* Compute address for Matrix B - source */
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
-
- /* Compute address for Matrix B transposed - destination. X and Y are swapped */
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, gl_GlobalInvocationID.y * uint(16) + gl_GlobalInvocationID.x * dst_attrs.stride_y);
-
- vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src_ptr, src_iter);
- VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, b0);
-}
-#endif /* GEMM_TRANSPOSE1xW */
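Put differently, invocation (x, y) copies the four floats at row y, columns 4x..4x+3 of the source into row x of the destination at byte offset 16 * y. A sketch of the byte-offset arithmetic, assuming the host configures the source iterator with a 16-byte step in X (one vec4 per invocation) and a one-row step in Y:

// Byte offsets walked by the 1x4 transposition kernel above (4-byte floats).
uint transpose1x4_src_offset(uint x, uint y, uint src_stride_y)
{
    return y * src_stride_y + x * 16u;
}

uint transpose1x4_dst_offset(uint x, uint y, uint dst_stride_y)
{
    return x * dst_stride_y + y * 16u; // X and Y swapped
}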
-
-#ifdef GEMM_INTERLEAVE4x4
-/** This OpenGL ES kernel reshapes the input matrix by interleaving the values
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src_attrs The attributes of the source matrix
- * @param[out] dst_ptr   Pointer to the destination matrix. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination matrix
- */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes src_attrs;
- ImageAttributes dst_attrs;
-};
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-
-void main(void)
-{
- /* Compute source and destination addresses */
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
-
- int i;
- int j;
-
- for(i = 0; i < 4; ++i)
- {
- for(j = 0; j < 4; ++j)
- {
- float res = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, j));
- STORE(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, (i * 4 + j)), res);
- }
- }
-}
-#endif /* GEMM_INTERLEAVE4x4 */
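For reference, the loop above realises the following layout for each 4x4 input block, with a_rc denoting the element at row r, column c:

// Element (row j, column i) lands at linear output position i * 4 + j:
// dst = { a00, a10, a20, a30,   a01, a11, a21, a31,
//         a02, a12, a22, a32,   a03, a13, a23, a33 }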
-
-#ifdef GEMM_ACCUMULATE_BIASES
-/** This kernel adds the biases vector to each row of the accumulation tensor
- *
- * @param[in, out] accum_ptr    Pointer to the accumulation tensor. Supported data type: F32
- * @param[in]      accum_attrs  The attributes of the accumulation tensor
- * @param[in]      biases_ptr   Pointer to the biases vector. Supported data type: same as @p accum_ptr
- * @param[in] biases_attrs The attributes of the biases tensor
- */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes accum_attrs;
- VectorAttributes biases_attrs;
-};
-TENSOR_DECLARATION(1, accumBuffer, float, accum_ptr, accum_shift, 2, restrict);
-TENSOR_DECLARATION(2, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
-
-void main(void)
-{
- ImageIterator accum_iter = CONVERT_TO_IMAGE_ITERATOR(accum_attrs, accum_shift);
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR(biases_attrs, biases_shift);
-
- for(int i = 0; i < 16; ++i)
- {
- float accum_value = LOAD(accum_ptr, TENSOR_OFFSET_ADVANCE(accum_iter, i));
- float biases_value = LOAD(biases_ptr, TENSOR_OFFSET_ADVANCE(biases_iter, i));
- accum_value = biases_value + accum_value;
-
-        // Store result in the accumulate buffer
- STORE(accum_ptr, TENSOR_OFFSET_ADVANCE(accum_iter, i), accum_value);
- }
-}
-#endif /* GEMM_ACCUMULATE_BIASES */
-
-#ifdef GEMM_MM_INTERLEAVED_TRANSPOSED /* not validated */
-/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
- *
- * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_attrs The attributes of the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_attrs The attributes of the source matrix
- * @param[out] dst_ptr    Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_attrs The attributes of the destination matrix
- */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes src0_attrs;
- ImageAttributes src1_attrs;
- ImageAttributes dst_attrs;
-};
-TENSOR_DECLARATION(1, src0Buffer, float, src0_ptr, src0_shift, 2, readonly);
-TENSOR_DECLARATION(2, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
-TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-
-void main()
-{
- ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
- ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
-
- /* Compute address for matrix A and B */
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y));
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(gl_GlobalInvocationID.x) * (src1_attrs.stride_y));
- /* Compute end row address for matrix B */
- int end_row_mtx_b = int(TENSOR_OFFSET_ADVANCE(src1_iter, COLS_B));
-
- /* Reset accumulators */
- vec4 c00 = vec4(0.0f);
- vec4 c10 = vec4(0.0f);
- vec4 c20 = vec4(0.0f);
- vec4 c30 = vec4(0.0f);
-
- // FIXME: loop unrolling really needed for GLES?
- for(; int(CURRENT_ITEM_OFFSET(src1_iter)) <= (end_row_mtx_b - 8); TENSOR_ITERATOR_ADVANCE(src0_iter, 8), TENSOR_ITERATOR_ADVANCE(src1_iter, 8))
- {
- /* Load values from matrix A (interleaved) and matrix B (transposed) */
- vec4 a0 = VLOAD4_CURRENT_ITEM(vec4, src0_ptr, src0_iter);
- vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
-
- c00 += vec4(a0.x) * b0;
- c10 += vec4(a0.y) * b0;
- c20 += vec4(a0.z) * b0;
- c30 += vec4(a0.w) * b0;
-
- /* Load values from matrix A (interleaved) and matrix B (transposed) */
- a0 = VLOAD4(vec4, src0_ptr, TENSOR_OFFSET_ADVANCE(src0_iter, 4));
- b0 = VLOAD4(vec4, src1_ptr, TENSOR_OFFSET_ADVANCE(src1_iter, 4));
-
- c00 += vec4(a0.x) * b0;
- c10 += vec4(a0.y) * b0;
- c20 += vec4(a0.z) * b0;
- c30 += vec4(a0.w) * b0;
- }
-
- for(; int(CURRENT_ITEM_OFFSET(src1_iter)) < end_row_mtx_b; TENSOR_ITERATOR_ADVANCE(src0_iter, 4), TENSOR_ITERATOR_ADVANCE(src1_iter, 4))
- {
- /* Load values from matrix A (interleaved) and matrix B (transposed) */
- vec4 a0 = VLOAD4_CURRENT_ITEM(vec4, src0_ptr, src0_iter);
- vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
-
- c00 += vec4(a0.x) * b0;
- c10 += vec4(a0.y) * b0;
- c20 += vec4(a0.z) * b0;
- c30 += vec4(a0.w) * b0;
- }
-
- /* Multiply by the weight of matrix product */
- c00 = c00 * vec4(ALPHA);
- c10 = c10 * vec4(ALPHA);
- c20 = c20 * vec4(ALPHA);
- c30 = c30 * vec4(ALPHA);
-
- /* Store 4x4 block */
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), c00);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), c10);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), c20);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), c30);
-}
-#endif /* GEMM_MM_INTERLEAVED_TRANSPOSED */
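Each iteration of the two loops above performs rank-1 updates of the 4x4 accumulator block: one vec4 of interleaved A values against one vec4 row of transposed B. Restated as a standalone helper, as a sketch:

// One rank-1 update of the 4x4 block: c[r] += a0[r] * b0 for r = 0..3.
void rank1_update_4x4(inout vec4 c[4], vec4 a0, vec4 b0)
{
    c[0] += vec4(a0.x) * b0;
    c[1] += vec4(a0.y) * b0;
    c[2] += vec4(a0.z) * b0;
    c[3] += vec4(a0.w) * b0;
}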
-
-#ifdef GEMM_MM_FLOATING_POINT
-/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * It is used when neither matrix A nor matrix B has been reshaped
- *
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
- * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
- * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_attrs The attributes of the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_attrs The attributes of the source matrix
- * @param[out] dst_ptr    Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_attrs The attributes of the destination matrix
- */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes src0_attrs;
- ImageAttributes src1_attrs;
- ImageAttributes dst_attrs;
-};
-TENSOR_DECLARATION(1, src0Buffer, float, src0_ptr, src0_shift, 2, readonly);
-TENSOR_DECLARATION(2, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
-TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-
-void main()
-{
- ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
- ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
-
- int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
-    /* Compute the starting address for matrix A and matrix B */
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y) * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, idx * 4);
-
- /* Compute end row address for matrix A */
- int end_row_vec_a = int(TENSOR_OFFSET_ADVANCE_IN_BYTES(src0_iter, COLS_A * 4));
-
- /* Reset accumulators */
- vec4 acc0 = vec4(0.0f);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec4 acc1 = vec4(0.0f);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec4 acc2 = vec4(0.0f);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec4 acc3 = vec4(0.0f);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- for(; int(CURRENT_ITEM_OFFSET(src0_iter)) <= (end_row_vec_a - 2); TENSOR_ITERATOR_ADVANCE(src0_iter, 2), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(2) * src1_attrs.stride_y))
- {
- vec2 a0 = VLOAD2_CURRENT_ITEM(vec2, src0_ptr, src0_iter);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec2 a1 = VLOAD2(vec2, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec2 a2 = VLOAD2(vec2, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec2 a3 = VLOAD2(vec2, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
- vec4 b1 = VLOAD4(vec4, src1_ptr, IMAGE_OFFSET(src1_iter, 0, 1));
-
- acc0 += b0 * vec4(a0.x);
- acc0 += b1 * vec4(a0.y);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 += b0 * vec4(a1.x);
- acc1 += b1 * vec4(a1.y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 += b0 * vec4(a2.x);
- acc2 += b1 * vec4(a2.y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 += b0 * vec4(a3.x);
- acc3 += b1 * vec4(a3.y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- }
-
- for(; int(CURRENT_ITEM_OFFSET(src0_iter)) < end_row_vec_a; TENSOR_ITERATOR_ADVANCE(src0_iter, 1), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y))
- {
- // Load values from matrix A
- float a0 = LOAD_CURRENT_ITEM(src0_ptr, src0_iter);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- float a1 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- float a2 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- float a3 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
-
- acc0 += b0 * vec4(a0);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 += b0 * vec4(a1);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 += b0 * vec4(a2);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 += b0 * vec4(a3);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- }
-
- /* Multiply by the weight of vector-matrix product */
- acc0 = acc0 * vec4(ALPHA);
- VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, acc0);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 = acc1 * vec4(ALPHA);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), acc1);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 = acc2 * vec4(ALPHA);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), acc2);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 = acc3 * vec4(ALPHA);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), acc3);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-}
-#endif /* GEMM_MM_FLOATING_POINT */
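Stripped of the unrolling and of the per-thread row count, each output row computed above is a plain dot product of one row of A with four columns of B, scaled by ALPHA. A scalar reference written against the same buffers and helper macros as the kernel, as an unoptimised sketch:

// Reference for a single output row of the kernel above.
vec4 gemm_row_reference(ImageIterator a_iter, ImageIterator b_iter)
{
    vec4 acc = vec4(0.0f);
    for(int k = 0; k < COLS_A; ++k)
    {
        float a = LOAD(src0_ptr, IMAGE_OFFSET(a_iter, k, 0));         // A(row, k)
        vec4  b = VLOAD4(vec4, src1_ptr, IMAGE_OFFSET(b_iter, 0, k)); // B(k, col..col+3)
        acc += b * vec4(a);
    }
    return acc * vec4(ALPHA);
}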
-
-#ifdef GEMM_MM_FLOATING_POINT_BIFROST
-/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * It is used when neither matrix A nor matrix B has been reshaped
- *
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
- * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
- * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_attrs The attributes of the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_attrs The attributes of the source matrix
- * @param[out] dst_ptr    Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_attrs The attributes of the destination matrix
- */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes src0_attrs;
- ImageAttributes src1_attrs;
- ImageAttributes dst_attrs;
-};
-TENSOR_DECLARATION(1, src0Buffer, float, src0_ptr, src0_shift, 2, readonly);
-TENSOR_DECLARATION(2, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
-TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-
-void main()
-{
- ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
- ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
-
- int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
-    /* Compute the starting address for matrix A and matrix B */
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y) * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, idx * 4);
-
- /* Reset accumulators */
- vec4 acc0 = vec4(0.0f);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec4 acc1 = vec4(0.0f);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec4 acc2 = vec4(0.0f);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec4 acc3 = vec4(0.0f);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- // A and B src indices get incremented at the same time.
- int i = 0;
- for(; i <= (COLS_A - 4); i += 4)
- {
- // Load values from matrix A and matrix B
- vec4 a0 = VLOAD4_CURRENT_ITEM(vec4, src0_ptr, src0_iter);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec4 a1 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec4 a2 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec4 a3 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
-
- // Multiply and accumulate
- acc0 += b0 * vec4(a0.x);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 += b0 * vec4(a1.x);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 += b0 * vec4(a2.x);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 += b0 * vec4(a3.x);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- // Load values from matrix B
- b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
-
- // Multiply and accumulate
- acc0 += b0 * vec4(a0.y);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 += b0 * vec4(a1.y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 += b0 * vec4(a2.y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 += b0 * vec4(a3.y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- // Load values from matrix B
- b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
-
- // Multiply and accumulate
- acc0 += b0 * vec4(a0.z);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 += b0 * vec4(a1.z);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 += b0 * vec4(a2.z);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 += b0 * vec4(a3.z);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- // Load values from matrix B
- b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
-
- // Multiply and accumulate
- acc0 += b0 * vec4(a0.w);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 += b0 * vec4(a1.w);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 += b0 * vec4(a2.w);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 += b0 * vec4(a3.w);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- TENSOR_ITERATOR_ADVANCE(src0_iter, 4);
- }
-
- for(; i < COLS_A; ++i)
- {
- // Load values from matrix A
- float a0 = LOAD_CURRENT_ITEM(src0_ptr, src0_iter);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- float a1 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- float a2 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- float a3 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
-
- // Multiply and accumulate
- acc0 += b0 * vec4(a0);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 += b0 * vec4(a1);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 += b0 * vec4(a2);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 += b0 * vec4(a3);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
- TENSOR_ITERATOR_ADVANCE(src0_iter, 1);
- }
-
- /* Multiply by the weight of vector-matrix product */
- acc0 = acc0 * vec4(ALPHA);
- VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, acc0);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 = acc1 * vec4(ALPHA);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), acc1);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 = acc2 * vec4(ALPHA);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), acc2);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 = acc3 * vec4(ALPHA);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), acc3);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-}
-#endif /* GEMM_MM_FLOATING_POINT_BIFROST */
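The Bifrost variant above differs from GEMM_MM_FLOATING_POINT only in how it walks the K dimension: it unrolls by four, loading one vec4 of A and issuing four accumulations against four consecutive rows of B, so each A load feeds more arithmetic. One unrolled K-step for a single accumulator row, restated as a sketch:

// b0..b3 are four consecutive rows of B; a holds four consecutive A values.
void bifrost_k_step(inout vec4 acc, vec4 a, vec4 b0, vec4 b1, vec4 b2, vec4 b3)
{
    acc += b0 * vec4(a.x);
    acc += b1 * vec4(a.y);
    acc += b2 * vec4(a.z);
    acc += b3 * vec4(a.w);
}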
-
-#ifdef GEMM_MATRIXADDITION
-/** This OpenGL ES kernel performs the in-place matrix addition between two matrices, taking into account that the second matrix might be weighted by a scalar value beta.
- *
- * @attention The value of beta needs to be passed at compile time using BETA
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src_attrs The attributes of the source matrix
- * @param[out] dst_ptr   Pointer to the destination matrix. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination matrix
- */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes src_attrs;
- ImageAttributes dst_attrs;
-};
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, restrict);
-
-void main(void)
-{
- /* Compute source and destination addresses */
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
-
-    /* Load the alpha * (A * B) result and the matrix C addend */
-    vec4 alpha_ab = VLOAD4_CURRENT_ITEM(vec4, dst_ptr, dst_iter);
-    vec4 c        = VLOAD4_CURRENT_ITEM(vec4, src_ptr, src_iter);
-
-    /* Compute alpha * (A * B) + beta * C */
-    vec4 out1 = alpha_ab + vec4(float(BETA) * c);
-
-    /* Store the final result back into the alpha * (A * B) matrix */
-    VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, out1);
-}
-#endif /* GEMM_MATRIXADDITION */
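Taken together with the multiplication kernels, this completes D = alpha * (A * B) + beta * C: the matrix-multiply kernels leave alpha * (A * B) in the destination, and this kernel folds in beta * C in place. The epilogue as a one-liner, for reference:

// dst holds alpha*(A*B) on entry and the final GEMM result on exit.
vec4 gemm_epilogue(vec4 alpha_ab, vec4 c)
{
    return alpha_ab + float(BETA) * c;
}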
-
-#elif defined(DATA_TYPE_FP16)
-
-#ifdef GEMM_TRANSPOSE1xW
-/** This OpenGL ES kernel computes the "vector" 1x8 transposition of the input matrix
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src_attrs The attributes of the source matrix
- * @param[out] dst_ptr   Pointer to the destination matrix. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination matrix
- */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes src_attrs;
- ImageAttributes dst_attrs;
-};
-TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-
-void main(void)
-{
- /* Compute address for Matrix B - source */
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
-
- /* Compute address for Matrix B transposed - destination. X and Y are swapped */
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, gl_GlobalInvocationID.y * uint(16) + gl_GlobalInvocationID.x * dst_attrs.stride_y);
-
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, LOAD_CURRENT_ITEM(src_ptr, src_iter));
-}
-#endif /* GEMM_TRANSPOSE1xW */
-
-#ifdef GEMM_INTERLEAVE4x4
-/** This OpenGL ES kernel reshapes the input matrix by interleaving the values
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src_attrs The attributes of the source matrix
- * @param[out] dst_ptr   Pointer to the destination matrix. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination matrix
- */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes src_attrs;
- ImageAttributes dst_attrs;
-};
-TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-
-void main(void)
-{
- /* Compute source and destination addresses */
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
-
- vec4 s0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
- vec4 s1[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
- vec4 s2[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
- vec4 s3[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
-
- vec4 s[2];
- s[0] = vec4(s0[0].x, s1[0].x, s2[0].x, s3[0].x);
- s[1] = vec4(s0[0].y, s1[0].y, s2[0].y, s3[0].y);
- STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, s);
-
- s[0] = vec4(s0[0].z, s1[0].z, s2[0].z, s3[0].z);
- s[1] = vec4(s0[0].w, s1[0].w, s2[0].w, s3[0].w);
- STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 1u), s);
-
- s[0] = vec4(s0[1].x, s1[1].x, s2[1].x, s3[1].x);
- s[1] = vec4(s0[1].y, s1[1].y, s2[1].y, s3[1].y);
- STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 2u), s);
-
- s[0] = vec4(s0[1].z, s1[1].z, s2[1].z, s3[1].z);
- s[1] = vec4(s0[1].w, s1[1].w, s2[1].w, s3[1].w);
- STORE_PACK8_HALF(dst_ptr, TENSOR_OFFSET_ADVANCE(dst_iter, 3u), s);
-}
-#endif /* GEMM_INTERLEAVE4x4 */
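All of the FP16 kernels in this file move eight halves at a time through a uvec4, two per 32-bit component. The LOAD_UNPACK8_*_HALF/STORE_PACK8_*_HALF helper bodies fall outside the excerpt of helpers_cs.h below, so the following is a sketch of the packing they presumably implement, in terms of the standard built-ins:

// Eight fp16 values per uvec4: two per component, low bits first.
vec4[2] unpack8_half(uvec4 w)
{
    vec4 r[2];
    r[0] = vec4(unpackHalf2x16(w.x), unpackHalf2x16(w.y));
    r[1] = vec4(unpackHalf2x16(w.z), unpackHalf2x16(w.w));
    return r;
}

uvec4 pack8_half(vec4 v[2])
{
    return uvec4(packHalf2x16(v[0].xy), packHalf2x16(v[0].zw),
                 packHalf2x16(v[1].xy), packHalf2x16(v[1].zw));
}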
-
-#ifdef GEMM_MM_FLOATING_POINT
-/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * It is used when neither matrix A nor matrix B has been reshaped
- *
- * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
- *
- * @param[in]  src0_ptr   Pointer to the source matrix. Supported data types: F16
- * @param[in]  src0_attrs The attributes of the source matrix
- * @param[in]  src1_ptr   Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in]  src1_attrs The attributes of the source matrix
- * @param[out] dst_ptr    Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_attrs The attributes of the destination matrix
- */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes src0_attrs;
- ImageAttributes src1_attrs;
- ImageAttributes dst_attrs;
-};
-
-#if defined(MM_PROCESS_4X)
-TENSOR_DECLARATION(1, src0Buffer, uint, src0_ptr, src0_shift, 2, readonly);
-TENSOR_DECLARATION(2, src1Buffer, uvec2, src1_ptr, src1_shift, 3, readonly);
-TENSOR_DECLARATION(3, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-
-void main()
-{
- ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
- ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
-
- int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
-    /* Compute the starting address for matrix A and matrix B */
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * src0_attrs.stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(idx) * src1_attrs.stride_x);
-
- /* Compute end row address for matrix A */
- uint end_row_vec_a = uint(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) + uint(COLS_A << 1);
-
- /* Reset accumulators */
- vec4 acc0 = vec4(0.0f);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec4 acc1 = vec4(0.0f);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec4 acc2 = vec4(0.0f);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec4 acc3 = vec4(0.0f);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(4));
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 2 * 2), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(2) * src1_attrs.stride_y))
- {
- vec2 a0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec2 a1 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec2 a2 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec2 a3 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- vec4 b0 = LOAD_UNPACK4_CURRENT_ITEM_HALF(src1_ptr, src1_iter);
- vec4 b1 = LOAD_UNPACK4_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, 1));
-
- acc0 += b0 * vec4(a0.x);
- acc0 += b1 * vec4(a0.y);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 += b0 * vec4(a1.x);
- acc1 += b1 * vec4(a1.y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 += b0 * vec4(a2.x);
- acc2 += b1 * vec4(a2.y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 += b0 * vec4(a3.x);
- acc3 += b1 * vec4(a3.y);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- }
-
- for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a); TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 2 * 2), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y))
- {
- vec2 a0 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec2 a1 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec2 a2 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec2 a3 = LOAD_UNPACK2_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- vec4 b0 = LOAD_UNPACK4_CURRENT_ITEM_HALF(src1_ptr, src1_iter);
-
-        acc0 += b0 * vec4(a0.x);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-        acc1 += b0 * vec4(a1.x);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-        acc2 += b0 * vec4(a2.x);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-        acc3 += b0 * vec4(a3.x);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- }
-
- /* Multiply by the weight of vector-matrix product */
- acc0 = acc0 * vec4(ALPHA);
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, acc0);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), acc1);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), acc2);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), acc3);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-}
-#elif defined(MM_PROCESS_4X_OPTIMIZED) /* PROCESS_4X */
-TENSOR_DECLARATION(1, src0Buffer, uvec4, src0_ptr, src0_shift, 4, readonly);
-TENSOR_DECLARATION(2, src1Buffer, uvec2, src1_ptr, src1_shift, 3, readonly);
-TENSOR_DECLARATION(3, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-
-void main()
-{
- ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
- ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
-
- int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
-    /* Compute the starting address for matrix A and matrix B */
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * src0_attrs.stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(idx) * src1_attrs.stride_x);
-
- /* Compute end row address for matrix A */
- uint end_row_vec_a = uint(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) + uint(COLS_A << 1);
-
- /* Reset accumulators */
- vec4 acc0 = vec4(0.0f);
-
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec4 acc1 = vec4(0.0f);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec4 acc2 = vec4(0.0f);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec4 acc3 = vec4(0.0f);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(16));
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * src0_attrs.stride_x), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
- {
- vec4 a0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
-
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec4 a1[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec4 a2[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec4 a3[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- vec4 b;
-
- for(int i = 0; i < 8; i++)
- {
- int j = i >> 2;
- int k = i % 4;
-
- b = LOAD_UNPACK4_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, i));
-
- acc0 += b * vec4(a0[j][k]);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 += b * vec4(a1[j][k]);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 += b * vec4(a2[j][k]);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 += b * vec4(a3[j][k]);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- }
- }
-
- for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a); TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 2 * 8), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
- {
- vec4 a0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
-
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- vec4 a1[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- vec4 a2[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- vec4 a3[2] = LOAD_UNPACK8_HALF(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-
- vec4 b;
-
- int leftover = COLS_A % 8;
-
- for(int i = 0; i < leftover; i++)
- {
- int j = i >> 2;
- int k = i % 4;
-
- b = LOAD_UNPACK4_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, i));
-
- acc0 += b * vec4(a0[j][k]);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 += b * vec4(a1[j][k]);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 += b * vec4(a2[j][k]);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 += b * vec4(a3[j][k]);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- }
- }
-
- /* Multiply by the weight of vector-matrix product */
- acc0 = acc0 * vec4(ALPHA);
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, acc0);
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), acc1);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), acc2);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), acc3);
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-}
-#elif defined(MM_PROCESS_8X) /* PROCESS_8X */
-TENSOR_DECLARATION(1, src0Buffer, uvec4, src0_ptr, src0_shift, 4, readonly);
-TENSOR_DECLARATION(2, src1Buffer, uvec4, src1_ptr, src1_shift, 4, readonly);
-TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-
-void main()
-{
- ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
- ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
-
- int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
-    /* Compute the starting address for matrix A and matrix B */
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * src0_attrs.stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(idx) * src1_attrs.stride_x);
-
- /* Compute end row address for matrix A */
- uint end_row_vec_a = uint(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) + uint(COLS_A << 1);
-
- /* Reset accumulators */
- vec4 acc[2];
-
- acc[0] = vec4(0.0f);
- acc[1] = vec4(0.0f);
-
- for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) <= int(end_row_vec_a - uint(16));
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * src0_attrs.stride_x), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
- {
- vec4 a[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
- vec4 b[2];
-
- for(int i = 0; i < 8; i++)
- {
- int j = i >> 2;
- int k = i % 4;
-
- b = LOAD_UNPACK8_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, i));
-
- acc[0] += b[0] * vec4(a[j][k]);
- acc[1] += b[1] * vec4(a[j][k]);
- }
- }
-
- for(; int(CURRENT_ITEM_OFFSET_IN_BYTES(src0_iter)) < int(end_row_vec_a);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(8) * uint(2)), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(8) * src1_attrs.stride_y))
- {
- vec4 a[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
- vec4 b[2];
-
- int leftover = COLS_A % 8;
-
- for(int i = 0; i < leftover; i++)
- {
- int j = i >> 2;
- int k = i % 4;
-
- b = LOAD_UNPACK8_HALF(src1_ptr, IMAGE_OFFSET(src1_iter, 0, i));
-
- acc[0] += b[0] * vec4(a[j][k]);
- acc[1] += b[1] * vec4(a[j][k]);
- }
- }
-
- /* Multiply by the weight of vector-matrix product */
- acc[0] = acc[0] * vec4(ALPHA);
- acc[1] = acc[1] * vec4(ALPHA);
-
- STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, acc);
-}
-#endif /* PROCESS_8X */
-#endif /* GEMM_MM_FLOATING_POINT */
-
-#ifdef GEMM_ACCUMULATE_BIASES
-#if defined(ACCUM_PROCESS_4X)
-/** This kernel adds the biases vector to each row of the accumulation tensor
- *
- * @param[in, out] accum_ptr    Pointer to the accumulation tensor. Supported data type: F16
- * @param[in]      accum_attrs  The attributes of the accumulation tensor
- * @param[in]      biases_ptr   Pointer to the biases vector. Supported data type: same as @p accum_ptr
- * @param[in] biases_attrs The attributes of the biases tensor
- */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes accum_attrs;
- VectorAttributes biases_attrs;
-};
-
-TENSOR_DECLARATION(1, accumBuffer, uvec2, accum_ptr, accum_shift, 3, restrict);
-TENSOR_DECLARATION(2, biasesBuffer, uvec2, biases_ptr, biases_shift, 3, readonly);
-
-void main(void)
-{
- ImageIterator accum_iter = CONVERT_TO_IMAGE_ITERATOR(accum_attrs, accum_shift);
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR(biases_attrs, biases_shift);
-
- vec4 u[2];
- u[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(accum_ptr, accum_iter);
- u[1] = LOAD_UNPACK4_CURRENT_ITEM_HALF(biases_ptr, biases_iter);
-
- vec4 tmp;
- tmp = u[0] + u[1];
- STORE_PACK4_CURRENT_ITEM_HALF(accum_ptr, accum_iter, tmp);
-}
-#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_8X */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes accum_attrs;
- VectorAttributes biases_attrs;
-};
-
-TENSOR_DECLARATION(1, accumBuffer, uvec4, accum_ptr, accum_shift, 4, restrict);
-TENSOR_DECLARATION(2, biasesBuffer, uvec4, biases_ptr, biases_shift, 4, readonly);
-
-void main(void)
-{
- ImageIterator accum_iter = CONVERT_TO_IMAGE_ITERATOR(accum_attrs, accum_shift);
- VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR(biases_attrs, biases_shift);
-
- vec4 u[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(accum_ptr, accum_iter);
- vec4 v[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(biases_ptr, biases_iter);
-
- vec4 r[2];
- r[0] = u[0] + v[0];
- r[1] = u[1] + v[1];
- STORE_PACK8_CURRENT_ITEM_HALF(accum_ptr, accum_iter, r);
-}
-#endif /* ACCUM_PROCESS_8X */
-#endif /* GEMM_ACCUMULATE_BIASES */
-
-#ifdef GEMM_MM_INTERLEAVED_TRANSPOSED
-/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
- * Matrix A and matrix B must be reshaped respectively with the FP16 GEMM_INTERLEAVE4x4 and GEMM_TRANSPOSE1xW (1x8) kernels above before running the matrix multiplication
- *
- * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src0_attrs The attributes of the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in] src1_attrs The attributes of the source matrix
- * @param[out] dst_ptr    Pointer to the destination matrix. Supported data types: same as @p src0_ptr
- * @param[in] dst_attrs The attributes of the destination matrix
- */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes src0_attrs;
- ImageAttributes src1_attrs;
- ImageAttributes dst_attrs;
-};
-TENSOR_DECLARATION(1, src0Buffer, uvec2, src0_ptr, src0_shift, 3, readonly);
-TENSOR_DECLARATION(2, src1Buffer, uvec4, src1_ptr, src1_shift, 4, readonly);
-TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-
-void main()
-{
- ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
- ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
-
- /* Compute address for matrix A and B */
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y));
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, uint(gl_GlobalInvocationID.x) * (src1_attrs.stride_y));
- /* Compute end row address for matrix B */
- int end_row_mtx_b = (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) + int(COLS_B);
-
- /* Reset accumulators */
- vec4 c00[2];
- vec4 c10[2];
- vec4 c20[2];
- vec4 c30[2];
- c00[0] = vec4(0.0f);
- c00[1] = vec4(0.0f);
- c10[0] = vec4(0.0f);
- c10[1] = vec4(0.0f);
- c20[0] = vec4(0.0f);
- c20[1] = vec4(0.0f);
- c30[0] = vec4(0.0f);
- c30[1] = vec4(0.0f);
-
- // FIXME: loop unrolling really needed for GLES?
- for(; (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) <= (end_row_mtx_b - 16); TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 16), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, 32))
- {
- /* Load values from matrix A (interleaved) and matrix B (transposed) */
- vec4 a0 = LOAD_UNPACK4_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
- vec4 b0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter);
-
- c00[0] += vec4(a0.x) * b0[0];
- c00[1] += vec4(a0.x) * b0[1];
- c10[0] += vec4(a0.y) * b0[0];
- c10[1] += vec4(a0.y) * b0[1];
- c20[0] += vec4(a0.z) * b0[0];
- c20[1] += vec4(a0.z) * b0[1];
- c30[0] += vec4(a0.w) * b0[0];
- c30[1] += vec4(a0.w) * b0[1];
-
- /* Load values from matrix A (interleaved) and matrix B (transposed) */
- a0 = LOAD_UNPACK4_HALF(src0_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src0_iter, 8));
- b0 = LOAD_UNPACK8_HALF(src1_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src1_iter, 16));
-
- c00[0] += vec4(a0.x) * b0[0];
- c00[1] += vec4(a0.x) * b0[1];
- c10[0] += vec4(a0.y) * b0[0];
- c10[1] += vec4(a0.y) * b0[1];
- c20[0] += vec4(a0.z) * b0[0];
- c20[1] += vec4(a0.z) * b0[1];
- c30[0] += vec4(a0.w) * b0[0];
- c30[1] += vec4(a0.w) * b0[1];
- }
-
- for(; (int(CURRENT_ITEM_OFFSET_IN_BYTES(src1_iter)) >> 1) < end_row_mtx_b; TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, 8), TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, 16))
- {
- /* Load values from matrix A (interleaved) and matrix B (transposed) */
- vec4 a0 = LOAD_UNPACK4_CURRENT_ITEM_HALF(src0_ptr, src0_iter);
- vec4 b0[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src1_ptr, src1_iter);
-
- c00[0] += vec4(a0.x) * b0[0];
- c00[1] += vec4(a0.x) * b0[1];
- c10[0] += vec4(a0.y) * b0[0];
- c10[1] += vec4(a0.y) * b0[1];
- c20[0] += vec4(a0.z) * b0[0];
- c20[1] += vec4(a0.z) * b0[1];
- c30[0] += vec4(a0.w) * b0[0];
- c30[1] += vec4(a0.w) * b0[1];
- }
-
- /* Multiply by the weight of matrix product */
- c00[0] = c00[0] * vec4(ALPHA);
- c00[1] = c00[1] * vec4(ALPHA);
- c10[0] = c10[0] * vec4(ALPHA);
- c10[1] = c10[1] * vec4(ALPHA);
- c20[0] = c20[0] * vec4(ALPHA);
- c20[1] = c20[1] * vec4(ALPHA);
- c30[0] = c30[0] * vec4(ALPHA);
- c30[1] = c30[1] * vec4(ALPHA);
-
- /* Store 4x8 block */
- STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), c00);
- STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), c10);
- STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), c20);
- STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), c30);
-}
-#endif /* GEMM_MM_INTERLEAVED_TRANSPOSED */
-#else /* DATA_TYPE_FP16 */
-#error Data type not supported
-#endif /* DATA_TYPE_FP32 */
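Every variant in this file is selected purely through preprocessor defines injected when the shader is built; none of the kernels is compiled unless its guard is set. As an illustration only (the concrete strings are generated by the host-side functions and are not part of this diff), one FP32 non-reshaped build might define:

// Illustrative build configuration for a GEMM_MM_FLOATING_POINT compile.
#define DATA_TYPE_FP32
#define GEMM_MM_FLOATING_POINT
#define LOCAL_SIZE_X 4
#define LOCAL_SIZE_Y 4
#define LOCAL_SIZE_Z 1
#define COLS_A 64
#define ALPHA 1.0f
#define NUM_ELEMS_PROCESSED_PER_THREAD_X 4
#define NUM_ELEMS_PROCESSED_PER_THREAD_Y 4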
diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
deleted file mode 100644
index 4e3551700f..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
+++ /dev/null
@@ -1,498 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_HELPER_CS_H
-#define ARM_COMPUTE_HELPER_CS_H
-
-#define SHADER_PARAMS_DECLARATION \
- layout(std140, binding = 0) uniform shader_params
-
-#define TENSOR_DECLARATION(location, buffer_type, type, ptr_name, shift_name, element_shift, access) \
- layout(std430, binding = location) access buffer buffer_type \
- { \
- type ptr_name[]; \
- }; \
- const uint shift_name = uint(element_shift)
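For reference, a declaration such as TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly) expands, by the macro above, to:

layout(std430, binding = 1) readonly buffer srcBuffer
{
    float src_ptr[];
};
const uint src_shift = uint(2); // log2 of the element size: byte offset >> 2 = float index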
-
-struct VectorAttributes
-{
- uint stride_x; /**< Stride of the vector in X dimension (in bytes) */
- uint step_x; /**< stride_x * number of elements along X processed per workitem (in bytes) */
- uint offset_first_element_in_bytes; /**< The offset of the first element in the vector (in bytes) */
-    uint padding;                        /**< The padding to round up the structure to a multiple of a vec4 */
-};
-
-struct ImageAttributes
-{
- uint stride_x; /**< Stride of the image in X dimension (in bytes) */
- uint step_x; /**< stride_x * number of elements along X processed per workitem (in bytes) */
- uint stride_y; /**< Stride of the image in Y dimension (in bytes) */
- uint step_y; /**< stride_y * number of elements along Y processed per workitem (in bytes) */
- uint offset_first_element_in_bytes; /**< The offset of the first element in the image (in bytes) */
-    uint padding1;                      /**< The padding to round up the structure to a multiple of a vec4 */
-    uint padding2;                      /**< The padding to round up the structure to a multiple of a vec4 */
-    uint padding3;                      /**< The padding to round up the structure to a multiple of a vec4 */
-};
-
-struct Tensor3DAttributes
-{
- uint stride_x; /**< Stride of the tensor in X dimension (in bytes) */
- uint step_x; /**< stride_x * number of elements along X processed per workitem (in bytes) */
- uint stride_y; /**< Stride of the tensor in Y dimension (in bytes) */
- uint step_y; /**< stride_y * number of elements along Y processed per workitem (in bytes) */
- uint stride_z; /**< Stride of the tensor in Z dimension (in bytes) */
- uint step_z; /**< stride_z * number of elements along Z processed per workitem (in bytes) */
- uint offset_first_element_in_bytes; /**< The offset of the first element in the tensor (in bytes) */
-    uint padding;                        /**< The padding to round up the structure to a multiple of a vec4 */
-};
-
-struct VectorIterator
-{
- int current_offset_in_bytes; /**< Current offset of vector (in bytes) */
- int stride_x; /**< Stride of the vector in X dimension (in bytes) */
- int element_shift; /**< The number of bits to shift by for one element */
-};
-
-struct ImageIterator
-{
- int current_offset_in_bytes; /**< Current offset of image (in bytes) */
- int stride_x; /**< Stride of the image in X dimension (in bytes) */
- int stride_y; /**< Stride of the image in Y dimension (in bytes) */
- int element_shift; /**< The number of bits to shift by for one element */
-};
-
-struct Tensor3DIterator
-{
- int current_offset_in_bytes; /**< Current offset of tensor (in bytes) */
- int stride_x; /**< Stride of the tensor in X dimension (in bytes) */
- int stride_y; /**< Stride of the tensor in Y dimension (in bytes) */
- int stride_z; /**< Stride of the tensor in Z dimension (in bytes) */
- int element_shift; /**< The number of bits to shift by for one element */
-};
-
-#define CONVERT_TO_VECTOR_ITERATOR(attrs, element_shift) \
- update_vector_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
- attrs.stride_x, attrs.step_x)
-
-#define CONVERT_TO_VECTOR_ITERATOR_NO_STEP(attrs, element_shift) \
- update_vector_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
- attrs.stride_x, uint(0))
-
-#define CONVERT_TO_IMAGE_ITERATOR(attrs, element_shift) \
- update_image_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
- attrs.stride_x, attrs.step_x, attrs.stride_y, attrs.step_y)
-
-#define CONVERT_TO_IMAGE_ITERATOR_NO_STEP(attrs, element_shift) \
- update_image_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
- attrs.stride_x, uint(0), attrs.stride_y, uint(0))
-
-#define CONVERT_TO_TENSOR3D_ITERATOR(attrs, element_shift) \
- update_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
- attrs.stride_x, attrs.step_x, attrs.stride_y, attrs.step_y, attrs.stride_z, attrs.step_z)
-
-#define CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(attrs, element_shift) \
- update_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
- attrs.stride_x, uint(0), attrs.stride_y, uint(0), attrs.stride_z, uint(0))
-
-#define CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(attrs, element_shift) \
- update_image_from_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
- attrs.stride_x, attrs.step_x, attrs.stride_y, attrs.step_y, attrs.stride_z, attrs.step_z)
-
-#define CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(attrs, element_shift) \
- update_image_from_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
- attrs.stride_x, uint(0), attrs.stride_y, uint(0), attrs.stride_z, attrs.step_z)
-
-/** Wrap vector information into a VectorIterator structure, and set the offset to this workitem's position.
- *
- * @param[in] element_shift The number of bits to shift by for one element
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
- * @param[in] stride_x Stride of the vector in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per workitem (in bytes)
- *
- * @return A VectorIterator object
- */
-VectorIterator update_vector_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
-{
- VectorIterator vector_iter;
- vector_iter.element_shift = int(element_shift);
- vector_iter.stride_x = int(stride_x);
- vector_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x);
-
- return vector_iter;
-}
-
-/** Wrap image information into an ImageIterator structure, and set the offset to this workitem's position.
- *
- * @param[in] element_shift The number of bits to shift by for one element
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] stride_x Stride of the image in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in] stride_y Stride of the image in Y dimension (in bytes)
- * @param[in] step_y stride_y * number of elements along Y processed per workitem (in bytes)
- *
- * @return An ImageIterator object
- */
-ImageIterator update_image_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
-{
- ImageIterator image_iter;
- image_iter.element_shift = int(element_shift);
- image_iter.stride_x = int(stride_x);
- image_iter.stride_y = int(stride_y);
- image_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y);
-
- return image_iter;
-}
-
-/** Wrap 3D tensor information into a Tensor3DIterator structure, and set the offset to this workitem's position.
- *
- * @param[in] element_shift The number of bits to shift by for one element
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] stride_x                       Stride of the tensor in X dimension (in bytes)
- * @param[in] step_x                         stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in] stride_y                       Stride of the tensor in Y dimension (in bytes)
- * @param[in] step_y                         stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in] stride_z                       Stride of the tensor in Z dimension (in bytes)
- * @param[in] step_z stride_z * number of elements along Z processed per workitem (in bytes)
- *
- * @return A Tensor3DIterator object
- */
-Tensor3DIterator update_tensor3D_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
-{
- Tensor3DIterator tensor_iter;
- tensor_iter.element_shift = int(element_shift);
- tensor_iter.stride_x = int(stride_x);
- tensor_iter.stride_y = int(stride_y);
- tensor_iter.stride_z = int(stride_z);
- tensor_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z);
-
- return tensor_iter;
-}
-
-/** Wrap 3D tensor information into an ImageIterator structure, and set the offset to this workitem's position.
- *
- * @param[in] element_shift The number of bits to shift by for one element
- * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] stride_x Stride of the tensor in X dimension (in bytes)
- * @param[in] step_x stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in] stride_y Stride of the tensor in Y dimension (in bytes)
- * @param[in] step_y stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in] stride_z Stride of the tensor in Z dimension (in bytes)
- * @param[in] step_z stride_z * number of elements along Z processed per workitem (in bytes)
- *
- * @return An ImageIterator object
- */
-ImageIterator update_image_from_tensor3D_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
-{
- ImageIterator image_iter;
- image_iter.element_shift = int(element_shift);
- image_iter.stride_x = int(stride_x);
- image_iter.stride_y = int(stride_y);
- image_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z);
-
- return image_iter;
-}
-
-#define VECTOR_OFFSET(tensor_iter, x) \
- uint(vector_offset_in_bytes(tensor_iter, int(x)) >> tensor_iter.element_shift)
-
-#define IMAGE_OFFSET(tensor_iter, x, y) \
- uint(image_offset_in_bytes(tensor_iter, int(x), int(y)) >> tensor_iter.element_shift)
-
-#define TENSOR3D_OFFSET(tensor_iter, x, y, z) \
- uint(tensor3D_offset_in_bytes(tensor_iter, int(x), int(y), int(z)) >> tensor_iter.element_shift)
-
-#define TENSOR_OFFSET_ADVANCE(tensor_iter, n) \
- uint((tensor_iter.current_offset_in_bytes >> tensor_iter.element_shift) + int(n))
-
-#define TENSOR_OFFSET_ADVANCE_IN_BYTES(tensor_iter, n) \
- uint((tensor_iter.current_offset_in_bytes + int(n)) >> tensor_iter.element_shift)
-
-#define CURRENT_ITEM_OFFSET(tensor_iter) \
- uint(tensor_iter.current_offset_in_bytes >> tensor_iter.element_shift)
-
-#define CURRENT_ITEM_OFFSET_IN_BYTES(tensor_iter) \
- uint(tensor_iter.current_offset_in_bytes)
-
-#define TENSOR_ITERATOR_ADVANCE(tensor_iter, n) \
- tensor_iter.current_offset_in_bytes += (int(n) << tensor_iter.element_shift)
-
-#define TENSOR_ITERATOR_ADVANCE_IN_BYTES(tensor_iter, n) \
- tensor_iter.current_offset_in_bytes += int(n)
-
-#define SET_TENSOR_ITERATOR_OFFSET_IN_BYTES(tensor_iter, n) \
- tensor_iter.current_offset_in_bytes = int(n)
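-
-// Illustrative usage of the iterator macros above (hypothetical loop, assuming
-// an F32 buffer bound as src_ptr with iterator src_iter):
-//
-//   for(int i = 0; i < 4; ++i)
-//   {
-//       float v = LOAD(src_ptr, CURRENT_ITEM_OFFSET(src_iter)); // element under the iterator
-//       TENSOR_ITERATOR_ADVANCE(src_iter, 1);                   // move one element (1 << element_shift bytes)
-//   }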
-
-/** Get the offset of a VectorIterator
- *
- * @param[in] vector_iter The VectorIterator object pointing to the starting position of the buffer
- * @param[in] x Relative X position
- *
- * @return The relative offset of the VectorIterator object (in bytes)
- */
-uint vector_offset_in_bytes(VectorIterator vector_iter, int x)
-{
- return uint(vector_iter.current_offset_in_bytes + x * vector_iter.stride_x);
-}
-
-/** Get the offset of an ImageIterator
- *
- * @param[in] image_iter The ImageIterator object pointing to the starting position of the buffer
- * @param[in] x Relative X position
- * @param[in] y Relative Y position
- *
- * @return The relative offset of the ImageIterator object (in bytes)
- */
-uint image_offset_in_bytes(ImageIterator image_iter, int x, int y)
-{
- return uint(image_iter.current_offset_in_bytes + x * image_iter.stride_x + y * image_iter.stride_y);
-}
-
-/** Get the offset of a Tensor3DIterator
- *
- * @param[in] tensor_iter The Tensor3DIterator object pointing to the starting position of the buffer
- * @param[in] x Relative X position
- * @param[in] y Relative Y position
- * @param[in] z Relative Z position
- *
- * @return The relative offset of the Tensor3DIterator object (in bytes)
- */
-uint tensor3D_offset_in_bytes(Tensor3DIterator tensor_iter, int x, int y, int z)
-{
- return uint(tensor_iter.current_offset_in_bytes + x * tensor_iter.stride_x + y * tensor_iter.stride_y + z * tensor_iter.stride_z);
-}
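-
-// Illustrative example (hypothetical values): with stride_x = 4, stride_y = 64,
-// stride_z = 1024 and current_offset_in_bytes = 132, tensor3D_offset_in_bytes(iter, 1, -1, 0)
-// returns 132 + 4 - 64 = 72 bytes; the TENSOR3D_OFFSET macro then shifts this down
-// by element_shift to yield an element index into the typed buffer.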
-
-#define LOAD(tensor_ptr, offset) tensor_ptr[offset]
-#define STORE(tensor_ptr, offset, data) tensor_ptr[offset] = data
-#define LOAD_CURRENT_ITEM(tensor_ptr, tensor_iter) tensor_ptr[CURRENT_ITEM_OFFSET(tensor_iter)]
-#define STORE_CURRENT_ITEM(tensor_ptr, tensor_iter, data) tensor_ptr[CURRENT_ITEM_OFFSET(tensor_iter)] = data
-
-#define VLOAD2(return_type, tensor_ptr, offset) \
- return_type(LOAD(tensor_ptr, offset), \
- LOAD(tensor_ptr, (offset) + uint(1)))
-
-#define VSTORE2(tensor_ptr, offset, data) \
- STORE(tensor_ptr, offset, data[0]); \
- STORE(tensor_ptr, (offset) + uint(1), data[1])
-
-#define VLOAD2_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD2(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
-#define VSTORE2_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE2(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
-
-#define VLOAD3(return_type, tensor_ptr, offset) \
- return_type(LOAD(tensor_ptr, offset), \
- LOAD(tensor_ptr, (offset) + uint(1)), \
- LOAD(tensor_ptr, (offset) + uint(2)))
-
-#define VSTORE3(tensor_ptr, offset, data) \
- STORE(tensor_ptr, offset, data[0]); \
- STORE(tensor_ptr, (offset) + uint(1), data[1]); \
- STORE(tensor_ptr, (offset) + uint(2), data[2])
-
-#define VLOAD3_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD3(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
-#define VSTORE3_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE3(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
-
-#define VLOAD4(return_type, tensor_ptr, offset) \
- return_type(LOAD(tensor_ptr, offset), \
- LOAD(tensor_ptr, (offset) + uint(1)), \
- LOAD(tensor_ptr, (offset) + uint(2)), \
- LOAD(tensor_ptr, (offset) + uint(3)))
-
-#define VSTORE4(tensor_ptr, offset, data) \
- STORE(tensor_ptr, offset, data[0]); \
- STORE(tensor_ptr, (offset) + uint(1), data[1]); \
- STORE(tensor_ptr, (offset) + uint(2), data[2]); \
- STORE(tensor_ptr, (offset) + uint(3), data[3])
-
-#define VLOAD4_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD4(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
-#define VSTORE4_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE4(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
-
-#define VLOAD5(return_type, tensor_ptr, offset) \
- return_type(LOAD(tensor_ptr, offset), \
- LOAD(tensor_ptr, (offset) + uint(1)), \
- LOAD(tensor_ptr, (offset) + uint(2)), \
- LOAD(tensor_ptr, (offset) + uint(3)), \
- LOAD(tensor_ptr, (offset) + uint(4)))
-
-#define VSTORE5(tensor_ptr, offset, data) \
- STORE(tensor_ptr, offset, data[0]); \
- STORE(tensor_ptr, (offset) + uint(1), data[1]); \
- STORE(tensor_ptr, (offset) + uint(2), data[2]); \
- STORE(tensor_ptr, (offset) + uint(3), data[3]); \
- STORE(tensor_ptr, (offset) + uint(4), data[4])
-
-#define VLOAD5_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD5(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
-#define VSTORE5_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE5(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
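-
-// Illustrative usage (hypothetical copy of 4 consecutive F32 values, assuming
-// float buffers src_ptr/dst_ptr with iterators src_iter/dst_iter):
-//
-//   vec4 block = VLOAD4_CURRENT_ITEM(vec4, src_ptr, src_iter);
-//   VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, block);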
-
-/** Convert a vec4 object to 4 half-precision (16-bit) floating-point values and pack them into a uvec2 object
- *
- * @param[in] data The vec4 object to be packed
- *
- * @return The packed uvec2 object
- */
-highp uvec2 pack4_half(mediump vec4 data)
-{
- return uvec2(packHalf2x16(data.xy), packHalf2x16(data.zw));
-}
-
-/** Unpack a uvec2 object into 4 half-precision (16-bit) floating-point values and convert them to a vec4 object
- *
- * @param[in] packed_data The uvec2 object to be unpacked
- *
- * @return The unpacked vec4 object
- */
-mediump vec4 unpack4_half(highp uvec2 packed_data)
-{
- return vec4(unpackHalf2x16(packed_data.x), unpackHalf2x16(packed_data.y));
-}
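-
-// Sketch of the round trip: pack4_half and unpack4_half are inverses up to FP16
-// rounding, e.g. unpack4_half(pack4_half(vec4(1.0, 2.0, 3.0, 4.0))) returns
-// vec4(1.0, 2.0, 3.0, 4.0), since these values are exactly representable in FP16.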
-
-/** Unpack a uvec3 object into 6 half-precision (16-bit) floating-point values and convert them to a vec2[3] object
- *
- * @param[in] packed_data The uvec3 object to be unpacked
- *
- * @return The unpacked vec2[3] object
- */
-mediump vec2[3] unpack6_half(highp uvec3 packed_data)
-{
- return vec2[3](unpackHalf2x16(packed_data[0]),
- unpackHalf2x16(packed_data[1]),
- unpackHalf2x16(packed_data[2]));
-}
-
-/** Convert a vec4[2] object to 8 half-precision (16-bit) floating-point values and pack them into a uvec4 object
- *
- * @param[in] data The vec4[2] object to be packed
- *
- * @return The packed uvec4 object
- */
-highp uvec4 pack8_half(mediump vec4 data[2])
-{
- return uvec4(packHalf2x16(data[0].xy), packHalf2x16(data[0].zw),
- packHalf2x16(data[1].xy), packHalf2x16(data[1].zw));
-}
-
-/** Unpack a uvec4 object into 8 half-precision (16-bit) floating-point values and convert them to a vec4[2] object
- *
- * @param[in] packed_data The uvec4 object to be unpacked
- *
- * @return The unpacked vec4[2] object
- */
-mediump vec4[2] unpack8_half(highp uvec4 packed_data)
-{
- return vec4[2](vec4(unpackHalf2x16(packed_data.x), unpackHalf2x16(packed_data.y)),
- vec4(unpackHalf2x16(packed_data.z), unpackHalf2x16(packed_data.w)));
-}
-
-/** Unpack a uvec2[3] object into 12 half-precision (16-bit) floating-point values and convert them to a vec4[3] object
- *
- * @param[in] packed_data The uvec2[3] object to be unpacked
- *
- * @return The unpacked vec4[3] object
- */
-mediump vec4[3] unpack12_half(highp uvec2[3] packed_data)
-{
- return vec4[3](vec4(unpackHalf2x16(packed_data[0].x), unpackHalf2x16(packed_data[0].y)),
- vec4(unpackHalf2x16(packed_data[1].x), unpackHalf2x16(packed_data[1].y)),
- vec4(unpackHalf2x16(packed_data[2].x), unpackHalf2x16(packed_data[2].y)));
-}
-
-// For half-precision (16-bit) floating-point values packed into a "uint" element
-#define LOAD_UNPACK2_HALF(tensor_ptr, offset) unpackHalf2x16(uint(LOAD(tensor_ptr, offset)))
-#define STORE_PACK2_HALF(tensor_ptr, offset, data) STORE(tensor_ptr, offset, packHalf2x16(data))
-#define LOAD_UNPACK2_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) LOAD_UNPACK2_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
-#define STORE_PACK2_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) STORE_PACK2_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
-
-#define VLOAD2_UNPACK4_HALF(tensor_ptr, offset) unpack4_half(VLOAD2(uvec2, tensor_ptr, offset))
-#define VSTORE2_PACK4_HALF(tensor_ptr, offset, data) VSTORE2(tensor_ptr, offset, pack4_half(data))
-#define VLOAD2_UNPACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD2_UNPACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
-#define VSTORE2_PACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE2_PACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
-
-#define VLOAD3_UNPACK6_HALF(tensor_ptr, offset) unpack6_half(VLOAD3(uvec3, tensor_ptr, offset))
-#define VLOAD3_UNPACK6_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD3_UNPACK6_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
-
-#define VLOAD4_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(VLOAD4(uvec4, tensor_ptr, offset))
-#define VSTORE4_PACK8_HALF(tensor_ptr, offset, data) VSTORE4(tensor_ptr, offset, pack8_half(data))
-#define VLOAD4_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD4_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
-#define VSTORE4_PACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE4_PACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
-
-// For half-precision (16-bit) floating-point values packed into a "uvec2" element
-#define LOAD_UNPACK4_HALF(tensor_ptr, offset) unpack4_half(uvec2(LOAD(tensor_ptr, offset)))
-#define STORE_PACK4_HALF(tensor_ptr, offset, data) STORE(tensor_ptr, offset, pack4_half(data))
-#define LOAD_UNPACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) LOAD_UNPACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
-#define STORE_PACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) STORE_PACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
-
-#define VLOAD2_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(VLOAD2(uvec4, tensor_ptr, offset))
-#define VSTORE2_PACK8_HALF(tensor_ptr, offset, data) VSTORE2(tensor_ptr, offset, pack8_half(data))
-#define VLOAD2_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD2_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
-#define VSTORE2_PACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE2_PACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
-
-#define VLOAD3_UNPACK12_HALF(tensor_ptr, offset) unpack12_half(VLOAD3(uvec2[3], tensor_ptr, offset))
-#define VLOAD3_UNPACK12_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD3_UNPACK12_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
-
-// For half-precision (16-bit) floating-point values packed into a "uvec4" element
-#define LOAD_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(uvec4(LOAD(tensor_ptr, offset)))
-#define STORE_PACK8_HALF(tensor_ptr, offset, data) STORE(tensor_ptr, offset, pack8_half(data))
-#define LOAD_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) LOAD_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
-#define STORE_PACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) STORE_PACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
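-
-// Summary of the three packing granularities above: the element type of the bound
-// buffer decides which family applies ("uint" holds 2 halves, "uvec2" holds 4,
-// "uvec4" holds 8); the *_CURRENT_ITEM_* forms read or write at the iterator's
-// current element offset.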
-
-/** Convert a uvec4 object to 4 low-precision (8-bit) uint values and pack them into a single uint object
- *
- * @param[in] data The uvec4 object to be packed
- *
- * @return The packed uint object
- */
-highp uint pack4_u8(lowp uvec4 data)
-{
- highp uint r = uint(0);
-
- for(int i = 0; i < 4; i++)
- {
- r |= data[i] << uint(i * 8);
- }
-
- return r;
-}
-
-/** Unpack a uint object into 4 low-precision (8-bit) uint values and convert them to a uvec4 object
- *
- * @param[in] packed_data The uint object to be unpacked
- *
- * @return The unpacked uvec4 object
- */
-lowp uvec4 unpack4_u8(highp uint packed_data)
-{
- lowp uvec4 uvec;
-
- for(int i = 0; i < 4; i++)
- {
- uvec[i] = (packed_data >> uint(i * 8)) & uint(0xFF);
- }
-
- return uvec;
-}
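-
-// Illustrative example: pack4_u8(uvec4(0x11, 0x22, 0x33, 0x44)) yields 0x44332211u,
-// since byte i is shifted into bits [8*i, 8*i + 7]; unpack4_u8(0x44332211u) recovers
-// uvec4(0x11, 0x22, 0x33, 0x44).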
-
-#define LOAD_UNPACK4_U8(tensor_ptr, offset) unpack4_u8(uint(LOAD(tensor_ptr, offset)))
-#define STORE_PACK4_U8(tensor_ptr, offset, data) STORE(tensor_ptr, offset, pack4_u8(data))
-#define LOAD_UNPACK4_CURRENT_ITEM_U8(tensor_ptr, tensor_iter) LOAD_UNPACK4_U8(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
-#define STORE_PACK4_CURRENT_ITEM_U8(tensor_ptr, tensor_iter, data) STORE_PACK4_U8(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
-
-#endif // ARM_COMPUTE_HELPER_CS_H
diff --git a/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
deleted file mode 100644
index a5ec68c0c5..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-/** Apply cross-map or in-map normalization (selected at compile time via CROSS_MAP or IN_MAP_1D)
- *
- * @note Alpha parameter / norm_size should be given as a preprocessor argument using "#define COEFF x"
- * @note BETA parameter in the normalization equation should be given as a preprocessor argument using "#define BETA x"
- * @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x"
- * @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x"
- *
- * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F32
- * @param[in] src1_attrs The attributes of the first source tensor
- * @param[in] src2_ptr Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
- * @param[in] src2_attrs The attributes of the second source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- */
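-
-// In both variants below the kernel computes, with acc the sum over the
-// RADIUS-neighbourhood of src2 along the normalized dimension:
-//   dst(x) = src1(x) / pow(KAPPA + COEFF * acc, BETA)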
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src1_attrs;
- Tensor3DAttributes src2_attrs;
- Tensor3DAttributes dst_attrs;
-};
-TENSOR_DECLARATION(1, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
-TENSOR_DECLARATION(2, src2Buffer, float, src2_ptr, src2_shift, 2, readonly);
-TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-
-#ifdef CROSS_MAP
-void main(void)
-{
- Tensor3DIterator src1_iter = CONVERT_TO_TENSOR3D_ITERATOR(src1_attrs, src1_shift);
- Tensor3DIterator src2_iter = CONVERT_TO_TENSOR3D_ITERATOR(src2_attrs, src2_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- float acc = 0.0;
-
- int num_of_slices = int(gl_NumWorkGroups.z * gl_WorkGroupSize.z);
- int current_slice = int(gl_GlobalInvocationID.z);
-
- int left_slice = max(current_slice - int(RADIUS), int(0));
- int right_slice = min(current_slice + int(RADIUS), int(num_of_slices - 1));
-
- for(int i = left_slice; i <= right_slice; i++)
- {
- acc += LOAD(src2_ptr, TENSOR3D_OFFSET(src2_iter, 0, 0, i - current_slice));
- }
-
- float normalized = pow(float(KAPPA) + float(COEFF) * acc, float(BETA));
-
- float normalized_pixel = (LOAD_CURRENT_ITEM(src1_ptr, src1_iter)) / normalized;
-
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, normalized_pixel);
-}
-
-#elif defined(IN_MAP_1D)
-void main(void)
-{
- Tensor3DIterator src1_iter = CONVERT_TO_TENSOR3D_ITERATOR(src1_attrs, src1_shift);
- Tensor3DIterator src2_iter = CONVERT_TO_TENSOR3D_ITERATOR(src2_attrs, src2_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- float acc = 0.0;
-
- int num_of_items_x = int(gl_NumWorkGroups.x * gl_WorkGroupSize.x);
- int current_pos = int(gl_GlobalInvocationID.x);
-
- int left_pos = max(current_pos - int(RADIUS), int(0));
- int right_pos = min(current_pos + int(RADIUS), int(num_of_items_x - 1));
-
- for(int i = left_pos; i <= right_pos; i++)
- {
- acc += LOAD(src2_ptr, TENSOR3D_OFFSET(src2_iter, i - current_pos, 0, 0));
- }
-
- float normalized = pow(float(KAPPA) + float(COEFF) * acc, float(BETA));
-
- float normalized_pixel = (LOAD_CURRENT_ITEM(src1_ptr, src1_iter)) / normalized;
-
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, normalized_pixel);
-}
-#endif /*CROSS_MAP*/
diff --git a/src/core/GLES_COMPUTE/cs_shaders/normalize_planar_yuv_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/normalize_planar_yuv_layer.cs
deleted file mode 100644
index 6a46845d79..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/normalize_planar_yuv_layer.cs
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-precision mediump float;
-
-/** Apply normalize_planar_yuv layer.
- *
- * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16
- * @param[in] src_attrs The attributes of the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in] mean_attrs The attributes of the mean tensor
- * @param[in] sd_ptr Pointer to the standard deviation (sd) tensor. Supported data types: same as @p src_ptr
- * @param[in] sd_attrs The attributes of the sd tensor
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes dst_attrs;
- VectorAttributes mean_attrs;
- VectorAttributes sd_attrs;
-};
-
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-TENSOR_DECLARATION(3, meanBuffer, uvec2, mean_ptr, mean_shift, 3, readonly);
-TENSOR_DECLARATION(4, sdBuffer, uvec2, sd_ptr, sd_shift, 3, readonly);
-
-void main(void)
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
- VectorIterator mean_iter = CONVERT_TO_VECTOR_ITERATOR(mean_attrs, mean_shift);
- VectorIterator sd_iter = CONVERT_TO_VECTOR_ITERATOR(sd_attrs, sd_shift);
-
- vec4 unpacked_s[3];
- vec4 tmp;
- vec4 result;
-
- uint current_slice = gl_GlobalInvocationID.z;
- unpacked_s[0] = LOAD_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
- unpacked_s[1] = LOAD_UNPACK4_HALF(mean_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(mean_iter, current_slice * mean_attrs.stride_x));
- unpacked_s[2] = LOAD_UNPACK4_HALF(sd_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(sd_iter, current_slice * sd_attrs.stride_x));
-
- if((current_slice % uint(4)) == uint(0))
- {
- tmp = unpacked_s[0] - unpacked_s[1].x;
- result = tmp / unpacked_s[2].x;
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
- }
- else if((current_slice % uint(4)) == uint(1))
- {
- tmp = unpacked_s[0] - unpacked_s[1].y;
- result = tmp / unpacked_s[2].y;
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
- }
- else if((current_slice % uint(4)) == uint(2))
- {
- tmp = unpacked_s[0] - unpacked_s[1].z;
- result = tmp / unpacked_s[2].z;
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
- }
- else
- {
- tmp = unpacked_s[0] - unpacked_s[1].w;
- result = tmp / unpacked_s[2].w;
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, result);
- }
-}
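-
-// Note (interpretation sketch): mean and sd are stored as four packed FP16 channel
-// values per uvec2 element, so the branch on current_slice % 4 above picks the
-// component (.x/.y/.z/.w) that belongs to the channel being normalized.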
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs b/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs
deleted file mode 100644
index 936839f97e..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-/** Performs a pixelwise multiplication of float inputs with a float scale factor.
- *
- * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F32
- * @param[in] src1_attrs The attributes of the first source tensor
- * @param[in] src2_ptr Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
- * @param[in] src2_attrs The attributes of the second source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- * @note The scale factor must be passed at compile time using "#define SCALE x"
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src1_attrs;
- Tensor3DAttributes src2_attrs;
- Tensor3DAttributes dst_attrs;
-};
-TENSOR_DECLARATION(1, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
-TENSOR_DECLARATION(2, src2Buffer, float, src2_ptr, src2_shift, 2, readonly);
-TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-
-void main()
-{
- // Get pixels pointer
- Tensor3DIterator src1_iter = CONVERT_TO_TENSOR3D_ITERATOR(src1_attrs, src1_shift);
- Tensor3DIterator src2_iter = CONVERT_TO_TENSOR3D_ITERATOR(src2_attrs, src2_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- float result = LOAD_CURRENT_ITEM(src1_ptr, src1_iter) * LOAD_CURRENT_ITEM(src2_ptr, src2_iter) * float(SCALE);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, result);
-}
diff --git a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
deleted file mode 100644
index 6ca4265056..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs
+++ /dev/null
@@ -1,1052 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-#if defined(DATA_TYPE_FP16)
-precision mediump float;
-#endif // DATA_TYPE_FP16
-
-/** Performs a pooling function
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @note The pool size must be passed at compile time using "#define POOLING_LAYER_n". e.g. "#define POOLING_LAYER_2"
- * n must be one of these: 2, 3, 7, N
- * Pool size must be passed using POOL_SIZE if POOLING_LAYER_N is defined. e.g. POOL_SIZE=13;
- * @note In case of average pooling the following information must be passed at compile time:
- * POOL_AVG must be provided otherwise max pooling will be performed.
- * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
- * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions
- * PAD_X and PAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_attrs The attributes of the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination image
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes dst_attrs;
-};
-
-// Common definitions
-#if defined(POOL_AVG) || defined(POOL_L2)
-#define POOL_OP(res, a, b) ((res) = (a) + (b))
-#define POOL_OP_float(res, a, b) ((res) = (a) + (b))
-#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b))
-#else /* defined(POOL_AVG) || defined(POOL_L2) */
-#define POOL_OP(res, a, b) \
- (res) = (a); \
- if(isnan(a.x) || (a.x < b.x)) \
- { \
- res.x = b.x; \
- } \
- if(isnan(a.y) || (a.y < b.y)) \
- { \
- res.y = b.y; \
- } \
- if(isnan(a.z) || (a.z < b.z)) \
- { \
- res.z = b.z; \
- } \
- if(isnan(a.w) || (a.w < b.w)) \
- { \
- res.w = b.w; \
- }
-#define POOL_OP_float(res, a, b) \
- (res) = (a); \
- if(isnan(a) || (a < b)) \
- { \
- res = b; \
- }
-#define POOL_OP_vec2(res, a, b) \
- (res) = (a); \
- if(isnan(a.x) || (a.x < b.x)) \
- { \
- res.x = b.x; \
- } \
- if(isnan(a.y) || (a.y < b.y)) \
- { \
- res.y = b.y; \
- }
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
-#define POW2_OP(x, vec_size) ((x) * (x))
-#else /* defined(POOL_L2) */
-#define POW2_OP(x, vec_size) (x)
-#endif /* defined(POOL_L2) */
-
-#define DIV_OP(x, y) (x * (1.f / y))
-#define SQRT_OP(x) sqrt((x))
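-
-// Behaviour sketch for the max variants of POOL_OP above: when the running value
-// (a) is NaN it is replaced by the candidate (b), so a NaN picked up from padding
-// does not poison the maximum; under POOL_AVG/POOL_L2 the macros reduce to a sum.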
-
-#if defined(DATA_TYPE_FP32)
-
-float calculate_max(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int);
-float calculate_avg(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int);
-
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-
-#if defined(POOL_SIZE)
-// Set the initial value for the pooling operation according to the data type
-#if defined(POOL_AVG) || defined(POOL_L2)
-#define INITIAL_VALUE 0.0f
-#else /* defined(POOL_AVG) || defined(POOL_L2) */
-#define INITIAL_VALUE -3.402823466385289e+38
-#endif // POOL_AVG
-#endif //POOL_SIZE
-
-float calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
- int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
- int end_x = int(min(start_x + pool_size, upper_bound_w));
- int end_y = int(min(start_y + pool_size, upper_bound_h));
-
- float data_max;
- data_max = LOAD_CURRENT_ITEM(src_ptr, src_iter);
-
- for(int i = 0; (start_y + i) < end_y; ++i)
- {
- for(int j = 0; (start_x + j) < end_x; ++j)
- {
- float data = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0));
- POOL_OP_float(data_max, data_max, data);
- }
- }
-
- return data_max;
-}
-
-float calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
- int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
- int end_x = int(min(start_x + pool_size, upper_bound_w));
- int end_y = int(min(start_y + pool_size, upper_bound_h));
-
- float data_total = 0.0f;
- for(int i = 0; (start_x + i) < end_x; i++)
- {
- for(int j = 0; (start_y + j) < end_y; ++j)
- {
- float data = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, i, j, 0));
- if(isnan(data))
- {
- data = 0.0f;
- }
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data = POW2_OP(data, 1);
-#endif /* defined(POOL_L2) */
- data_total = data_total + data;
- }
- }
-
-#if defined(EXCLUDE_PADDING)
- start_x = max(0, start_x);
- start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
-
- return data_total / float((end_y - start_y) * (end_x - start_x));
-}
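-
-// Illustrative divisor example (hypothetical 3x3 average pool, STRIDE = 1, PAD = 1,
-// EXCLUDE_PADDING defined): at output (0, 0), start_x = start_y = -1 are clamped to 0
-// while end_x = end_y = 2, so the accumulated sum is divided by 2 * 2 = 4 valid
-// elements rather than 9.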
-
-#if defined(POOLING_LAYER_2) || defined(POOLING_LAYER_3) || defined(POOLING_LAYER_7)
-
-#if defined(POOLING_LAYER_2)
-#define POOL_SIZE 2
-#elif defined(POOLING_LAYER_3)
-#define POOL_SIZE 3
-#elif defined(POOLING_LAYER_7)
-#define POOL_SIZE 7
-#else // POOLING_LAYER_n
-#error Please define POOLING_LAYER_N instead.
-#endif // POOLING_LAYER_n
-
-void main(void)
-{
- // Get pixels pointer
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- //Load and calculate data
- float res;
-#if defined(POOL_AVG) || defined(POOL_L2)
- res = calculate_avg(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#else /*POOL_AVG*/
- res = calculate_max(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /*POOL_AVG*/
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, res);
-}
-
-#elif defined(POOLING_LAYER_3_OPTIMIZED)
-
-#define POOLING3x3_STRIDE1(res, input_ptr, input_iter) \
- vec4 data00 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
- vec2 data01 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
- vec4 data10 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
- vec2 data11 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
- vec4 data20 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
- vec2 data21 = VLOAD2(vec2, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
- data00 = POW2_OP(data00, 4); \
- data01 = POW2_OP(data01, 2); \
- data10 = POW2_OP(data10, 4); \
- data11 = POW2_OP(data11, 2); \
- data20 = POW2_OP(data20, 4); \
- data21 = POW2_OP(data21, 2); \
- \
- vec4 values000; \
- vec4 values001; \
- vec4 values010; \
- vec4 values100; \
- vec4 values101; \
- vec4 values11; \
- vec4 values200; \
- vec4 values201; \
- vec4 values21; \
- values000.xyzw = data00.xyzy; \
- values001.xyzw = data00.zwzw; \
- values010.x = data01.x; \
- values010.y = data00.w; \
- values010.zw = data01.xy; \
- values100.xyzw = data10.xyzy; \
- values101.xyzw = data10.zwzw; \
- values11.x = data11.x; \
- values11.y = data10.w; \
- values11.zw = data11.xy; \
- values200.xyzw = data20.xyzy; \
- values201.xyzw = data20.zwzw; \
- values21.x = data21.x; \
- values21.y = data20.w; \
- values21.zw = data21.xy; \
- POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
- POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
- POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
- POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
-
-#define POOLING3x3_STRIDE2(res, input_ptr, input_iter) \
- vec4 data000 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
- vec4 data001 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
- float data010 = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(8)); \
- vec4 data100 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
- vec4 data101 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
- float data11 = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(8)); \
- vec4 data200 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
- vec4 data201 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
- float data21 = LOAD(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(8)); \
- data000 = POW2_OP(data000, 4); \
- data001 = POW2_OP(data001, 4); \
- data010 = POW2_OP(data010, 1); \
- data100 = POW2_OP(data100, 4); \
- data101 = POW2_OP(data101, 4); \
- data11 = POW2_OP(data11, 1); \
- data200 = POW2_OP(data200, 4); \
- data201 = POW2_OP(data201, 4); \
- data21 = POW2_OP(data21, 1); \
- \
- vec4 values000; \
- vec4 values001; \
- vec4 values010; \
- vec4 values100; \
- vec4 values101; \
- vec4 values11; \
- vec4 values200; \
- vec4 values201; \
- vec4 values21; \
- values000.xyzw = data000.xyzz; \
- values001.xyzw = vec4(data000.w, data001.xxy); \
- values010.xyzw = vec4(data001.zzw, data010); \
- values100.xyzw = data100.xyzz; \
- values101.xyzw = vec4(data100.w, data101.xxy); \
- values11.xyzw = vec4(data101.zzw, data11); \
- values200.xyzw = data200.xyzz; \
- values201.xyzw = vec4(data200.w, data201.xxy); \
- values21.xyzw = vec4(data201.zzw, data21); \
- POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
- POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
- POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
- POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
-
-#define POOLING3x3_STRIDE3(res, input_ptr, input_iter) \
- vec4 data000 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
- vec4 data001 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
- vec4 data010 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(8)); \
- vec4 data100 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
- vec4 data101 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
- vec4 data11 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(8)); \
- vec4 data200 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
- vec4 data201 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
- vec4 data21 = VLOAD4(vec4, input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(8)); \
- data000 = POW2_OP(data000, 4); \
- data001 = POW2_OP(data001, 4); \
- data010 = POW2_OP(data010, 4); \
- data100 = POW2_OP(data100, 4); \
- data101 = POW2_OP(data101, 4); \
- data11 = POW2_OP(data11, 4); \
- data200 = POW2_OP(data200, 4); \
- data201 = POW2_OP(data201, 4); \
- data21 = POW2_OP(data21, 4); \
- \
- POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \
- POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \
- POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \
- POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \
- POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \
- POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \
- POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
- POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
-
-void main(void)
-{
- // Get pixels pointer
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- vec4 res;
- // Perform pooling 3x3 for 4 output elements
-#if STRIDE_X == 1
- POOLING3x3_STRIDE1(res, src_ptr, src_iter);
-#elif STRIDE_X == 2
- POOLING3x3_STRIDE2(res, src_ptr, src_iter);
-#elif STRIDE_X == 3
- POOLING3x3_STRIDE3(res, src_ptr, src_iter);
-#endif /*STRIDE_X == 1*/
-
- // Divide by pool region in case of average pooling
-#if defined(POOL_AVG) || defined(POOL_L2)
- ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X));
- int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
- ivec4 end_x = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH)));
- int end_y = min((start_y + 3), MAX_HEIGHT);
-#if defined(EXCLUDE_PADDING)
- start_x = max(ivec4(0), start_x);
- start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
- res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x)));
-#endif /*POOL_AVG*/
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, res);
-}
-
-#elif defined(POOLING_LAYER_N)
-
-void main(void)
-{
- // Get pixels pointer
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- vec4 vdata0 = vec4(INITIAL_VALUE);
- vec4 vdata1 = vec4(INITIAL_VALUE);
- float sdata = float(INITIAL_VALUE);
-
- for(int y = 0; y < int(POOL_SIZE); y++)
- {
- int x = 0;
- for(; x <= (int(POOL_SIZE) - 8); x += 8)
- {
- vec4 data2 = VLOAD4(vec4, src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0));
- vec4 data3 = VLOAD4(vec4, src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0) + uint(4));
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data2 *= data2;
- data3 *= data3;
-#endif /* defined(POOL_L2) */
-
- POOL_OP(vdata0, vdata0, data2);
- POOL_OP(vdata1, vdata1, data3);
- }
-
- // Leftover
- for(; x < int(POOL_SIZE); ++x)
- {
- float data4 = LOAD(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data4 *= data4;
-#endif /* defined(POOL_L2) */
- POOL_OP_float(sdata, sdata, data4);
- }
- }
-
- //Reduce result
- vec4 reduce4;
- POOL_OP(reduce4, vdata0.xyzw, vdata1.xyzw);
- vec2 reduce2;
- POOL_OP_vec2(reduce2, reduce4.xy, reduce4.zw);
- float res;
- POOL_OP_float(res, reduce2.x, reduce2.y);
- POOL_OP_float(res, res, sdata);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- {
- // Divide by pool region in case of average pooling
- int start_x = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X;
- int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
- int end_x = int(min(start_x + POOL_SIZE, MAX_WIDTH));
- int end_y = int(min(start_y + POOL_SIZE, MAX_HEIGHT));
-#if defined(EXCLUDE_PADDING)
- start_x = max(0, start_x);
- start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
- float res1 = float((end_y - start_y) * (end_x - start_x));
- res = DIV_OP(res, res1);
- }
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, res);
-}
-#endif // POOLING_LAYER_N
-
-#elif defined(DATA_TYPE_FP16)
-
-vec2 calculate_max(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int);
-vec2 calculate_avg(const int, Tensor3DIterator, const int, const int, const int, const int, const int, const int);
-
-TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
-
-#if defined(POOL_SIZE)
-// Set the initial value for the pooling operation according to the data type
-#if defined(POOL_AVG) || defined(POOL_L2)
-#define INITIAL_VALUE 0.0f
-#else /* defined(POOL_AVG) || defined(POOL_L2) */
-#define INITIAL_VALUE -65504.0f
-#endif //POOL_AVG
-#endif //POOL_SIZE
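-
-// Note: in this FP16 path every "uint" buffer element packs two half values, so
-// calculate_max/calculate_avg below each return a vec2 covering two adjacent
-// output elements per invocation (stored later with STORE_PACK2_CURRENT_ITEM_HALF).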
-
-vec2 calculate_max(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x;
- int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
- int end_x1 = int(min(start_x1 + pool_size, upper_bound_w));
- int end_y1 = int(min(start_y1 + pool_size, upper_bound_h));
-
- int start_x2 = start_x1 + stride_x;
- int start_y2 = start_y1;
- int end_x2 = int(min(start_x2 + pool_size, upper_bound_w));
- int end_y2 = int(min(start_y2 + pool_size, upper_bound_h));
-
- //Initialize maximum
- vec2 data_max = vec2(0);
-
- //Load and Set initial maximum1
- vec2 data_init1 = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
- data_max.x = data_init1.x;
-
- //Load and Set initial maximum2
- if(end_x1 < upper_bound_w)
- {
- if((stride_x % 2) == 0)
- {
- vec2 data_init2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, stride_x, 0, 0));
- data_max.y = data_init2.x;
- }
- else
- {
- vec2 data_init2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, stride_x - 1, 0, 0));
- data_max.y = data_init2.y;
- }
- }
-
- for(int i = 0; (start_y1 + i) < end_y1; i++)
- for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
- {
- //Calculate maximum1
- if((start_x1 + j + 1) < end_x1)
- {
- vec2 data1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0));
- float data_mr1;
- POOL_OP_float(data_mr1, data1.x, data1.y);
- POOL_OP_float(data_max.x, data_max.x, data_mr1);
- }
- else
- {
- vec2 data1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0));
- POOL_OP_float(data_max.x, data_max.x, data1.x);
- }
-
- //Calculate maximum2
- if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w)
- {
- if((stride_x % 2) == 0)
- {
- vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x), i, 0));
-
- if((start_x2 + j + 1) < end_x2)
- {
- float data_mr2;
- POOL_OP_float(data_mr2, data2.x, data2.y);
- POOL_OP_float(data_max.y, data_max.y, data_mr2);
- }
- else
- {
- POOL_OP_float(data_max.y, data_max.y, data2.x);
- }
- }
- else
- {
- vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x - 1), i, 0));
- vec2 data3 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x + 1), i, 0));
- if((start_x2 + j + 1) < end_x2)
- {
- float data_mr2;
- POOL_OP_float(data_mr2, data3.x, data2.y);
- POOL_OP_float(data_max.y, data_max.y, data_mr2);
- }
- else
- {
- POOL_OP_float(data_max.y, data_max.y, data2.y);
- }
- }
- }
- }
- return data_max;
-}
-
-vec2 calculate_avg(const int pool_size, Tensor3DIterator src_iter, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x1 = (2 * int(gl_GlobalInvocationID.x)) * stride_x - pad_x;
- int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y;
- int end_x1 = int(min(start_x1 + pool_size, upper_bound_w));
- int end_y1 = int(min(start_y1 + pool_size, upper_bound_h));
-
- int start_x2 = start_x1 + stride_x;
- int start_y2 = start_y1;
- int end_x2 = int(min(start_x2 + pool_size, upper_bound_w));
- int end_y2 = int(min(start_y2 + pool_size, upper_bound_h));
-
- //Initialize sum
- float data_total1 = float(0);
- float data_total2 = float(0);
- for(int i = 0; (start_y1 + i) < end_y1; i++)
- for(int j = 0; (start_x1 + j) < end_x1; j = j + 2)
- {
- vec2 data1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, j, i, 0));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data1 = POW2_OP(data1, 2);
-#endif /* defined(POOL_L2) */
- //Calculate sum1
- if((start_x1 + j + 1) < end_x1)
- {
- data_total1 = data_total1 + data1.x + data1.y;
- }
- else
- {
- data_total1 = data_total1 + data1.x;
- }
-
- //Calculate sum2
- if((start_x2 + j) < end_x2 && end_x1 <= upper_bound_w)
- {
- if((stride_x % 2) == 0)
- {
- vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x), i, 0));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data2 = POW2_OP(data2, 2);
-#endif /* defined(POOL_L2) */
- if((start_x2 + j + 1) < end_x2)
- {
- data_total2 = data_total2 + data2.x + data2.y;
- }
- else
- {
- data_total2 = data_total2 + data2.x;
- }
- }
- else
- {
- vec2 data2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x - 1), i, 0));
- vec2 data3 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (j + stride_x + 1), i, 0));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data2 = POW2_OP(data2, 2);
- data3 = POW2_OP(data3, 2);
-#endif /* defined(POOL_L2) */
- if((start_x2 + j + 1) < end_x2)
- {
- data_total2 = data_total2 + data3.x + data2.y;
- }
- else
- {
- data_total2 = data_total2 + data2.y;
- }
- }
- }
- }
-#if defined(EXCLUDE_PADDING)
- start_x1 = max(0, start_x1);
- start_y1 = max(0, start_y1);
- start_x2 = max(0, start_x2);
- start_y2 = max(0, start_y2);
-#endif /* defined(EXCLUDE_PADDING) */
-
- //Calculate average
- vec2 data_avg;
- data_avg.x = data_total1 / float((end_y1 - start_y1) * (end_x1 - start_x1));
- data_avg.y = data_total2 / float((end_y2 - start_y2) * (end_x2 - start_x2));
-
- return data_avg;
-}
-
-#if defined(POOLING_LAYER_2) || defined(POOLING_LAYER_3) || defined(POOLING_LAYER_7)
-
-#if defined(POOLING_LAYER_2)
-#define POOL_SIZE 2
-#elif defined(POOLING_LAYER_3)
-#define POOL_SIZE 3
-#elif defined(POOLING_LAYER_7)
-#define POOL_SIZE 7
-#else // POOLING_LAYER_n
-#error Please define POOLING_LAYER_N instead.
-#endif // POOLING_LAYER_n
-
-void main(void)
-{
- // Get pixels pointer
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- //Load and calculate data
- vec2 data;
-#if defined(POOL_AVG) || defined(POOL_L2)
- data = calculate_avg(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#else /*POOL_AVG*/
- data = calculate_max(POOL_SIZE, src_iter, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /*POOL_AVG*/
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- data = SQRT_OP(data);
-#endif /* defined(POOL_L2) */
-
- // Store result
- STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data);
-}
-
-#elif defined(POOLING_LAYER_3_OPTIMIZED)
-
-#define POOLING3x3_STRIDE1_fp16(res, input_ptr, input_iter) \
- vec4 data00 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
- vec2 data01 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \
- vec4 data10 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
- vec2 data11 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \
- vec4 data20 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
- vec2 data21 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \
- data00 = POW2_OP(data00, 4); \
- data01 = POW2_OP(data01, 2); \
- data10 = POW2_OP(data10, 4); \
- data11 = POW2_OP(data11, 2); \
- data20 = POW2_OP(data20, 4); \
- data21 = POW2_OP(data21, 2); \
- \
- vec4 values000; \
- vec4 values001; \
- vec4 values010; \
- vec4 values100; \
- vec4 values101; \
- vec4 values11; \
- vec4 values200; \
- vec4 values201; \
- vec4 values21; \
- values000.xyzw = data00.xyzy; \
- values001.xyzw = data00.zwzw; \
- values010.x = data01.x; \
- values010.y = data00.w; \
- values010.zw = data01.xy; \
- values100.xyzw = data10.xyzy; \
- values101.xyzw = data10.zwzw; \
- values11.x = data11.x; \
- values11.y = data10.w; \
- values11.zw = data11.xy; \
- values200.xyzw = data20.xyzy; \
- values201.xyzw = data20.zwzw; \
- values21.x = data21.x; \
- values21.y = data20.w; \
- values21.zw = data21.xy; \
- POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
- POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
- POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
- POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
-
-#define POOLING3x3_STRIDE2_fp16(res, input_ptr, input_iter) \
- vec4 data000; \
- vec4 data001; \
- float data010; \
- vec4 data100; \
- vec4 data101; \
- float data11; \
- vec4 data200; \
- vec4 data201; \
- float data21; \
- vec2 datamiddle0; \
- vec2 datamiddle1; \
- vec2 datamiddle2; \
- data000 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
- data001 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \
- datamiddle0 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
- data010 = datamiddle0.x; \
- data100 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
- data101 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \
- datamiddle1 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
- data11 = datamiddle1.x; \
- data200 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
- data201 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \
- datamiddle2 = LOAD_UNPACK2_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
- data21 = datamiddle2.x; \
- data000 = POW2_OP(data000, 4); \
- data001 = POW2_OP(data001, 4); \
- data010 = POW2_OP(data010, 1); \
- data100 = POW2_OP(data100, 4); \
- data101 = POW2_OP(data101, 4); \
- data11 = POW2_OP(data11, 1); \
- data200 = POW2_OP(data200, 4); \
- data201 = POW2_OP(data201, 4); \
- data21 = POW2_OP(data21, 1); \
- \
- vec4 values000; \
- vec4 values001; \
- vec4 values010; \
- vec4 values100; \
- vec4 values101; \
- vec4 values11; \
- vec4 values200; \
- vec4 values201; \
- vec4 values21; \
- values000.xyzw = data000.xyzz; \
- values001.xyzw = vec4(data000.w, data001.xxy); \
- values010.xyzw = vec4(data001.zzw, data010); \
- values100.xyzw = data100.xyzz; \
- values101.xyzw = vec4(data100.w, data101.xxy); \
- values11.xyzw = vec4(data101.zzw, data11); \
- values200.xyzw = data200.xyzz; \
- values201.xyzw = vec4(data200.w, data201.xxy); \
- values21.xyzw = vec4(data201.zzw, data21); \
- POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \
- POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \
- POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \
- POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \
- POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \
- POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw))
-
-#define POOLING3x3_STRIDE3_fp16(res, input_ptr, input_iter) \
- vec4 data000 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0)); \
- vec4 data001 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(2)); \
- vec4 data010 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 0, 0) + uint(4)); \
- vec4 data100 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0)); \
- vec4 data101 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(2)); \
- vec4 data11 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 1, 0) + uint(4)); \
- vec4 data200 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0)); \
- vec4 data201 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(2)); \
- vec4 data21 = VLOAD2_UNPACK4_HALF(input_ptr, TENSOR3D_OFFSET(input_iter, 0, 2, 0) + uint(4)); \
- data000 = POW2_OP(data000, 4); \
- data001 = POW2_OP(data001, 4); \
- data010 = POW2_OP(data010, 4); \
- data100 = POW2_OP(data100, 4); \
- data101 = POW2_OP(data101, 4); \
- data11 = POW2_OP(data11, 4); \
- data200 = POW2_OP(data200, 4); \
- data201 = POW2_OP(data201, 4); \
- data21 = POW2_OP(data21, 4); \
- \
- POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \
- POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \
- POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \
- POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \
- POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \
- POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \
- POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \
- POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y, data010.xw))
-
-void main(void)
-{
- // Get pixels pointer
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- vec4 res;
- // Perform pooling 3x3 for 4 output elements
-#if STRIDE_X == 1
- POOLING3x3_STRIDE1_fp16(res, src_ptr, src_iter);
-#elif STRIDE_X == 2
- POOLING3x3_STRIDE2_fp16(res, src_ptr, src_iter);
-#elif STRIDE_X == 3
- POOLING3x3_STRIDE3_fp16(res, src_ptr, src_iter);
-#endif /*STRIDE_X == 1*/
-
- // Divide by pool region in case of average pooling
-#if defined(POOL_AVG) || defined(POOL_L2)
- ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X));
- int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
- ivec4 end_x = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH)));
- int end_y = min((start_y + 3), MAX_HEIGHT);
-#if defined(EXCLUDE_PADDING)
- start_x = max(ivec4(0), start_x);
- start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
- res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x)));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- VSTORE2_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, res);
-}
-
-#elif defined(POOLING_LAYER_N)
-
-void main(void)
-{
- // Get pixels pointer
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- vec4 vdata00 = vec4(INITIAL_VALUE);
- vec4 vdata01 = vec4(INITIAL_VALUE);
- vec4 vdata10 = vec4(INITIAL_VALUE);
- vec4 vdata11 = vec4(INITIAL_VALUE);
- vec2 sdata = vec2(INITIAL_VALUE);
-
- for(int y = 0; y < int(POOL_SIZE); y++)
- {
- int x = 0;
- for(; x <= (int(POOL_SIZE) - 8); x += 8)
- {
- vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0));
- vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0) + uint(2));
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data2 *= data2;
- data3 *= data3;
-#endif /* defined(POOL_L2) */
-
- POOL_OP(vdata00, vdata00, data2);
- POOL_OP(vdata10, vdata10, data3);
- }
-
- // Leftover
- for(; x < int(POOL_SIZE); x = x + 2)
- {
- vec2 data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x, y, 0));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data4middle *= data4middle;
-#endif /* defined(POOL_L2) */
- if((x + 1) >= int(POOL_SIZE))
- {
- POOL_OP_float(sdata.x, sdata.x, data4middle.x);
- }
- else
- {
- float data4;
- POOL_OP_float(data4, data4middle.x, data4middle.y);
- POOL_OP_float(sdata.x, sdata.x, data4);
- }
- }
- }
-
- for(int y = 0; y < int(POOL_SIZE); y++)
- {
- if((STRIDE_X % 2) == 0)
- {
- int x1 = STRIDE_X;
- for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8)
- {
- vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0));
- vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0) + uint(2));
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data2 *= data2;
- data3 *= data3;
-#endif /* defined(POOL_L2) */
-
- POOL_OP(vdata01, vdata01, data2);
- POOL_OP(vdata11, vdata11, data3);
- }
-
- // Leftover
- for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2)
- {
- vec2 data4middle;
- data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data4middle *= data4middle;
-#endif /* defined(POOL_L2) */
- if((x1 + 1) >= int(POOL_SIZE + STRIDE_X))
- {
- POOL_OP_float(sdata.y, sdata.y, data4middle.x);
- }
- else
- {
- float data4;
- POOL_OP_float(data4, data4middle.x, data4middle.y);
- POOL_OP_float(sdata.y, sdata.y, data4);
- }
- }
- }
- else
- {
- vec2 dataorigin2;
- dataorigin2 = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, (STRIDE_X - 1), y, 0));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- dataorigin2.y *= dataorigin2.y;
-#endif /* defined(POOL_L2) */
- POOL_OP_float(sdata.y, sdata.y, dataorigin2.y);
-
- int x1 = STRIDE_X + 1;
- for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8)
- {
- vec4 data2 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0));
- vec4 data3 = VLOAD2_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0) + uint(2));
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data2 *= data2;
- data3 *= data3;
-#endif /* defined(POOL_L2) */
-
- POOL_OP(vdata01, vdata01, data2);
- POOL_OP(vdata11, vdata11, data3);
- }
-
- // Leftover
- for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2)
- {
- vec2 data4middle = LOAD_UNPACK2_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, x1, y, 0));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data4middle *= data4middle;
-#endif /* defined(POOL_L2) */
- if((x1 + 1) >= int(POOL_SIZE + STRIDE_X))
- {
- POOL_OP_float(sdata.y, sdata.y, data4middle.x);
- }
- else
- {
- float data4;
- POOL_OP_float(data4, data4middle.x, data4middle.y);
- POOL_OP_float(sdata.y, sdata.y, data4);
- }
- }
- }
- }
-
-    // Reduce result
- vec4 reduce40;
- POOL_OP(reduce40, vdata00.xyzw, vdata10.xyzw);
- vec2 reduce20;
- POOL_OP_vec2(reduce20, reduce40.xy, reduce40.zw);
- vec4 reduce41;
- POOL_OP(reduce41, vdata01.xyzw, vdata11.xyzw);
- vec2 reduce21;
- POOL_OP_vec2(reduce21, reduce41.xy, reduce41.zw);
- vec2 data;
- POOL_OP_float(data.x, reduce20.x, reduce20.y);
- POOL_OP_float(data.x, data.x, sdata.x);
- POOL_OP_float(data.y, reduce21.x, reduce21.y);
- POOL_OP_float(data.y, data.y, sdata.y);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- {
- // Divide by pool region in case of average pooling
- int start_x1 = (2 * int(gl_GlobalInvocationID.x)) * STRIDE_X - PAD_X;
- int start_y1 = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y;
- int end_x1 = int(min(start_x1 + POOL_SIZE, MAX_WIDTH));
- int end_y1 = int(min(start_y1 + POOL_SIZE, MAX_HEIGHT));
- int start_x2 = start_x1 + STRIDE_X;
- int start_y2 = start_y1;
- int end_x2 = int(min(start_x2 + POOL_SIZE, MAX_WIDTH));
- int end_y2 = int(min(start_y2 + POOL_SIZE, MAX_HEIGHT));
-#if defined(EXCLUDE_PADDING)
- start_x1 = max(0, start_x1);
- start_y1 = max(0, start_y1);
- start_x2 = max(0, start_x2);
- start_y2 = max(0, start_y2);
-#endif /* defined(EXCLUDE_PADDING) */
- vec2 res1;
- res1.x = float((end_y1 - start_y1) * (end_x1 - start_x1));
- res1.y = float((end_y2 - start_y2) * (end_x2 - start_x2));
- data.x = DIV_OP(data.x, res1.x);
- data.y = DIV_OP(data.y, res1.y);
- }
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- data = SQRT_OP(data);
-#endif /* defined(POOL_L2) */
-
- // Store result
- STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data);
-}
-#endif // POOLING_LAYER_N
-
-#else // DATA_TYPE_FP32
-#error Data type not supported
-#endif // DATA_TYPE_FP32
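
For reference, the POOL_AVG/POOL_L2 branch above divides by the clamped window area rather than by POOL_SIZE * POOL_SIZE. A minimal standalone C++ sketch of that divisor computation (function and parameter names are illustrative, not library API):

    #include <algorithm>
    #include <cstdio>

    // Area of a pooling window once it is clamped to the tensor borders,
    // matching the start/end logic of the shader above.
    int pool_region_size(int out_x, int out_y, int stride_x, int stride_y,
                         int pad_x, int pad_y, int pool_size,
                         int max_width, int max_height, bool exclude_padding)
    {
        int start_x = out_x * stride_x - pad_x;
        int start_y = out_y * stride_y - pad_y;
        const int end_x = std::min(start_x + pool_size, max_width);
        const int end_y = std::min(start_y + pool_size, max_height);
        if(exclude_padding)
        {
            start_x = std::max(0, start_x);
            start_y = std::max(0, start_y);
        }
        return (end_x - start_x) * (end_y - start_y);
    }

    int main()
    {
        // 3x3 average pooling, stride 1, pad 1, 5x5 input: the corner window
        // covers 4 valid elements with EXCLUDE_PADDING, 9 otherwise.
        std::printf("%d\n", pool_region_size(0, 0, 1, 1, 1, 1, 3, 5, 5, true));  // 4
        std::printf("%d\n", pool_region_size(0, 0, 1, 1, 1, 1, 3, 5, 5, false)); // 9
        return 0;
    }
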
diff --git a/src/core/GLES_COMPUTE/cs_shaders/scale.cs b/src/core/GLES_COMPUTE/cs_shaders/scale.cs
deleted file mode 100644
index 63be478053..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/scale.cs
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2016-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-// We DO have to use highp float here to calculate the coordinates of the source tensor for DATA_TYPE_FP16. float is highp by default, but we state it explicitly for clarity; mediump is only used for the src/dst tensors in the shader body.
-precision highp float;
-
-/** Performs an affine transformation on a tensor, interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel FP16.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: FP16.
- * @param[in] src_attrs The attributes of the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: FP16. (Must be the same as the input)
- * @param[in] dst_attrs The attributes of the destination tensor
- * @param[in] input_width Input tensor width
- * @param[in] input_height Input tensor height
- * @param[in] scale The scale factor along x/y dimension
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes dst_attrs;
- float input_width;
- float input_height;
- vec2 scale;
-};
-
-#if defined(DATA_TYPE_FP16)
-#if defined(SCALE_NEAREST_GENERIC)
-TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-
-vec4[2] transform_nearest(vec2 coord, vec2 scale)
-{
- vec4 in_x_coords = vec4(coord.x, 1.f + coord.x, 2.f + coord.x, 3.f + coord.x);
-
- vec4[2] t;
-#if defined(SAMPLING_POLICY_CENTER) /* SAMPLING_POLICY_CENTER */
- t[0] = (in_x_coords + (vec4(0.5f))) * scale.x;
- t[1] = vec4((coord.y + 0.5f) * scale.y);
-#elif defined(SAMPLING_POLICY_TOP_LEFT) /* SAMPLING_POLICY_TOP_LEFT */
- t[0] = in_x_coords * scale.x;
- t[1] = vec4(coord.y) * scale.y;
-#else /* Unsupported sampling policy */
-#error Unsupported sampling policy
-#endif /* SAMPLING_POLICY */
-
- return t;
-}
-
-vec4[2] clamp_to_border_with_size(vec4[2] coords, float width, float height, float border_size)
-{
- vec4[2] c;
- c[0] = clamp(coords[0], 0.0f - border_size, width - 1.f + border_size);
- c[1] = clamp(coords[1], 0.0f - border_size, height - 1.f + border_size);
-
- return c;
-}
-
-void main()
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- vec4[2] tc = clamp_to_border_with_size(transform_nearest(vec2(gl_GlobalInvocationID.x << uint(2), gl_GlobalInvocationID.y), scale), input_width, input_height, float(BORDER_SIZE));
-
- mediump vec2 s = vec2(0.0f);
- mediump vec4 d = vec4(0.0f);
-
- for(int i = 0; i < 4; i++)
- {
- uint offset_in_bytes = tensor3D_offset_in_bytes(src_iter, int(tc[0][i]), int(tc[1][i]), int(gl_GlobalInvocationID.z));
-
- s = LOAD_UNPACK2_HALF(src_ptr, uint(offset_in_bytes >> src_shift));
-
- if(offset_in_bytes % uint(4) == uint(0))
- {
- d[i] = s.x;
- }
- else
- {
- d[i] = s.y;
- }
- }
-
- STORE_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, d);
-}
-#elif defined(SCALE_NEAREST_8X) /* SCALE_NEAREST_GENERIC */
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-
-void main()
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(src_attrs, src_shift);
- Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
-
- uvec2 tc = uvec2(gl_GlobalInvocationID.x << uint(2), gl_GlobalInvocationID.y >> uint(1));
-
- mediump vec4 s = vec4(0.0f);
- mediump vec4[2] d;
-
- s = LOAD_UNPACK4_HALF(src_ptr, TENSOR3D_OFFSET(src_iter, int(tc[0]), int(tc[1]), int(gl_GlobalInvocationID.z)));
-
- d[0] = vec4(s.x, s.x, s.y, s.y);
- d[1] = vec4(s.z, s.z, s.w, s.w);
-
- STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, d);
-}
-#endif /* SCALE_NEAREST_GENERIC */
-
-#else /* DATA_TYPE_FP16 */
-#error Data type not supported
-#endif /* DATA_TYPE_FP16 */
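
The nearest-neighbour paths above reduce to a per-pixel coordinate mapping plus a border clamp. A minimal C++ sketch of that mapping under the two sampling policies (names are illustrative, not library API):

    #include <algorithm>
    #include <cstdio>

    // Maps one output x coordinate back to a clamped input coordinate,
    // mirroring transform_nearest() + clamp_to_border_with_size() above.
    float scale_nearest_coord(int out_x, float scale, float in_width,
                              float border_size, bool policy_center)
    {
        const float in_x = policy_center ? (float(out_x) + 0.5f) * scale
                                         : float(out_x) * scale;
        return std::min(std::max(in_x, 0.0f - border_size),
                        in_width - 1.0f + border_size);
    }

    int main()
    {
        // Upscaling 4 -> 8 (scale = 0.5) with the CENTER policy:
        // output pixel 5 reads input pixel 2.
        std::printf("%d\n", int(scale_nearest_coord(5, 0.5f, 4.0f, 0.0f, true)));
        return 0;
    }
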
diff --git a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
deleted file mode 100644
index 0293943da1..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
+++ /dev/null
@@ -1,363 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-#if defined(DATA_TYPE_FP16)
-precision mediump float;
-#endif // DATA_TYPE_FP16
-
-// Common definitions
-#define MAX_OP(x, y) max((x), (y))
-#define ADD_OP(x, y) ((x) + (y))
-#define SUB_OP(x, y) ((x) - (y))
-#define DIV_OP(x, y) ((x) / (y))
-#define EXP_OP(x) exp((x))
-
-const float float_min = -1.0 / 0.0;
-const vec4 vec4_min = vec4(float_min);
-
-#ifdef SOFTMAX_LAYER_MAX
-
-/** Identifies the maximum value across the 1st dimension.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @note In case the input is not a multiple of 8, NON_MULTIPLE_OF_8 must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- * @param[in] width Input image width
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes dst_attrs;
- uint width;
-};
-
-#if defined(DATA_TYPE_FP32)
-
-TENSOR_DECLARATION(1, srcBuffer, vec4[2], src_ptr, src_shift, 5, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-
-void main(void)
-{
- ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
-
- // Initialize local maximum
- vec4 max_val = vec4_min;
-
- // Calculate max of row
- uint width3 = width >> 3;
- for(int i = 0; i < int(width3); i++)
- {
- vec4 data[2] = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
- max_val = MAX_OP(data[0], max_val);
- max_val = MAX_OP(data[1], max_val);
- }
-
-#ifdef NON_MULTIPLE_OF_8
- // Handle non multiple of 8
- vec4 data[2] = LOAD(src_ptr, IMAGE_OFFSET(src_iter, width3 << 3, 0));
- int idx = 0;
- if(width >> 2 != width3 << 1)
- {
- max_val = MAX_OP(data[0], max_val);
- idx = 1;
- }
- for(int i = 0; i < int(width) % 4; i++)
- {
- max_val.x = MAX_OP(data[idx][i], max_val.x);
- }
-#endif /* NON_MULTIPLE_OF_8 */
-
- // Perform max reduction
- max_val.xy = MAX_OP(max_val.xy, max_val.zw);
- max_val.x = MAX_OP(max_val.x, max_val.y);
-
- // Store result
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, max_val.x);
-}
-#elif defined(DATA_TYPE_FP16)
-
-TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
-
-void main(void)
-{
- ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
-
- // Initialize local maximum
- vec4 max_val = vec4_min;
-
- // Calculate max of row
- uint width3 = width >> 3;
- for(int i = 0; i < int(width3); i++)
- {
- vec4 data[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
- max_val = MAX_OP(data[0], max_val);
- max_val = MAX_OP(data[1], max_val);
- }
-
-#ifdef NON_MULTIPLE_OF_8
- // Handle non multiple of 8
- vec4 data[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, width3 << 3, 0));
- int idx = 0;
- if(width >> 2 != width3 << 1)
- {
- max_val = MAX_OP(data[0], max_val);
- idx = 1;
- }
- for(int i = 0; i < int(width) % 4; i++)
- {
- max_val.x = MAX_OP(data[idx][i], max_val.x);
- }
-#endif /* NON_MULTIPLE_OF_8 */
-
- // Perform max reduction
- max_val.xy = MAX_OP(max_val.xy, max_val.zw);
- max_val.x = MAX_OP(max_val.x, max_val.y);
-
- STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, max_val.xy);
-}
-#else // DATA_TYPE_FP32
-#error Data type not supported
-#endif // DATA_TYPE_FP32
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
-
-/** Shifts the values of the input tensor by the max calculated in the softmax_layer_max kernel,
- * then takes the exponent of each element and sums all elements across each row.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @note In case the input is not a multiple of 8, NON_MULTIPLE_OF_8 must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] max_attrs The attributes of the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_attrs The attributes of the sum values tensor
- * @param[in] width Input image width
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes max_attrs;
- Tensor3DAttributes dst_attrs;
- Tensor3DAttributes sum_attrs;
- uint width;
-};
-#if defined(DATA_TYPE_FP32)
-
-TENSOR_DECLARATION(1, srcBuffer, vec4[2], src_ptr, src_shift, 5, readonly);
-TENSOR_DECLARATION(2, maxBuffer, float, max_ptr, max_shift, 2, readonly);
-TENSOR_DECLARATION(3, dstBuffer, vec4[2], dst_ptr, dst_shift, 5, writeonly);
-TENSOR_DECLARATION(4, sumBuffer, float, sum_ptr, sum_shift, 2, writeonly);
-
-void main(void)
-{
- ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
- ImageIterator max_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(max_attrs, max_shift);
- ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(sum_attrs, sum_shift);
-
- // Load max value of 1D logits vector (row)
- vec4 max_val = vec4(LOAD_CURRENT_ITEM(max_ptr, max_iter));
-
- // Set sum vector
- vec4 sum1D = vec4(0);
-
- // Shift values, exp and sum
- uint width3 = width >> 3;
- for(int i = 0; i < int(width3); i++)
- {
- vec4 data[2];
- data = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
- data[0] = SUB_OP(data[0], max_val);
- data[1] = SUB_OP(data[1], max_val);
- data[0] = EXP_OP(data[0]);
- data[1] = EXP_OP(data[1]);
- STORE(dst_ptr, IMAGE_OFFSET(dst_iter, i << 3, 0), data);
- sum1D = ADD_OP(sum1D, data[0]);
- sum1D = ADD_OP(sum1D, data[1]);
- }
-
-#ifdef NON_MULTIPLE_OF_8
- // Handle non multiple of 8
- vec4 data[2] = LOAD(src_ptr, IMAGE_OFFSET(src_iter, width3 << 3, 0));
- int idx = 0;
- if(width >> 2 != width3 << 1)
- {
- data[0] = SUB_OP(data[0], max_val);
- data[0] = EXP_OP(data[0]);
- sum1D = ADD_OP(sum1D, data[0]);
- idx = 1;
- }
- for(int i = 0; i < int(width) % 4; i++)
- {
- data[idx][i] = SUB_OP(data[idx][i], max_val.x);
- data[idx][i] = EXP_OP(data[idx][i]);
- sum1D.x = ADD_OP(sum1D.x, data[idx][i]);
- }
- STORE(dst_ptr, IMAGE_OFFSET(dst_iter, width3 << 3, 0), data);
-#endif /* NON_MULTIPLE_OF_8 */
-
-    // Perform sum reduction
- sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
- sum1D.x = ADD_OP(sum1D.x, sum1D.y);
-
- // Calculate and store result
- STORE_CURRENT_ITEM(sum_ptr, sum_iter, sum1D.x);
-}
-#elif defined(DATA_TYPE_FP16)
-
-TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
-TENSOR_DECLARATION(2, maxBuffer, uint, max_ptr, max_shift, 2, readonly);
-TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-TENSOR_DECLARATION(4, sumBuffer, uint, sum_ptr, sum_shift, 2, writeonly);
-
-void main(void)
-{
- ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
- ImageIterator max_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(max_attrs, max_shift);
- ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(sum_attrs, sum_shift);
-
- // Load max value of 1D logits vector (row)
- vec2 datamaxinit = LOAD_UNPACK2_CURRENT_ITEM_HALF(max_ptr, max_iter);
- vec4 max_val = vec4(datamaxinit.x);
-
- // Set sum vector
- vec4 sum1D = vec4(0.f);
-
- // Shift values, exp and sum
- uint width3 = width >> 3;
- for(int i = 0; i < int(width3); i++)
- {
- vec4 data[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 3, 0));
- data[0] = SUB_OP(data[0], max_val);
- data[1] = SUB_OP(data[1], max_val);
- data[0] = EXP_OP(data[0]);
- data[1] = EXP_OP(data[1]);
- STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i << 3, 0), data);
- sum1D = ADD_OP(sum1D, data[0]);
- sum1D = ADD_OP(sum1D, data[1]);
- }
-
-#ifdef NON_MULTIPLE_OF_8
- // Handle non multiple of 8
- vec4 data[2] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, width3 << 3, 0));
- int idx = 0;
- if(width >> 2 != width3 << 1)
- {
- data[0] = SUB_OP(data[0], max_val);
- data[0] = EXP_OP(data[0]);
- sum1D = ADD_OP(sum1D, data[0]);
- idx = 1;
- }
- for(int i = 0; i < int(width) % 4; i++)
- {
- data[idx][i] = SUB_OP(data[idx][i], max_val.x);
- data[idx][i] = EXP_OP(data[idx][i]);
- sum1D.x = ADD_OP(sum1D.x, data[idx][i]);
- }
- STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, width3 << 3, 0), data);
-#endif /* NON_MULTIPLE_OF_8 */
-    // Perform sum reduction
- sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
- sum1D.x = ADD_OP(sum1D.x, sum1D.y);
-
- // Calculate and store result
- STORE_PACK2_CURRENT_ITEM_HALF(sum_ptr, sum_iter, sum1D.xy);
-}
-#else // DATA_TYPE_FP32
-#error Data type not supported
-#endif // DATA_TYPE_FP32
-#elif defined(SOFTMAX_LAYER_NORM)
-
-/** Divides all the values of the input tensor by the sum calculated by the softmax_layer_shift_exp_sum kernel.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_attrs The attributes of the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_attrs The attributes of the destination tensor
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- Tensor3DAttributes sum_attrs;
- Tensor3DAttributes dst_attrs;
-};
-#if defined(DATA_TYPE_FP32)
-TENSOR_DECLARATION(1, srcBuffer, vec4[2], src_ptr, src_shift, 5, readonly);
-TENSOR_DECLARATION(2, sumBuffer, float, sum_ptr, sum_shift, 2, readonly);
-TENSOR_DECLARATION(3, dstBuffer, vec4[2], dst_ptr, dst_shift, 5, writeonly);
-void main(void)
-{
- ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
- ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(sum_attrs, sum_shift);
-
-    // Load sum value of 1D logits vector (row)
- vec4 sum_val = vec4(LOAD(sum_ptr, IMAGE_OFFSET(sum_iter, 0, gl_GlobalInvocationID.y)));
-
- vec4 data[2] = LOAD_CURRENT_ITEM(src_ptr, src_iter);
- data[0] = DIV_OP(data[0], sum_val);
- data[1] = DIV_OP(data[1], sum_val);
- STORE_CURRENT_ITEM(dst_ptr, dst_iter, data);
-}
-#elif defined(DATA_TYPE_FP16)
-TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
-TENSOR_DECLARATION(2, sumBuffer, uint, sum_ptr, sum_shift, 2, readonly);
-TENSOR_DECLARATION(3, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-void main(void)
-{
- ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
- ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(sum_attrs, sum_shift);
-
-    // Load sum value of 1D logits vector (row)
- vec4 sum_val = vec4(LOAD_UNPACK2_HALF(sum_ptr, IMAGE_OFFSET(sum_iter, 0, gl_GlobalInvocationID.y)).x);
-
- vec4 data[2] = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
- data[0] = DIV_OP(data[0], sum_val);
- data[1] = DIV_OP(data[1], sum_val);
- STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, data);
-}
-#else // DATA_TYPE_FP32
-#error Data type not supported
-#endif // DATA_TYPE_FP32
-#endif // SOFTMAX_LAYER_MAX
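
The three kernels above implement the usual numerically stable softmax: a row max, a shifted exponential with a running sum, and a final division. A scalar C++ sketch of the same three passes (illustrative only, without the vectorisation and FP16 packing of the shaders):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    std::vector<float> softmax_row(const std::vector<float> &row)
    {
        // Pass 1: row maximum (SOFTMAX_LAYER_MAX).
        const float max_val = *std::max_element(row.begin(), row.end());

        // Pass 2: shift by the max for stability, exponentiate, accumulate
        // the row sum (SOFTMAX_LAYER_SHIFT_EXP_SUM).
        std::vector<float> out(row.size());
        float sum = 0.0f;
        for(std::size_t i = 0; i < row.size(); ++i)
        {
            out[i] = std::exp(row[i] - max_val);
            sum += out[i];
        }

        // Pass 3: normalise by the row sum (SOFTMAX_LAYER_NORM).
        for(float &v : out)
        {
            v /= sum;
        }
        return out;
    }

    int main()
    {
        for(float v : softmax_row({1.0f, 2.0f, 3.0f}))
        {
            std::printf("%f ", v); // ~0.090031 0.244728 0.665241
        }
        return 0;
    }
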
diff --git a/src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs b/src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs
deleted file mode 100644
index cd2dcdeb5b..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/tensor_shift.cs
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-#if defined(DATA_TYPE_FP16)
-precision mediump float;
-#endif // DATA_TYPE_FP16
-
-/** This kernel performs a shift to move "pad_x" columns to the right.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @note The width must be passed at compile time using "#define WIDTH n" e.g. "#define WIDTH 1"
- *
- * @param[in,out] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in] src_attrs The attributes of the source tensor
- * @param[in] pad_x The padding of the source tensor in x dimension
- */
-SHADER_PARAMS_DECLARATION
-{
- Tensor3DAttributes src_attrs;
- uint pad_x;
-};
-
-#if defined(DATA_TYPE_FP16)
-TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, restrict);
-
-void main()
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
- int n = int(pad_x) % 2;
-
- if(n == 1)
- {
- int i = 0;
- if((WIDTH % 2) == 1)
- {
- i = WIDTH + int(pad_x) - 2;
- }
- else
- {
- vec2 s0_end = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (WIDTH - 2))));
- vec2 s_end = vec2(s0_end.y, 0.f);
- STORE_PACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (WIDTH + int(pad_x) - 1))), s_end);
- i = WIDTH + int(pad_x) - 3;
- }
- for(; i >= (int(pad_x) + 1); i = i - 2)
- {
- vec2 s0 = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (i - int(pad_x) - 1))));
- vec2 s1 = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (i - int(pad_x) + 1))));
- vec2 s = vec2(s0.y, s1.x);
- STORE_PACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * i)), s);
- }
- for(int j = 0; j < (int(pad_x) - 1); j = j + 2)
- {
- vec2 s_origin = vec2(0.f);
- STORE_PACK2_CURRENT_ITEM_HALF(src_ptr, src_iter, s_origin);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, 4);
- }
- vec2 s0_origin = LOAD_UNPACK2_CURRENT_ITEM_HALF(src_ptr, src_iter);
- vec2 s_origin = vec2(0.f, s0_origin.x);
- STORE_PACK2_CURRENT_ITEM_HALF(src_ptr, src_iter, s_origin);
- }
- else
- {
- int i = 0;
- if((WIDTH % 2) == 0)
- {
- i = WIDTH + int(pad_x) - 2;
- }
- else
- {
- vec2 s0_end = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (WIDTH - 1))));
- vec2 s_end = vec2(s0_end.x, 0.f);
- STORE_PACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (WIDTH + int(pad_x) - 1))), s_end);
- i = WIDTH + int(pad_x) - 3;
- }
- for(; i >= (int(pad_x)); i = i - 2)
- {
- vec2 s = LOAD_UNPACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * (i - int(pad_x)))));
- STORE_PACK2_HALF(src_ptr, TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, (2 * i)), s);
- }
- for(int j = 0; j < int(pad_x); j = j + 2)
- {
- vec2 s = vec2(0.f);
- STORE_PACK2_CURRENT_ITEM_HALF(src_ptr, src_iter, s);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, 4);
- }
- }
-}
-#elif defined(DATA_TYPE_FP32)
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, restrict);
-
-void main()
-{
- Tensor3DIterator src_iter = CONVERT_TO_TENSOR3D_ITERATOR(src_attrs, src_shift);
-
- for(int i = (WIDTH + int(pad_x) - 1); i >= int(pad_x); i--)
- {
- float sorigin = LOAD(src_ptr, TENSOR_OFFSET_ADVANCE(src_iter, (i - int(pad_x))));
- STORE(src_ptr, TENSOR_OFFSET_ADVANCE(src_iter, i), sorigin);
- }
- for(int j = 0; j < int(pad_x); j++)
- {
- STORE_CURRENT_ITEM(src_ptr, src_iter, 0.f);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, 4);
- }
-}
-#else /* DATA_TYPE_FP16 */
-#error Data type not supported
-#endif /* DATA_TYPE_FP16 */
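
Stripped of the FP16 packing (two half values per 32-bit word), the kernel above is an in-place right shift with zero fill. A minimal C++ sketch of the FP32 path over a plain array (names are illustrative):

    #include <cstdio>

    void tensor_shift_row(float *row, int width, int pad_x)
    {
        // Walk right-to-left so no element is overwritten before it is read.
        for(int i = width + pad_x - 1; i >= pad_x; --i)
        {
            row[i] = row[i - pad_x];
        }
        // Zero-fill the vacated leading columns.
        for(int j = 0; j < pad_x; ++j)
        {
            row[j] = 0.0f;
        }
    }

    int main()
    {
        float row[6] = {1.f, 2.f, 3.f, 4.f, 0.f, 0.f};
        tensor_shift_row(row, 4, 2);
        for(float v : row)
        {
            std::printf("%g ", v); // 0 0 1 2 3 4
        }
        return 0;
    }
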
diff --git a/src/core/GLES_COMPUTE/cs_shaders/transpose.cs b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
deleted file mode 100755
index 72ade20c80..0000000000
--- a/src/core/GLES_COMPUTE/cs_shaders/transpose.cs
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * Copyright (c) 2017, 2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-
-#include "helpers_cs.h"
-
-#if defined(DATA_TYPE_FP16)
-precision mediump float;
-#endif // DATA_TYPE_FP16
-
-#define SWAP_ROW_func(u0, l0) \
- { \
- tmp_swap = u0; \
- u0 = l0; \
- l0 = tmp_swap; \
- }
-
-#define SWAP_4x4_func(u0, u1, u2, u3, l0, l1, l2, l3) \
- { \
- vec4 tmp_swap; \
- SWAP_ROW_func(u0, l0); \
- SWAP_ROW_func(u1, l1); \
- SWAP_ROW_func(u2, l2); \
- SWAP_ROW_func(u3, l3); \
- }
-
-#define TRANSPOSE_4x4_func(u0, u1, u2, u3) \
- { \
- mat4x4 matin, matout; \
- matin[0] = u0; \
- matin[1] = u1; \
- matin[2] = u2; \
- matin[3] = u3; \
- matout = transpose(matin); \
- u0 = matout[0]; \
- u1 = matout[1]; \
- u2 = matout[2]; \
- u3 = matout[3]; \
- }
-
-/** This OpenGL ES kernel computes the matrix transposition of the input matrix
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
- * @note The optimization name must be passed at compile time using "#define OPTIMIZATION_NAME" for F16. e.g. "#define TRANSPOSE_8X8"
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32/F16
- * @param[in] src_attrs The attributes of the source matrix
- * @param[out] dst_ptr   Pointer to the destination matrix. Supported data type: same as src_ptr
- * @param[in] dst_attrs The attributes of the destination matrix
- */
-SHADER_PARAMS_DECLARATION
-{
- ImageAttributes src_attrs;
- ImageAttributes dst_attrs;
-};
-
-#ifdef DATA_TYPE_FP32
-TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
-TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
-
-void main(void)
-{
- // compute source address
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
-
- // load the NxN block at (x, y)
- vec4 u0 = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
- vec4 u1 = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
- vec4 u2 = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
- vec4 u3 = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
-
- // transpose the block
- TRANSPOSE_4x4_func(u0, u1, u2, u3);
-
- // store the block at (y, x)
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, uint(16) * uint(gl_GlobalInvocationID.y) + uint(4) * uint(gl_GlobalInvocationID.x) * (dst_attrs.stride_y));
-
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), u0);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), u1);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), u2);
- VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), u3);
-}
-
-#elif defined(DATA_TYPE_FP16) /* DATA_TYPE_FP16 */
-
-#if defined(TRANSPOSE_4X4)
-TENSOR_DECLARATION(1, srcBuffer, uvec2, src_ptr, src_shift, 3, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec2, dst_ptr, dst_shift, 3, writeonly);
-
-void main(void)
-{
- // compute source address
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
-
- // load the NxN block at (x, y)
- vec4 u0 = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 0));
- vec4 u1 = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 1));
- vec4 u2 = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 2));
- vec4 u3 = LOAD_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, 3));
-
- // transpose the block
- TRANSPOSE_4x4_func(u0, u1, u2, u3);
-
- // store the block at (y, x)
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, uint(8) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_attrs.step_y));
-
- STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 0), u0);
- STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), u1);
- STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), u2);
- STORE_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), u3);
-}
-
-#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_8X8 */
-TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-
-void main(void)
-{
- // compute source address
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
-
- vec4 u[8][2];
-
- for(int i = 0; i < 8; i++)
- {
- u[i] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, i));
- }
-
- // transpose the block
- TRANSPOSE_4x4_func(u[0][0], u[1][0], u[2][0], u[3][0]);
- TRANSPOSE_4x4_func(u[0][1], u[1][1], u[2][1], u[3][1]);
- TRANSPOSE_4x4_func(u[4][0], u[5][0], u[6][0], u[7][0]);
- TRANSPOSE_4x4_func(u[4][1], u[5][1], u[6][1], u[7][1]);
- SWAP_4x4_func(u[0][1], u[1][1], u[2][1], u[3][1], u[4][0], u[5][0], u[6][0], u[7][0]);
-
- // store the block at (y, x)
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, uint(16) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_attrs.step_y));
-
- for(int i = 0; i < 8; i++)
- {
- STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, i), u[i]);
- }
-}
-
-#elif defined(TRANSPOSE_8X8_SQUARE) /* TRANSPOSE_8x8_SQUARE */
-TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
-TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
-
-void main(void)
-{
- ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
- ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(dst_attrs, dst_shift);
-
- if(gl_GlobalInvocationID.x <= gl_GlobalInvocationID.y)
- {
- uint blk1_offset_in_bytes = CURRENT_ITEM_OFFSET_IN_BYTES(src_iter);
- TENSOR_ITERATOR_ADVANCE_IN_BYTES(dst_iter, uint(16) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_attrs.step_y));
- uint blk2_offset_in_bytes = CURRENT_ITEM_OFFSET_IN_BYTES(dst_iter);
-
- // load block1
- vec4 u1[8][2];
-
- SET_TENSOR_ITERATOR_OFFSET_IN_BYTES(src_iter, blk1_offset_in_bytes);
- for(int i = 0; i < 8; i++)
- {
- u1[i] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, i));
- }
-
- // transpose block1
- TRANSPOSE_4x4_func(u1[0][0], u1[1][0], u1[2][0], u1[3][0]);
- TRANSPOSE_4x4_func(u1[0][1], u1[1][1], u1[2][1], u1[3][1]);
- TRANSPOSE_4x4_func(u1[4][0], u1[5][0], u1[6][0], u1[7][0]);
- TRANSPOSE_4x4_func(u1[4][1], u1[5][1], u1[6][1], u1[7][1]);
- SWAP_4x4_func(u1[0][1], u1[1][1], u1[2][1], u1[3][1], u1[4][0], u1[5][0], u1[6][0], u1[7][0]);
-
- // write to block2
- SET_TENSOR_ITERATOR_OFFSET_IN_BYTES(dst_iter, blk2_offset_in_bytes);
- for(int i = 0; i < 8; i++)
- {
- STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, i), u1[i]);
- }
-
- // load block2
- vec4 u2[8][2];
-
- SET_TENSOR_ITERATOR_OFFSET_IN_BYTES(src_iter, blk2_offset_in_bytes);
- for(int i = 0; i < 8; i++)
- {
- u2[i] = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 0, i));
- }
-
- // transpose block2
- TRANSPOSE_4x4_func(u2[0][0], u2[1][0], u2[2][0], u2[3][0]);
- TRANSPOSE_4x4_func(u2[0][1], u2[1][1], u2[2][1], u2[3][1]);
- TRANSPOSE_4x4_func(u2[4][0], u2[5][0], u2[6][0], u2[7][0]);
- TRANSPOSE_4x4_func(u2[4][1], u2[5][1], u2[6][1], u2[7][1]);
- SWAP_4x4_func(u2[0][1], u2[1][1], u2[2][1], u2[3][1], u2[4][0], u2[5][0], u2[6][0], u2[7][0]);
-
- // write to block1
- SET_TENSOR_ITERATOR_OFFSET_IN_BYTES(dst_iter, blk1_offset_in_bytes);
- for(int i = 0; i < 8; i++)
- {
- STORE_PACK8_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, 0, i), u2[i]);
- }
- }
-}
-
-#endif /* TRANSPOSE_4X4 */
-
-#endif /* DATA_TYPE_FP32 */
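
All three variants above are built from the same primitive: transpose a small square tile in registers, then write it back at the mirrored block position. A scalar C++ sketch of that tiling scheme for a square matrix (illustrative names, no vec4/mat4 packing):

    #include <cstdio>

    // Transposes the 4x4 tile at block (bx, by) of an n x n matrix into the
    // mirrored block (by, bx) of dst, as each shader invocation does above.
    void transpose_block_4x4(const float *src, float *dst, int n, int bx, int by)
    {
        for(int r = 0; r < 4; ++r)
        {
            for(int c = 0; c < 4; ++c)
            {
                dst[(bx * 4 + c) * n + (by * 4 + r)] = src[(by * 4 + r) * n + (bx * 4 + c)];
            }
        }
    }

    int main()
    {
        const int n = 8;
        float src[n * n], dst[n * n];
        for(int i = 0; i < n * n; ++i)
        {
            src[i] = float(i);
        }
        for(int by = 0; by < n / 4; ++by)
        {
            for(int bx = 0; bx < n / 4; ++bx)
            {
                transpose_block_4x4(src, dst, n, bx, by);
            }
        }
        std::printf("%g\n", dst[1 * n]); // dst(1,0) == src(0,1) == 1
        return 0;
    }
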
diff --git a/src/core/GLES_COMPUTE/egl_entries.in b/src/core/GLES_COMPUTE/egl_entries.in
deleted file mode 100644
index 2fff31530d..0000000000
--- a/src/core/GLES_COMPUTE/egl_entries.in
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-EGL_ENTRY(eglGetProcAddress)
-EGL_ENTRY(eglBindAPI)
-EGL_ENTRY(eglChooseConfig)
-EGL_ENTRY(eglCreateContext)
-EGL_ENTRY(eglDestroyContext)
-EGL_ENTRY(eglGetDisplay)
-EGL_ENTRY(eglInitialize)
-EGL_ENTRY(eglMakeCurrent)
-EGL_ENTRY(eglTerminate)
-EGL_ENTRY(eglGetError)
-EGL_ENTRY(eglQueryString)
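
These .in files are X-macro lists: the including code defines EGL_ENTRY (or GL_ENTRY below), includes the list, and gets one declaration or loader statement per symbol. The actual GLES loader is not part of this hunk, so the following is only a minimal C++ sketch of the pattern with a shortened, hypothetical list:

    #include <cstdio>

    // Stand-in for the entries file: one X-macro invocation per symbol.
    #define EGL_ENTRY_LIST    \
        EGL_ENTRY(eglBindAPI) \
        EGL_ENTRY(eglInitialize)

    // Expansion 1: declare one function-pointer slot per entry.
    #define EGL_ENTRY(name) void *name##_ptr = nullptr;
    EGL_ENTRY_LIST
    #undef EGL_ENTRY

    // Expansion 2: reuse the same list, e.g. to report load status.
    #define EGL_ENTRY(name) std::printf(#name " loaded: %d\n", name##_ptr != nullptr);
    void report() { EGL_ENTRY_LIST }
    #undef EGL_ENTRY

    int main()
    {
        report();
        return 0;
    }
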
diff --git a/src/core/GLES_COMPUTE/gl_entries.in b/src/core/GLES_COMPUTE/gl_entries.in
deleted file mode 100644
index 80bdb91557..0000000000
--- a/src/core/GLES_COMPUTE/gl_entries.in
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-GL_ENTRY(glAttachShader)
-GL_ENTRY(glCompileShader)
-GL_ENTRY(glCreateProgram)
-GL_ENTRY(glCreateShader)
-GL_ENTRY(glDeleteProgram)
-GL_ENTRY(glDeleteShader)
-GL_ENTRY(glDetachShader)
-GL_ENTRY(glGetProgramInfoLog)
-GL_ENTRY(glGetProgramiv)
-GL_ENTRY(glGetShaderInfoLog)
-GL_ENTRY(glGetShaderiv)
-GL_ENTRY(glLinkProgram)
-GL_ENTRY(glShaderSource)
-GL_ENTRY(glUseProgram)
-GL_ENTRY(glBindBuffer)
-GL_ENTRY(glBindBufferBase)
-GL_ENTRY(glBufferData)
-GL_ENTRY(glDeleteBuffers)
-GL_ENTRY(glDispatchCompute)
-GL_ENTRY(glFlush)
-GL_ENTRY(glGenBuffers)
-GL_ENTRY(glGetProgramResourceIndex)
-GL_ENTRY(glGetUniformLocation)
-GL_ENTRY(glMapBufferRange)
-GL_ENTRY(glMemoryBarrier)
-GL_ENTRY(glUniform1ui)
-GL_ENTRY(glUnmapBuffer)
-GL_ENTRY(glGetError)
-GL_ENTRY(glGetActiveUniformBlockiv)
-GL_ENTRY(glUniformBlockBinding)
-GL_ENTRY(glGetUniformBlockIndex)
-GL_ENTRY(glGenTextures)
-GL_ENTRY(glDeleteTextures)
-GL_ENTRY(glBindTexture)
-GL_ENTRY(glTexImage2D)
-GL_ENTRY(glGenFramebuffers)
-GL_ENTRY(glDeleteFramebuffers)
-GL_ENTRY(glBindFramebuffer)
-GL_ENTRY(glFramebufferTexture2D)
-GL_ENTRY(glGetString)
diff --git a/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
deleted file mode 100644
index 5e8accc95d..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-GCAbsoluteDifferenceKernel::GCAbsoluteDifferenceKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void GCAbsoluteDifferenceKernel::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- constexpr unsigned int num_elems_processed_per_iteration = 4;
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("absdiff", build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowRectangle input1_access(input1->info(), 0, 0, 4, 1);
- AccessWindowRectangle input2_access(input2->info(), 0, 0, 4, 1);
- AccessWindowRectangle output_access(output->info(), 0, 0, 4, 1);
-
- update_window_and_padding(win, input1_access, input2_access, output_access);
-
- ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
- input2->info()->valid_region());
-
- output_access.set_valid_region(win, valid_region);
-
- IGCKernel::configure(win);
-}
-
-void GCAbsoluteDifferenceKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
-
- _kernel.use();
-
- Window slice = window.first_slice_window_2D();
- do
- {
- unsigned int idx = 0;
- unsigned int binding = 1; // SSBO binding starts from 1.
- add_2D_tensor_argument(idx, _input1, binding++, slice);
- add_2D_tensor_argument(idx, _input2, binding++, slice);
- add_2D_tensor_argument(idx, _output, binding++, slice);
-
- _kernel.update_shader_params();
-
- enqueue(*this, slice);
- }
- while(window.slide_window_slice_2D(slice));
-}
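
The run() method above shows the execution pattern shared by these kernels: the configured window is cut into slices, and each slice is bound and dispatched separately. A toy C++ sketch of that loop structure (illustrative types only, not the arm_compute Window API):

    #include <cstdio>

    struct Window3D
    {
        int width, height, depth;
    };

    int main()
    {
        Window3D win{8, 8, 3};
        // One dispatch per 2D slice: bind the tensor arguments for the slice,
        // update the shader params, then enqueue a compute dispatch.
        for(int z = 0; z < win.depth; ++z)
        {
            std::printf("dispatch slice z=%d (%dx%d)\n", z, win.width, win.height);
        }
        return 0;
    }
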
diff --git a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
deleted file mode 100644
index 0173b81cf8..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h"
-
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-GCActivationLayerKernel::GCActivationLayerKernel(GCCoreRuntimeContext *ctx)
- : _input(nullptr), _output(nullptr), _ctx(ctx)
-{
-}
-
-void GCActivationLayerKernel::configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-
- // Make sure _kernel is initialized before calling the parent's configure
- _input = input;
- _output = input;
-
- if(output != nullptr)
- {
-        // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- _output = output;
- }
-
- unsigned int num_elems_processed_per_iteration = 4 / input->info()->element_size();
-
- // Set build options
- std::set<std::string> build_opts;
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- build_opts.emplace(("#define " + string_from_activation_func(act_info.activation())));
- build_opts.emplace(("#define " + dt_name));
- build_opts.emplace(("#define A_VAL " + float_to_string_with_full_precision(act_info.a())));
- build_opts.emplace(("#define B_VAL " + float_to_string_with_full_precision(act_info.b())));
- build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)));
- build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)));
- build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)));
-
- // Create kernel
- _kernel = create_opengl_kernel(_ctx, "activation_layer", build_opts);
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
- if(output != nullptr)
- {
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win,
- AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
- output_access);
-
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
- }
- else
- {
- update_window_and_padding(win,
- AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
- }
-
- IGCKernel::configure(win);
-}
-
-void GCActivationLayerKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
-
- _kernel.use();
-
- _output->set_needs_shifting(true);
-
- Window collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
- Window slice_in = collapsed.first_slice_window_3D();
-
- slice.shift(Window::DimX, -(_output->info()->padding()).left);
-
- if(_input == _output)
- {
- slice_in.shift(Window::DimX, -(_input->info()->padding()).left);
- }
-
- do
- {
- unsigned int idx = 0;
- unsigned int binding = 1;
- add_3D_tensor_argument(idx, _input, binding++, slice);
- add_3D_tensor_argument(idx, _output, binding++, slice_in);
- _kernel.update_shader_params();
- enqueue(*this, slice);
- }
- while(collapsed.slide_window_slice_3D(slice) && collapsed.slide_window_slice_3D(slice_in));
-}
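
One detail worth noting in configure() above: the activation kernel works on 32-bit words, so the elements processed per iteration are derived from the element size. A one-line C++ sketch of that relationship:

    #include <cstdio>

    unsigned int elems_per_iteration(unsigned int element_size_bytes)
    {
        return 4u / element_size_bytes; // F32 -> 1 element, F16 -> 2 packed halves
    }

    int main()
    {
        std::printf("F32: %u, F16: %u\n", elems_per_iteration(4), elems_per_iteration(2));
        return 0;
    }
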
diff --git a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
deleted file mode 100644
index f31c8ca156..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.cpp
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
- ARM_COMPUTE_UNUSED(policy);
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2);
-
- // Validate in case of configured output
- if((output != nullptr) && (output->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
-{
- constexpr unsigned int num_elems_processed_per_iteration = 8;
-
- Window win = calculate_max_window(*input1, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- bool window_changed = update_window_and_padding(win, input1_access, input2_access, output_access);
-
- ValidRegion valid_region = intersect_valid_regions(input1->valid_region(),
- input2->valid_region());
-
- output_access.set_valid_region(win, valid_region);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-GCArithmeticAdditionKernel::GCArithmeticAdditionKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void GCArithmeticAdditionKernel::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, ConvertPolicy policy)
-{
- ARM_COMPUTE_UNUSED(policy);
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
- set_format_if_unknown(*output->info(), Format::F16);
- }
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1->info(), input2->info(), output->info(), policy));
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("arithmetic_add", build_opts));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input1->info(), input2->info(), output->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- IGCKernel::configure(win_config.second);
-}
-
-Status GCArithmeticAdditionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, policy));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get()).first);
-
- return Status{};
-}
-
-void GCArithmeticAdditionKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
-
- _kernel.use();
-
- _output->set_needs_shifting(true);
-
- Window slice = window.first_slice_window_3D();
- Window slice_in = window.first_slice_window_3D();
-
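- // Only the output slice is shifted by the padding; the inputs are read through slice_in.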
- slice.shift(Window::DimX, -(_output->info()->padding()).left);
-
- do
- {
- unsigned int idx = 0;
- unsigned int binding = 1; // SSBO binding starts from 1.
- add_3D_tensor_argument(idx, _input1, binding++, slice_in);
- add_3D_tensor_argument(idx, _input2, binding++, slice_in);
- add_3D_tensor_argument(idx, _output, binding++, slice);
-
- _kernel.update_shader_params();
-
- enqueue(*this, slice);
- }
- while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
-}
diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
deleted file mode 100644
index 9281ce5ffb..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h"
-
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
-{
- ARM_COMPUTE_UNUSED(epsilon);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
-
- if((output != nullptr) && (output->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- if(beta != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
- }
- if(gamma != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
- }
- if(act_info.enabled())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
- && act_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
- && act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
- }
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output,
- ITensorInfo *mean, ITensorInfo *var,
- ITensorInfo *beta, ITensorInfo *gamma)
-{
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type());
-
- unsigned int num_elems_processed_per_iteration = 1;
- if(input->data_type() == DataType::F16)
- {
- num_elems_processed_per_iteration = 4;
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- AccessWindowStatic mean_access(mean, 0, 0, mean->dimension(0) + 3, mean->dimension(1));
- AccessWindowStatic var_access(var, 0, 0, var->dimension(0) + 3, var->dimension(1));
-
- bool window_changed = false;
- if(beta != nullptr)
- {
- AccessWindowStatic beta_access(beta, 0, 0, beta->dimension(0) + 3, beta->dimension(1));
- if(gamma != nullptr)
- {
- AccessWindowStatic gamma_access(gamma, 0, 0, gamma->dimension(0) + 3, gamma->dimension(1));
- window_changed = update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access, gamma_access);
- }
- else
- {
- window_changed = update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access);
- }
- }
- else
- {
- if(gamma != nullptr)
- {
- AccessWindowStatic gamma_access(gamma, 0, 0, gamma->dimension(0) + 3, gamma->dimension(1));
- window_changed = update_window_and_padding(win, input_access, output_access, mean_access, var_access, gamma_access);
- }
- else
- {
- window_changed = update_window_and_padding(win, input_access, output_access, mean_access, var_access);
- }
- }
- output_access.set_valid_region(win, input->valid_region());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-GCBatchNormalizationLayerKernel::GCBatchNormalizationLayerKernel()
- : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0.0f)
-{
-}
-
-void GCBatchNormalizationLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma,
- float epsilon, ActivationLayerInfo act_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, var);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mean->info(), var->info(),
- (beta != nullptr) ? beta->info() : nullptr, (gamma != nullptr) ? gamma->info() : nullptr,
- epsilon, act_info));
-
- _input = input;
- _output = output;
- _mean = mean;
- _var = var;
- _beta = beta;
- _gamma = gamma;
- _epsilon = epsilon;
-
- // Set build options
- std::set<std::string> build_opts;
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- build_opts.emplace(("#define " + dt_name));
- build_opts.emplace(("#define ESPILON " + float_to_string_with_full_precision(_epsilon)));
- build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)));
- build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)));
- build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)));
- if(beta == nullptr)
- {
- build_opts.emplace("#define USE_DEFAULT_BETA");
- }
- if(gamma == nullptr)
- {
- build_opts.emplace("#define USE_DEFAULT_GAMMA");
- }
-
- if(act_info.enabled())
- {
- build_opts.emplace("#define " + string_from_activation_func(act_info.activation()));
- build_opts.emplace("#define A_VAL " + float_to_string_with_full_precision(act_info.a()));
- build_opts.emplace("#define B_VAL " + float_to_string_with_full_precision(act_info.b()));
- }
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), mean->info(), var->info(),
- (beta != nullptr) ? beta->info() : nullptr, (gamma != nullptr) ? gamma->info() : nullptr);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-
- IGCKernel::configure(win_config.second);
-}
-
-Status GCBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta, const ITensorInfo *gamma,
- float epsilon, ActivationLayerInfo act_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
- mean->clone().get(), var->clone().get(),
- beta->clone().get(), gamma->clone().get())
- .first);
-
- return Status{};
-}
-
-void GCBatchNormalizationLayerKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- _kernel.use();
-
- _output->set_needs_shifting(true);
-
- Window slice = window.first_slice_window_3D();
- Window slice_in = window.first_slice_window_3D();
-
- Window vector_slice = window.first_slice_window_1D();
- vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
-
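- // The 1D parameter tensors are bound once, after the two 3D tensors (input and output).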
- unsigned int idx = 2 * num_arguments_per_3D_tensor();
- unsigned int binding_point = 3;
- add_1D_tensor_argument(idx, _mean, binding_point, vector_slice);
- add_1D_tensor_argument(idx, _var, ++binding_point, vector_slice);
- if(_beta != nullptr)
- {
- add_1D_tensor_argument(idx, _beta, ++binding_point, vector_slice);
- }
- if(_gamma != nullptr)
- {
- add_1D_tensor_argument(idx, _gamma, ++binding_point, vector_slice);
- }
-
- slice.shift(Window::DimX, -(_output->info()->padding()).left);
-
- do
- {
- idx = 0;
- add_3D_tensor_argument(idx, _input, 1, slice_in);
- add_3D_tensor_argument(idx, _output, 2, slice);
-
- _kernel.update_shader_params();
- enqueue(*this, slice);
- }
- while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
-}
diff --git a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
deleted file mode 100644
index 5781c564ea..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-GCCol2ImKernel::GCCol2ImKernel()
- : _input(nullptr), _output(nullptr), _convolved_dims()
-{
-}
-
-void GCCol2ImKernel::configure(const IGCTensor *input, IGCTensor *output,
- std::pair<unsigned int, unsigned int> convolved_dims)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
- TensorShape output_shape = input->info()->tensor_shape();
- output_shape.set(0, convolved_dims.first);
- output_shape.set(1, convolved_dims.second);
- output_shape.set(2, input->info()->tensor_shape()[0]);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- _input = input;
- _output = output;
- _convolved_dims = convolved_dims;
-
- const DataType dt = input->info()->data_type();
- const unsigned int local_size = 1;
-
- // Create kernel
- std::set<std::string> build_opts;
- build_opts.emplace("#define COL2IM ");
- build_opts.emplace("#define WIDTH_OUTPUT " + support::cpp11::to_string(_convolved_dims.first));
- const std::string dt_name = (dt == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- build_opts.emplace(("#define " + dt_name));
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(local_size));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(local_size));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(local_size));
-
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("col2im", build_opts));
-
- // Configure window
- const unsigned int num_elems_processed_per_iteration = (dt == DataType::F32) ? 1 : 2;
-
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- const int input_padding = ceil_to_multiple(input->info()->dimension(0), 2) - input->info()->dimension(0);
-
- AccessWindowStatic input_access(input->info(), 0, 0, input->info()->dimension(0) + input_padding, input->info()->dimension(1) + 1);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, output->info()->valid_region());
-
- IGCKernel::configure(win);
-}
-
-void GCCol2ImKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
-
- _kernel.use();
-
- Window collapsed_window = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
- Window slice = collapsed_window.first_slice_window_3D();
-
- // Set static kernel arguments
- unsigned int idx = 2 * num_arguments_per_3D_tensor();
- //_kernel.set_argument(idx++, _output->info()->strides_in_bytes()[3]);
- _kernel.set_argument(idx++, uint(_output->info()->dimension(2)));
- _kernel.set_argument(idx++, _input->info()->strides_in_bytes()[2]);
-
- do
- {
- // Set inputs
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, 1, slice);
- add_3D_tensor_argument(idx, _output, 2, slice);
- _kernel.update_shader_params();
- enqueue(*this, slice);
- }
- while(collapsed_window.slide_window_slice_3D(slice));
-}
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
deleted file mode 100644
index 3256f11e74..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-GCDepthConcatenateLayerKernel::GCDepthConcatenateLayerKernel()
- : _input(nullptr), _output(nullptr), _depth_offset(0)
-{
-}
-void GCDepthConcatenateLayerKernel::configure(const IGCTensor *input, unsigned int depth_offset, IGCTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) != output->info()->dimension(Window::DimX));
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) != output->info()->dimension(Window::DimY));
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
-
- _input = input;
- _output = output;
- _depth_offset = depth_offset;
-
- // Add build options
- std::set<std::string> build_opts;
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- build_opts.emplace(("#define " + dt_name));
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("concatenate_depth", build_opts));
-
- unsigned int num_elems_processed_per_iteration = 1;
- if(input->info()->data_type() == DataType::F16)
- {
- num_elems_processed_per_iteration = 4;
- }
-
- // The window is based on the output, but its Z dimension must only span the input's depth, as we copy all the depths of the input
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1));
-
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- IGCKernel::configure(win);
-}
-
-void GCDepthConcatenateLayerKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
-
- _kernel.use();
-
- _output->set_needs_shifting(true);
-
- Window slice_in = window.first_slice_window_3D();
- Window slice_out = window.first_slice_window_3D();
-
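- // Start writing at this input's depth offset within the concatenated output.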
- slice_out.set(Window::DimZ, Window::Dimension(_depth_offset));
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, 1, slice_in);
- add_3D_tensor_argument(idx, _output, 2, slice_out);
-
- _kernel.update_shader_params();
-
- enqueue(*this, slice_in);
- }
- while(window.slide_window_slice_3D(slice_in));
-}
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
deleted file mode 100644
index 95d487b4dd..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.cpp
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-GCDepthwiseConvolutionLayer3x3Kernel::GCDepthwiseConvolutionLayer3x3Kernel()
- : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_left(0), _conv_pad_top(0), _lws(gles::NDRange(1U, 1U, 1U))
-{
-}
-
-BorderSize GCDepthwiseConvolutionLayer3x3Kernel::border_size() const
-{
- return _border_size;
-}
-
-void GCDepthwiseConvolutionLayer3x3Kernel::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != 3 || weights->info()->dimension(1) != 3);
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
- }
-
- // Get convolved dimensions
- const TensorShape output_shape = compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(),
- output_shape,
- 1,
- input->info()->data_type());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(2) != weights->info()->dimension(2));
-
- _input = input;
- _output = output;
- _weights = weights;
- _biases = biases;
- _conv_stride_x = conv_info.stride().first;
- _conv_stride_y = conv_info.stride().second;
- _conv_pad_left = conv_info.pad_left();
- _conv_pad_top = conv_info.pad_top();
- _border_size = BorderSize(_conv_pad_top, conv_info.pad_right(), conv_info.pad_bottom(), _conv_pad_left);
-
- // Set build options
- ARM_COMPUTE_ERROR_ON(_conv_stride_x < 1 || _conv_stride_x > 3);
- std::set<std::string> options;
-
- options.emplace("#define DEPTH_MULTIPLIER " + support::cpp11::to_string(depth_multiplier));
- options.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
- options.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
- options.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
- options.emplace("#define STRIDE_X " + support::cpp11::to_string(_conv_stride_x));
- options.emplace("#define STRIDE_Y " + support::cpp11::to_string(_conv_stride_y));
-
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- options.emplace(("#define " + dt_name));
-
- unsigned int num_elems_read_per_iteration_x = 8;
- unsigned int num_elems_read_per_iteration_y = 1;
- unsigned int num_elems_written_per_iteration_x = 4;
- unsigned int num_elems_written_per_iteration_y = 1;
- unsigned int num_elems_written_per_iteration_z = 1;
-
- if((_conv_stride_x == 1) && (_conv_stride_y == 1))
- {
- switch(input->info()->data_type())
- {
-#define PROCESS_4X_3Y_1Z
-
- case DataType::F16:
-#if defined(PROCESS_4X_3Y_1Z)
- options.emplace("#define PROCESS_4X_3Y_1Z");
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_y = 3;
-#endif /* PROCESS_4X_3Y_1Z */
-#undef PROCESS_4X_3Y_1Z
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is not supported");
- break;
- }
- }
- else
- {
- switch(input->info()->data_type())
- {
- case DataType::F16:
- options.emplace("#define PROCESS_4X_1Y_1Z");
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is not supported");
- break;
- }
- }
-
- if(_biases != nullptr)
- {
- options.emplace("#define BIAS");
- }
-
- // Create kernel
- std::string kernel_name = "depthwise_convolution_3x3";
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name, options));
-
- // Calculate output right and bottom border
- const int output_width = output->info()->dimension(0);
- const int output_height = output->info()->dimension(1);
- const int output_padding_right = ceil_to_multiple(output_width, num_elems_written_per_iteration_x * _lws[0]) - output_width;
- const int output_padding_bottom = ceil_to_multiple(output_height, num_elems_written_per_iteration_y * _lws[1]) - output_height;
-
- // Calculate input right and bottom border
- const int input_width = input->info()->dimension(0);
- const int input_height = input->info()->dimension(1);
-
- const int input_total_width = std::max(int(input->info()->padding().left), int(_conv_pad_left)) + input_width + std::max(int(input->info()->padding().right), int(_conv_pad_left));
- const int input_total_height = std::max(int(input->info()->padding().top), int(_conv_pad_top)) + input_height + std::max(int(input->info()->padding().bottom), int(_conv_pad_top));
-
- const int input_padding_right = ceil_to_multiple(input_total_width, num_elems_read_per_iteration_x * _lws[0]) - input_width - _conv_pad_left;
- const int input_padding_bottom = ceil_to_multiple(input_total_height, num_elems_read_per_iteration_y * _lws[1]) - input_height - _conv_pad_top;
-
- BorderSize border = BorderSize(0, output_padding_right, output_padding_bottom, 0);
-
- Window win = calculate_max_enlarged_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z), border);
-
- AccessWindowStatic input_access(input->info(), -_conv_pad_left, -_conv_pad_top, input_width + input_padding_right, input_height + input_padding_bottom);
- AccessWindowStatic weights_access = AccessWindowStatic(nullptr, 0, 0, 0, 0);
- AccessWindowStatic bias_access = AccessWindowStatic(nullptr, 0, 0, 0, 1);
-
- switch(weights->info()->data_type())
- {
- case DataType::F16:
- weights_access = AccessWindowStatic(weights->info(), 0, 0, 4, 3);
- if(_biases != nullptr)
- {
- bias_access = AccessWindowStatic(_biases->info(), 0, 0, _biases->info()->dimension(0) + 1, 1);
- }
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is not supported");
- break;
- }
-
- AccessWindowStatic output_access(output->info(), 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
-
- if(_biases != nullptr)
- {
- update_window_and_padding(win, input_access, weights_access, bias_access, output_access);
- }
- else
- {
- update_window_and_padding(win, input_access, weights_access, output_access);
- }
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- IGCKernel::configure(win);
-}
-
-void GCDepthwiseConvolutionLayer3x3Kernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- _kernel.use();
-
- _output->set_needs_shifting(true);
-
- // Create input window and adjust
- Window win_in = window;
- win_in.adjust(Window::DimX, -_conv_pad_left, true);
- win_in.adjust(Window::DimY, -_conv_pad_top, true);
- win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
- win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
-
- Window slice_in = win_in.first_slice_window_3D();
- Window slice_out = window.first_slice_window_3D();
- Window slice_weights = window.first_slice_window_3D();
- slice_weights.set_dimension_step(Window::DimX, 0);
- slice_weights.set_dimension_step(Window::DimY, 0);
-
- // Set biases
- if(_biases != nullptr)
- {
- unsigned int idx = 3 * num_arguments_per_3D_tensor();
- Window slice_biases;
- slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
- add_1D_tensor_argument(idx, _biases, 4, slice_biases);
- }
-
- slice_out.shift(Window::DimX, -(_output->info()->padding()).left);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, 1, slice_in);
- add_3D_tensor_argument(idx, _output, 2, slice_out);
- add_3D_tensor_argument(idx, _weights, 3, slice_weights);
-
- _kernel.update_shader_params();
- enqueue(*this, slice_out, _lws);
- }
- while(window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
-}
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
deleted file mode 100644
index 9ce8acea09..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ /dev/null
@@ -1,450 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-template <unsigned int kernel_size>
-GCDirectConvolutionLayerKernel<kernel_size>::GCDirectConvolutionLayerKernel()
- : _input(nullptr), _bias(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_x(0), _conv_pad_y(0), _lws(gles::NDRange(1U, 1U, 1U))
-{
-}
-
-template <unsigned int kernel_size>
-BorderSize GCDirectConvolutionLayerKernel<kernel_size>::border_size() const
-{
- return _border_size;
-}
-
-template <unsigned int kernel_size>
-void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
- ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
- ARM_COMPUTE_ERROR_ON_MSG((kernel_size == 3 && std::get<0>(conv_info.stride()) > 2), "Strides larger than 2 not supported in 3x3 direct convolution!");
- ARM_COMPUTE_ERROR_ON(kernel_size != weights->info()->dimension(0));
- ARM_COMPUTE_ERROR_ON(act_info.enabled() && act_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && act_info.activation() != ActivationLayerInfo::ActivationFunction::LOGISTIC);
-
- if(bias != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
- // FIXME: Bug in framework, workaround it in tests currently.
- //ARM_COMPUTE_ERROR_ON(bias->info()->dimension(0) != weights->info()->dimension(3));
- ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
- }
-
- // Get convolved dimensions
- unsigned int owidth = 0;
- unsigned int oheight = 0;
- std::tie(owidth, oheight) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info);
-
- TensorShape output_shape = input->info()->tensor_shape();
- output_shape.set(0, owidth);
- output_shape.set(1, oheight);
- output_shape.set(2, weights->info()->dimension(3));
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(!conv_info.padding_is_symmetric());
-
- _conv_stride_x = std::get<0>(conv_info.stride());
- _conv_stride_y = std::get<1>(conv_info.stride());
- _conv_pad_x = std::get<0>(conv_info.pad());
- _conv_pad_y = std::get<1>(conv_info.pad());
-
- _input = input;
- _weights = weights;
- _output = output;
- _bias = bias;
- _border_size = BorderSize(_conv_pad_y, _conv_pad_x);
-
- std::set<std::string> options;
-
- options.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
- options.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
- options.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
- options.emplace("#define STRIDE_X " + support::cpp11::to_string(_conv_stride_x));
- options.emplace("#define STRIDE_Y " + support::cpp11::to_string(_conv_stride_y));
-
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- options.emplace(("#define " + dt_name));
-
- // Activation information in case of a fused activation
- if(act_info.enabled())
- {
- options.emplace("#define FUSED_ACTIVATION");
- options.emplace(("#define " + string_from_activation_func(act_info.activation())));
- options.emplace(("#define ACT_OP " + lower_string(string_from_activation_func(act_info.activation())) + "_op"));
- options.emplace(("#define A_VAL " + float_to_string_with_full_precision(act_info.a())));
- options.emplace(("#define B_VAL " + float_to_string_with_full_precision(act_info.b())));
- }
-
- unsigned int num_elems_read_per_iteration_x = kernel_size * _conv_stride_x;
- unsigned int num_elems_read_per_iteration_y = 1;
- unsigned int num_elems_written_per_iteration_x = 1;
- unsigned int num_elems_written_per_iteration_y = 1;
- unsigned int num_elems_written_per_iteration_z = 1;
-
- if(kernel_size == 3)
- {
- if((_conv_stride_x == 1) && (_conv_stride_y == 1))
- {
- switch(input->info()->data_type())
- {
- case DataType::F16:
- // TODO(APPBROWSER-299): Choose the most optimal path and remove others.
-#define PROCESS_4X_3Y_1Z
-
-#if defined(PROCESS_8X_3Y_1Z)
- options.emplace("#define PROCESS_8X_3Y_1Z");
- num_elems_read_per_iteration_x = 16;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_x = 8;
- num_elems_written_per_iteration_y = 3;
-#elif defined(PROCESS_4X_3Y_1Z)
- options.emplace("#define PROCESS_4X_3Y_1Z");
- num_elems_read_per_iteration_x = 8;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 3;
-#elif defined(PROCESS_4X_4Y_1Z)
- options.emplace("#define PROCESS_4X_4Y_1Z");
- num_elems_read_per_iteration_x = 8;
- num_elems_read_per_iteration_y = 6;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 4;
-#elif defined(PROCESS_4X_3Y_2Z)
- options.emplace("#define PROCESS_4X_3Y_2Z");
- num_elems_read_per_iteration_x = 8;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 3;
- num_elems_written_per_iteration_z = 2;
-#endif /* PROCESS_nX_nY_nZ */
-#undef PROCESS_8X_3Y_1Z
-#undef PROCESS_4X_3Y_1Z
-#undef PROCESS_4X_4Y_1Z
-#undef PROCESS_4X_3Y_2Z
- break;
-
- case DataType::F32:
- options.emplace("#define PROCESS_4X_3Y_1Z");
- num_elems_read_per_iteration_x = 8;
- num_elems_read_per_iteration_y = 5;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 3;
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is not supported");
- break;
- }
- }
- // FIXME: Just keep one in release
- else
- {
- switch(input->info()->data_type())
- {
- case DataType::F16:
- options.emplace("#define PROCESS_4X_1Y_1Z");
- num_elems_read_per_iteration_x = 8;
- num_elems_written_per_iteration_x = 4;
- break;
-
- case DataType::F32:
- // TODO(APPBROWSER-299): Choose the most optimal path and remove others.
-#define PROCESS_4X_1Y_1Z
-
-#if defined(PROCESS_1X_1Y_1Z)
- options.emplace("#define PROCESS_1X_1Y_1Z");
- num_elems_read_per_iteration_x = 3;
- num_elems_written_per_iteration_x = 1;
-#elif defined(PROCESS_4X_1Y_1Z)
- options.emplace("#define PROCESS_4X_1Y_1Z");
- num_elems_read_per_iteration_x = 8;
- num_elems_written_per_iteration_x = 4;
-#elif defined(PROCESS_8X_1Y_1Z)
- options.emplace("#define PROCESS_8X_1Y_1Z");
- num_elems_read_per_iteration_x = 12;
- num_elems_written_per_iteration_x = 8;
-#else /* PROCESS_nX_nY_nZ */
-#error Have to declare how many elements to process in one thread.
-#endif /* PROCESS_nX_nY_nZ */
-#undef PROCESS_1X_1Y_1Z
-#undef PROCESS_4X_1Y_1Z
-#undef PROCESS_8X_1Y_1Z
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is not supported");
- break;
- }
- }
- }
- else if(kernel_size == 1)
- {
- if(weights->info()->dimension(2) % 2 == 0)
- {
- options.emplace("#define WEIGHTS_OPTIMIZATION");
- }
- switch(input->info()->data_type())
- {
- case DataType::F16:
-#define PROCESS_8X_2Y_1Z
-
-#if defined(PROCESS_4X_1Y_1Z)
- options.emplace("#define PROCESS_4X_1Y_1Z");
- num_elems_read_per_iteration_x = 4;
- num_elems_written_per_iteration_x = 4;
-#elif defined(PROCESS_4X_2Y_1Z)
- options.emplace("#define PROCESS_4X_2Y_1Z");
- num_elems_read_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 2;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 2;
-#elif defined(PROCESS_4X_3Y_1Z)
- options.emplace("#define PROCESS_4X_3Y_1Z");
- num_elems_read_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 3;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 3;
-#elif defined(PROCESS_4X_4Y_1Z)
- options.emplace("#define PROCESS_4X_4Y_1Z");
- num_elems_read_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 4;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 4;
-#elif defined(PROCESS_4X_2Y_2Z)
- ARM_COMPUTE_ERROR_ON_MSG((weights->info()->dimension(4) % 2) == 1, "Current '(weights->info()->dimension(4) % 2) == 1' is not supported");
- options.emplace("#define PROCESS_4X_2Y_2Z");
- num_elems_read_per_iteration_x = 4;
- num_elems_read_per_iteration_y = 2;
- num_elems_written_per_iteration_x = 4;
- num_elems_written_per_iteration_y = 2;
- num_elems_written_per_iteration_z = 2;
-#elif defined(PROCESS_8X_1Y_1Z)
- options.emplace("#define PROCESS_8X_1Y_1Z");
- num_elems_read_per_iteration_x = 8;
- num_elems_written_per_iteration_x = 8;
-#elif defined(PROCESS_8X_2Y_1Z)
- options.emplace("#define PROCESS_8X_2Y_1Z");
- num_elems_read_per_iteration_x = 8;
- num_elems_read_per_iteration_y = 2;
- num_elems_written_per_iteration_x = 8;
- num_elems_written_per_iteration_y = 2;
-#else /* PROCESS_4X_1Y_1Z */
-#error Have to declare how many elements to process in one thread.
-#endif /* PROCESS_4X_1Y_1Z */
-#undef PROCESS_4X_1Y_1Z
-#undef PROCESS_4X_2Y_1Z
-#undef PROCESS_4X_3Y_1Z
-#undef PROCESS_4X_4Y_1Z
-#undef PROCESS_4X_2Y_2Z
-#undef PROCESS_8X_1Y_1Z
-#undef PROCESS_8X_2Y_1Z
- break;
-
- case DataType::F32:
- num_elems_read_per_iteration_x = 1;
- num_elems_written_per_iteration_x = 1;
- break;
-
- default:
- break;
- }
- }
- else if(kernel_size == 5)
- {
- switch(input->info()->data_type())
- {
- case DataType::F16:
- options.emplace("#define PROCESS_4X_1Y_1Z");
- num_elems_read_per_iteration_x = 8;
- num_elems_written_per_iteration_x = 4;
- break;
-
- default:
- break;
- }
- }
-
- if(_bias != nullptr)
- {
- options.emplace("#define BIAS");
- }
-
- std::stringstream kernel_name;
- kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
-
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name.str(), options));
-
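- // The static arguments start right after the three 3D tensors, plus the 1D bias when present.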
- unsigned int idx = (_bias == nullptr) ? 3 * num_arguments_per_3D_tensor() : (num_arguments_per_1D_tensor() + 3 * num_arguments_per_3D_tensor());
-
- // Calculate output right and bottom border
- const int output_width = output->info()->dimension(0);
- const int output_height = output->info()->dimension(1);
- const int output_padding_right = ceil_to_multiple(output_width, num_elems_written_per_iteration_x * _lws[0]) - output_width;
- const int output_padding_bottom = ceil_to_multiple(output_height, num_elems_written_per_iteration_y * _lws[1]) - output_height;
-
- // Calculate input right and bottom border
- const int input_width = input->info()->dimension(0);
- const int input_height = input->info()->dimension(1);
- const int input_total_width = std::max(int(input->info()->padding().left), int(_conv_pad_x)) + input_width + std::max(int(input->info()->padding().right), int(_conv_pad_x));
- const int input_total_height = std::max(int(input->info()->padding().top), int(_conv_pad_y)) + input_height + std::max(int(input->info()->padding().bottom), int(_conv_pad_y));
- const int padding_right1 = ceil_to_multiple(input_total_width, num_elems_read_per_iteration_x * _lws[0]) - input_width - _conv_pad_x;
- const int padding_bottom1 = ceil_to_multiple(input_total_height, num_elems_read_per_iteration_y * _lws[1]) - input_height - _conv_pad_y;
-
- const int upper_bound_w = ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + (kernel_size - 1)), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_x - input_width;
- const int upper_bound_h = ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + (kernel_size - 1)), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_y - input_height;
- const int padding_right2 = std::max(upper_bound_w, _conv_pad_x);
- const int padding_bottom2 = std::max(upper_bound_h, _conv_pad_y);
-
- const int padding_right = std::max(padding_right1, padding_right2);
- const int padding_bottom = std::max(padding_bottom1, padding_bottom2);
-
- BorderSize border = BorderSize(0, output_padding_right, output_padding_bottom, 0);
-
- Window win = calculate_max_enlarged_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z), border);
-
- AccessWindowStatic input_access(input->info(), -_conv_pad_x, -_conv_pad_y, input_width + padding_right, input_height + padding_bottom);
- AccessWindowStatic weights_access = AccessWindowStatic(nullptr, 0, 0, 0, 0);
- AccessWindowStatic bias_access = AccessWindowStatic(nullptr, 0, 0, 0, 1);
-
- switch(weights->info()->data_type())
- {
- case DataType::F16:
- if((weights->info()->dimension(2) % 2 != 0) || (kernel_size != 1))
- {
- weights_access = AccessWindowStatic(weights->info(), 0, 0, kernel_size + 1, kernel_size);
- }
- if(_bias != nullptr)
- {
- bias_access = AccessWindowStatic(_bias->info(), 0, 0, _bias->info()->dimension(0) + 1, 1);
- }
- break;
-
- case DataType::F32:
- weights_access = AccessWindowStatic(weights->info(), 0, 0, kernel_size, kernel_size);
- if(_bias != nullptr)
- {
- bias_access = AccessWindowStatic(_bias->info(), 0, 0, _bias->info()->dimension(0), 1);
- }
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is not supported");
- break;
- }
-
- AccessWindowStatic output_access(output->info(), 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
-
- if(_bias != nullptr)
- {
- update_window_and_padding(win, input_access, weights_access, bias_access, output_access);
- }
- else
- {
- update_window_and_padding(win, input_access, weights_access, output_access);
- }
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- _kernel.set_argument(idx++, _weights->info()->strides_in_bytes()[3]); // weights_stride_w
- _kernel.set_argument(idx++, _weights->info()->dimension(2)); // weights_depth
-
- IGCKernel::configure(win);
-}
-
-template <unsigned int kernel_size>
-void GCDirectConvolutionLayerKernel<kernel_size>::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- _kernel.use();
-
- _output->set_needs_shifting(true);
-
- // Get initial windows
- Window slice = window.first_slice_window_3D();
- Window win_in = window;
-
- win_in.adjust(Window::DimX, -_conv_pad_x, true);
- win_in.adjust(Window::DimY, -_conv_pad_y, true);
- win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x);
- win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y);
-
- Window slice_in = win_in.first_slice_window_3D();
-
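- // Bind the weights (and optional bias) once, outside the per-slice loop.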
- unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
- add_3D_tensor_argument(idx1, _weights, 3, slice);
-
- if(_bias != nullptr)
- {
- Window slice_bias;
- slice_bias.use_tensor_dimensions(_bias->info()->tensor_shape());
- add_1D_tensor_argument(idx1, _bias, 4, slice_bias);
- }
-
- slice.shift(Window::DimX, -(_output->info()->padding()).left);
-
- do
- {
- unsigned int idx = 0;
-
- add_3D_tensor_argument(idx, _input, 1, slice_in);
- add_3D_tensor_argument(idx, _output, 2, slice);
-
- _kernel.update_shader_params();
- enqueue(*this, slice, _lws);
- }
- while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
-}
-
-template class arm_compute::GCDirectConvolutionLayerKernel<1>;
-template class arm_compute::GCDirectConvolutionLayerKernel<3>;
-template class arm_compute::GCDirectConvolutionLayerKernel<5>;
diff --git a/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp
deleted file mode 100644
index bda6599f86..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cmath>
-#include <random>
-#include <tuple>
-
-using namespace arm_compute;
-
-GCDropoutLayerKernel::GCDropoutLayerKernel()
- : _input(nullptr), _mask(nullptr), _output(nullptr), _num_elems_processed_per_iteration(0)
-{
-}
-
-void GCDropoutLayerKernel::configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, mask, output);
-
- _input = input;
- _mask = mask;
- _output = output;
-
- std::set<std::string> build_opts;
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- std::string fporbp = forward ? "FORWARD" : "BACKWARD";
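- // Draw a random seed at configure time so each kernel instance generates a different dropout mask.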
- std::random_device rd;
- std::mt19937 mt(rd());
- std::uniform_real_distribution<float> dist(0.f, 1.f);
-
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
- build_opts.emplace("#define RATIO " + support::cpp11::to_string(ratio));
- build_opts.emplace("#define SCALE " + support::cpp11::to_string(1. / (1. - ratio)));
- build_opts.emplace("#define SEED " + support::cpp11::to_string(dist(mt)));
- build_opts.emplace("#define " + dt_name);
- build_opts.emplace("#define " + fporbp);
-
- _num_elems_processed_per_iteration = 4 / input->info()->element_size();
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("dropout", build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
-
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- IGCKernel::configure(win);
-}
-
-void GCDropoutLayerKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
-
- _kernel.use();
-
- Window slice = window.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
-
- add_3D_tensor_argument(idx, _input, 1, slice);
- add_3D_tensor_argument(idx, _mask, 2, slice);
- add_3D_tensor_argument(idx, _output, 3, slice);
-
- _kernel.update_shader_params();
- enqueue(*this, slice);
- }
- while(window.slide_window_slice_3D(slice));
-}
diff --git a/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
deleted file mode 100644
index 7ffcdd2f3f..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cstdint>
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-GCFillBorderKernel::GCFillBorderKernel()
- : IGCKernel(), _tensor(nullptr)
-{
-}
-
-bool GCFillBorderKernel::is_parallelisable() const
-{
- return false;
-}
-
-template <class T>
-void GCFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue &constant_border_value)
-{
- T value;
- constant_border_value.get(value);
- _kernel.set_argument(idx, static_cast<T>(value));
-}
-
-void GCFillBorderKernel::configure(const IGCTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
-{
- ARM_COMPUTE_ERROR_ON(tensor == nullptr);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1);
-
- border_size.limit(tensor->info()->padding());
-
- // If there is no border: early exit
- if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
- {
- return;
- }
-
- // Select appropriate kernel
- std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
-
- // Define build options
- std::set<std::string> build_opts;
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
- build_opts.emplace("#define BORDER_SIZE_TOP " + support::cpp11::to_string(border_size.top));
- build_opts.emplace("#define BORDER_SIZE_BOTTOM " + support::cpp11::to_string(border_size.bottom));
- build_opts.emplace("#define BORDER_SIZE_LEFT " + support::cpp11::to_string(border_size.left));
- build_opts.emplace("#define BORDER_SIZE_RIGHT " + support::cpp11::to_string(border_size.right));
-
- if(border_mode == BorderMode::REPLICATE)
- {
- build_opts.emplace("#define FILL_IMAGE_BORDERS_REPLICATE\n");
- }
- else
- {
- build_opts.emplace("#define FILL_IMAGE_BORDERS_CONSTANT\n");
- }
-
- switch(tensor->info()->data_type())
- {
- case DataType::F16:
- build_opts.emplace("#define DATA_TYPE_FP16");
- break;
-
- case DataType::F32:
- build_opts.emplace("#define DATA_TYPE_FP32");
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is not supported");
- break;
- }
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name, build_opts));
- _tensor = tensor;
-
- // Create static kernel arguments
- const unsigned int valid_width = tensor->info()->valid_region().shape[0];
- const unsigned int valid_height = tensor->info()->valid_region().shape[1];
- const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
-
- // Set static kernel arguments
- unsigned int idx = num_arguments_per_3D_tensor(); // Skip the tensor parameters
- _kernel.set_argument(idx++, valid_width);
- _kernel.set_argument(idx++, valid_height);
- _kernel.set_argument(idx++, tensor->info()->valid_region().anchor[0]);
- _kernel.set_argument(idx++, tensor->info()->valid_region().anchor[1]);
-
- if(BorderMode::CONSTANT == border_mode)
- {
- set_constant_border<float>(idx++, constant_border_value);
- }
-
- // Configure kernel window
- Window win;
- win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height));
- win.set(Window::DimY, Window::Dimension(0, 1, 1));
- win.use_tensor_dimensions(tensor->info()->tensor_shape(), Window::DimZ);
-
- IGCKernel::configure(win);
-}
-
-void GCFillBorderKernel::run(const Window &window)
-{
- // Border mode undefined or border width == 0
- if(_kernel.get_program() == 0)
- {
- return;
- }
-
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
-
- _kernel.use();
- Window slice = window.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _tensor, 1, slice);
-
- _kernel.update_shader_params();
-
- enqueue(*this, slice);
- }
- while(window.slide_window_slice_3D(slice));
-}
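
The two shader paths selected above (FILL_IMAGE_BORDERS_REPLICATE / FILL_IMAGE_BORDERS_CONSTANT) either clamp to the nearest valid pixel or write a fixed value into the padding ring. A small CPU sketch of that behaviour on a row-major 2D buffer, assuming a border of uniform width (names are illustrative):

#include <vector>

enum class Border { Replicate, Constant };

// Fill a border of width `b` around a w x h image stored in a
// (w + 2b) x (h + 2b) row-major buffer.
void fill_border(std::vector<float> &buf, int w, int h, int b, Border mode, float cval)
{
    const int stride = w + 2 * b;
    auto      clamp  = [](int v, int lo, int hi) { return v < lo ? lo : (v > hi ? hi : v); };
    for(int y = 0; y < h + 2 * b; ++y)
    {
        for(int x = 0; x < w + 2 * b; ++x)
        {
            const bool inside = (x >= b) && (x < w + b) && (y >= b) && (y < h + b);
            if(inside)
            {
                continue;
            }
            buf[y * stride + x] = (mode == Border::Constant) ? cval : buf[clamp(y, b, h + b - 1) * stride + clamp(x, b, w + b - 1)];
        }
    }
}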
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
deleted file mode 100644
index d395759558..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-GCGEMMInterleave4x4Kernel::GCGEMMInterleave4x4Kernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void GCGEMMInterleave4x4Kernel::configure(const IGCTensor *input, IGCTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
- TensorShape output_shape = input->info()->tensor_shape();
- output_shape.set(0, input->info()->dimension(0) * 4);
- output_shape.set(1, std::ceil(input->info()->dimension(1) / 4.0f));
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- _input = input;
- _output = output;
-
- std::set<std::string> build_opts;
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- build_opts.emplace(("#define " + dt_name));
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
-
- // Create kernel
- build_opts.emplace("#define GEMM_INTERLEAVE4x4");
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("gemm_interleave4x4", build_opts));
-
- // Configure kernel window
- const unsigned int num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input->info()->data_type());
- constexpr unsigned int num_elems_processed_per_iteration_y = 4;
- const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
- AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
-
- update_window_and_padding(win, input_access, output_access);
-
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- IGCKernel::configure(win);
-}
-
-void GCGEMMInterleave4x4Kernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
-
- _kernel.use();
-
- /*
- * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
- * |a00 a01 a02 a03|
- * |a10 a11 a12 a13|
- * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 |
- * |a30 a31 a32 a33|
- *
- * After this operation, the output matrix will have the following shape: [ width * 4, ceil(height / 4) ]
- */
- Window in_slice = window.first_slice_window_2D();
- Window out_slice = window.first_slice_window_2D();
-
- // Change x and y steps for the slide of output tensor
- out_slice.scale(Window::DimX, 4.f);
- out_slice.scale(Window::DimY, 0.25f);
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, 1, in_slice);
- add_2D_tensor_argument(idx, _output, 2, out_slice);
-
- _kernel.update_shader_params();
-
- enqueue(*this, in_slice);
- }
- while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
-}
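
A scalar reference of the interleave described in the comment inside run() above, using the same output shape as configure() (dense row-major storage assumed; this is a sketch, not library code):

#include <cstddef>
#include <vector>

// Interleave 4x4: rows are consumed four at a time and their values zipped
// column by column, giving an output of shape [width * 4, ceil(height / 4)].
std::vector<float> interleave4x4(const std::vector<float> &in, int width, int height)
{
    const int          out_h = (height + 3) / 4;
    const int          out_w = width * 4;
    std::vector<float> out(static_cast<std::size_t>(out_w) * out_h, 0.f);
    for(int y = 0; y < height; ++y)
    {
        for(int x = 0; x < width; ++x)
        {
            const int out_row = y / 4;
            const int out_col = x * 4 + (y % 4); // a00 a10 a20 a30 | a01 a11 a21 a31 | ...
            out[static_cast<std::size_t>(out_row) * out_w + out_col] = in[static_cast<std::size_t>(y) * width + x];
        }
    }
    return out;
}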
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
deleted file mode 100644
index 66fdde5473..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-GCGEMMMatrixAccumulateBiasesKernel::GCGEMMMatrixAccumulateBiasesKernel()
- : _accum(nullptr), _biases(nullptr), _lws(gles::NDRange(1U, 1U, 1U))
-{
-}
-
-void GCGEMMMatrixAccumulateBiasesKernel::configure(IGCTensor *accum, const IGCTensor *biases)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
-
- _biases = biases;
- _accum = accum;
-
- std::set<std::string> build_opts;
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
-
- // Create kernel
- build_opts.emplace("#define GEMM_ACCUMULATE_BIASES");
-
-#define ACCUM_PROCESS_4X
-
-#if defined(ACCUM_PROCESS_4X)
- build_opts.emplace("#define ACCUM_PROCESS_4X");
-#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
- build_opts.emplace("#define ACCUM_PROCESS_8X");
-#endif /* ACCUM_PROCESS_4X */
- std::string dt_name = (accum->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- build_opts.emplace(("#define " + dt_name));
-
- _kernel = GCKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts);
-
- // Configure kernel window
- unsigned int num_elems_processed_per_iteration = 1;
-
- if(_accum->info()->data_type() == DataType::F32)
- {
- num_elems_processed_per_iteration = 16;
- }
- else if(_accum->info()->data_type() == DataType::F16)
- {
-#if defined(ACCUM_PROCESS_4X)
- num_elems_processed_per_iteration = 4;
-#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
- num_elems_processed_per_iteration = 8;
-#endif /* ACCUM_PROCESS_4X */
- }
-
- const int accum_width = accum->info()->dimension(0);
- const int accum_padding_right = ceil_to_multiple(accum_width, num_elems_processed_per_iteration * _lws[0]) - accum_width;
- BorderSize border = BorderSize(0, accum_padding_right, 0, 0);
-
- Window win = calculate_max_enlarged_window(*_accum->info(), Steps(num_elems_processed_per_iteration), border);
-
- AccessWindowStatic biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration * _lws[0]), biases->info()->dimension(1));
- AccessWindowStatic accum_access(_accum->info(), 0, 0, accum_width + accum_padding_right, _accum->info()->dimension(1));
-
- update_window_and_padding(win, biases_access, accum_access);
-
- IGCKernel::configure(win);
-}
-
-void GCGEMMMatrixAccumulateBiasesKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
-
- _kernel.use();
-
- Window accum_slice = window.first_slice_window_2D();
-
- Window biases_slice(accum_slice);
- biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- // Run kernel
- do
- {
- // Set arguments
- unsigned int idx = 0;
-
- add_2D_tensor_argument(idx, _accum, 1, accum_slice);
- add_1D_tensor_argument(idx, _biases, 2, biases_slice);
- _kernel.update_shader_params();
-
- enqueue(*this, accum_slice, _lws);
- }
- while(window.slide_window_slice_2D(accum_slice));
-}
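
The kernel broadcasts a 1D bias vector across every row of the accumulator; the ACCUM_PROCESS_4X/8X variants only change how many elements each invocation handles. The scalar equivalent (illustrative helper):

// Add bias[x] to every row of a row-major accum matrix of shape width x height.
void accumulate_biases(float *accum, const float *bias, int width, int height)
{
    for(int y = 0; y < height; ++y)
    {
        for(int x = 0; x < width; ++x)
        {
            accum[y * width + x] += bias[x];
        }
    }
}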
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
deleted file mode 100644
index daad70bba9..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-GCGEMMMatrixAdditionKernel::GCGEMMMatrixAdditionKernel()
- : _input(nullptr), _output(nullptr)
-{
-}
-
-void GCGEMMMatrixAdditionKernel::configure(const IGCTensor *input, IGCTensor *output, float beta)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
-
- _input = input;
- _output = output;
- const unsigned int num_elems_processed_per_iteration = max_gc_vector_width / data_size_from_type(input->info()->data_type());
-
- std::set<std::string> build_opts;
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- build_opts.emplace(("#define " + dt_name));
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
- build_opts.emplace("#define BETA " + float_to_string_with_full_precision(beta));
-
- // Create kernel
- build_opts.emplace("#define GEMM_MATRIXADDITION");
- std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type()));
- _kernel = GCKernelLibrary::get().create_kernel(("gemm_ma"), build_opts);
-
- // Configure kernel window
- Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region());
-
- IGCKernel::configure(win);
-}
-
-void GCGEMMMatrixAdditionKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
-
- _kernel.use();
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, 1, slice);
- add_2D_tensor_argument(idx, _output, 2, slice);
-
- _kernel.update_shader_params();
-
- enqueue(*this, slice);
- }
- while(window.slide_window_slice_2D(slice));
-}
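
gemm_ma performs the beta stage of D = alpha * A * B + beta * C: it adds beta times its input (the original C) to the multiply result in place. The scalar equivalent (illustrative):

// out += beta * c, element-wise over an m x n row-major matrix.
void gemm_ma(float *out, const float *c, float beta, int m, int n)
{
    for(int i = 0; i < m * n; ++i)
    {
        out[i] += beta * c[i];
    }
}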
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
deleted file mode 100644
index 2f69728b61..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
+++ /dev/null
@@ -1,338 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/AccessWindowTranspose.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
-{
- ARM_COMPUTE_UNUSED(reshape_info);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
-
- if(!is_interleaved_transposed)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0));
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
- }
- else
- {
- const int m = reshape_info.m();
- const int n = reshape_info.n();
- const int k = reshape_info.k();
- const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
- const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
-
- TensorShape tensor_shape0{ input0->tensor_shape() };
- tensor_shape0.set(0, k);
- tensor_shape0.set(1, m);
-
- TensorShape tensor_shape1{ input1->tensor_shape() };
- tensor_shape1.set(0, n);
- tensor_shape1.set(1, k);
-
- const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
- const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);
-
- const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
- const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast<size_t>(n));
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast<size_t>(m));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
- }
- }
-
- return Status{};
-}
-
-inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
- GPUTarget gpu_target, ElementsProcessed &num_elements_processed)
-{
- ARM_COMPUTE_UNUSED(gpu_target);
-
- // Output tensor auto initialization if not yet initialized
- TensorShape tensor_shape{ input0->tensor_shape() };
- tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->dimension(0));
- tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->dimension(1));
-
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(tensor_shape));
-
- bool window_changed = false;
- Window win{};
-
- const DataType data_type = input0->data_type();
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
-
- if(is_interleaved_transposed)
- {
- // Configure kernel window
- num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type);
- num_elems_processed_per_iteration_y = 4;
-
- win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f);
- AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
- AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
- else // The input tensors have not been reshaped
- {
- // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor.
- num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
-
- switch(data_type)
- {
- case DataType::F16:
- num_elems_processed_per_iteration_x = 4;
- break;
-
- case DataType::F32:
- num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type);
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is not supported");
- break;
- }
-
- win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y));
- AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
- AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-
- update_window_and_padding(win, input0_access, input1_access, output_access);
-
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape()));
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-GCGEMMMatrixMultiplyKernel::GCGEMMMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr)
-{
-}
-
-void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- // Perform validate step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
-
- // Get target architecture
- GPUTarget gpu_target = get_target();
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, gpu_target, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- IGCKernel::configure(win_config.second);
-
- // Create build options
- std::set<std::string> build_opts;
- std::string kernel_name;
-
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
- build_opts.emplace("#define COLS_A " + support::cpp11::to_string(input0->info()->dimension(0)));
- build_opts.emplace("#define COLS_B " + support::cpp11::to_string(input1->info()->dimension(0)));
- build_opts.emplace("#define ALPHA " + float_to_string_with_full_precision(alpha));
-
- // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
- if(is_interleaved_transposed)
- {
- const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
- const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
-
- build_opts.emplace("#define MULT_TRANSPOSE1XW_WIDTH " + support::cpp11::to_string(mult_transpose1xW_width));
- build_opts.emplace("#define MULT_INTERLEAVE4X4_HEIGHT " + support::cpp11::to_string(mult_interleave4x4_height));
-
- switch(input0->info()->data_type())
- {
- case DataType::F16:
- build_opts.emplace("#define DATA_TYPE_FP16");
- break;
-
- case DataType::F32:
- build_opts.emplace("#define DATA_TYPE_FP32");
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is not supported");
- break;
- }
-
- build_opts.emplace("#define GEMM_MM_INTERLEAVED_TRANSPOSED");
-
- kernel_name = "gemm_mm_interleaved_transposed";
- }
- else
- {
- // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor
-
- GPUTarget arch_target = get_arch_from_target(gpu_target);
- switch(input0->info()->data_type())
- {
- case DataType::F16:
- build_opts.emplace("#define DATA_TYPE_FP16");
- build_opts.emplace("#define MM_PROCESS_4X_OPTIMIZED");
- build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
- break;
-
- case DataType::F32:
- build_opts.emplace("#define DATA_TYPE_FP32");
-
- if(arch_target == GPUTarget::BIFROST && input0->info()->num_dimensions() != 1)
- {
- build_opts.emplace("#define GEMM_MM_FLOATING_POINT_BIFROST");
- }
- else
- {
- build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
- }
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is not supported");
- break;
- }
-
- build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elements_processed.x()));
- build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elements_processed.y()));
-
- kernel_name = "gemm_mm_floating_point";
- }
-
- // Create kernel
- _kernel = GCKernelLibrary::get().create_kernel(kernel_name, build_opts);
-}
-
-Status GCGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed,
- const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target)
-{
- ARM_COMPUTE_UNUSED(alpha);
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- output->clone().get(),
- is_interleaved_transposed,
- reshape_info,
- gpu_target,
- num_elements_processed)
- .first);
- return Status{};
-}
-
-void GCGEMMMatrixMultiplyKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
-
- _kernel.use();
-
- Window slice = window.first_slice_window_2D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(_input1->info()->num_dimensions() < 3)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
-
- add_2D_tensor_argument(idx, _input0, 1, slice);
- add_2D_tensor_argument(idx, _input1, 2, slice_b);
- add_2D_tensor_argument(idx, _output, 3, slice);
- _kernel.update_shader_params();
- enqueue(*this, slice);
- }
- while(window.slide_window_slice_2D(slice));
-}
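
Both shader variants above ultimately compute D = alpha * A * B, with the interleaved/transposed path reading the reshaped operands produced by the interleave and transpose1xW kernels. A naive scalar reference of the computation, ignoring reshaping and vectorisation (a sketch, not the shader's access pattern):

// Naive reference for out = alpha * a(m x k) * b(k x n), all row-major.
void gemm_mm(const float *a, const float *b, float *out, float alpha, int m, int n, int k)
{
    for(int i = 0; i < m; ++i)
    {
        for(int j = 0; j < n; ++j)
        {
            float acc = 0.f;
            for(int p = 0; p < k; ++p)
            {
                acc += a[i * k + p] * b[p * n + j];
            }
            out[i * n + j] = alpha * acc;
        }
    }
}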
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
deleted file mode 100644
index 1d6ef3d0e8..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/AccessWindowTranspose.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cmath>
-
-using namespace arm_compute;
-
-void GCGEMMTranspose1xWKernel::configure(const IGCTensor *input, IGCTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
- TensorShape output_shape{ input->info()->tensor_shape() };
- const size_t transpose_w = 16 / input->info()->element_size();
- output_shape.set(0, input->info()->dimension(1) * transpose_w);
- output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
- const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
- const int scale_x = num_elems_processed_per_iteration;
-
- _input = input;
- _output = output;
-
- std::set<std::string> build_opts;
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- build_opts.emplace(("#define " + dt_name));
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
- /*
- * The following is an example of how the transpose1xW works when the input data type is F32
- *
- * |a00 a01 a02 a03|
- * |a10 a11 a12 a13|
- * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
- * |a30 a31 a32 a33|
- *
- * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
- */
- // Create kernel
- build_opts.emplace("#define GEMM_TRANSPOSE1xW");
- _kernel = GCKernelLibrary::get().create_kernel("gemm_transpose1x4", build_opts);
-
- // Configure window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
- ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
-
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
-
- update_window_and_padding(win, input_access, output_access);
-
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- IGCKernel::configure(win);
-}
-
-void GCGEMMTranspose1xWKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
-
- _kernel.use();
-
- // Output is transposed
- Window out_window(window);
- out_window.set(Window::DimX, window.y());
- out_window.set(Window::DimY, window.x());
-
- Window in_slice = window.first_slice_window_2D();
- Window out_slice = out_window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input, 1, in_slice);
- add_2D_tensor_argument(idx, _output, 2, out_slice);
-
- _kernel.update_shader_params();
-
- enqueue(*this, in_slice);
- }
- while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
-}
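
A scalar reference of the transpose1xW layout transform sketched in the comment above, with W = 16 / element_size as computed in configure() (illustrative only):

#include <cstddef>
#include <vector>

// Transpose1xW: each W-wide chunk of a row becomes one cell of an output row,
// so the output has shape [height * W, ceil(width / W)].
std::vector<float> transpose1xW(const std::vector<float> &in, int width, int height, int W)
{
    const int          out_w = height * W;
    const int          out_h = (width + W - 1) / W;
    std::vector<float> out(static_cast<std::size_t>(out_w) * out_h, 0.f);
    for(int y = 0; y < height; ++y)
    {
        for(int x = 0; x < width; ++x)
        {
            const int out_row = x / W;           // which W-wide chunk of the input row
            const int out_col = y * W + (x % W); // chunks of successive rows packed side by side
            out[static_cast<std::size_t>(out_row) * out_w + out_col] = in[static_cast<std::size_t>(y) * width + x];
        }
    }
    return out;
}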
diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
deleted file mode 100644
index c12dd38cb4..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cmath>
-#include <tuple>
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-GCIm2ColKernel::GCIm2ColKernel()
- : _input(nullptr), _output(nullptr), _convolved_dims(), _kernel_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr)
-{
-}
-
-void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
- _input = input;
- _output = output;
-
- // Create kernel
- std::set<std::string> build_opts;
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
- build_opts.insert("#define " + dt_name);
-
- if(has_bias)
- {
- build_opts.emplace("#define HAS_BIAS");
- }
-
- int stride_x = 0;
- int stride_y = 0;
-
- std::tie(stride_x, stride_y) = conv_info.stride();
- _kernel_dims = std::make_pair(kernel_dims.width, kernel_dims.height);
-
- const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
- && (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1))
- && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding())
- && (dilation == Size2D(1U, 1U));
-
- std::string kernel_name = "im2col_generic";
- if(!run_img2col_reduced)
- {
- if(input->info()->data_type() == DataType::F16 && _kernel_dims == std::pair<unsigned int, unsigned int>(1, 1))
- {
- build_opts.emplace("#define KERNEL_1x1");
- }
-
- build_opts.emplace("#define IM2COL_GENERIC");
- _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
- kernel_dims.width, kernel_dims.height,
- conv_info, dilation);
- _num_elems_processed_per_iteration = (input->info()->data_type() == DataType::F32) ? 1 : 2;
-
- build_opts.emplace("#define KERNEL_WIDTH " + support::cpp11::to_string(kernel_dims.width));
- build_opts.emplace("#define KERNEL_HEIGHT " + support::cpp11::to_string(kernel_dims.height));
- build_opts.emplace("#define KERNEL_DEPTH " + support::cpp11::to_string(input->info()->dimension(2)));
- build_opts.emplace("#define CONVOLVED_WIDTH " + support::cpp11::to_string(_convolved_dims.first));
- build_opts.emplace("#define CONVOLVED_HEIGHT " + support::cpp11::to_string(_convolved_dims.second));
- build_opts.emplace("#define STRIDE_X " + support::cpp11::to_string(conv_info.stride().first));
- build_opts.emplace("#define STRIDE_Y " + support::cpp11::to_string(conv_info.stride().second));
- build_opts.emplace("#define PAD_LEFT " + support::cpp11::to_string(conv_info.pad_left()));
- build_opts.emplace("#define PAD_TOP " + support::cpp11::to_string(conv_info.pad_top()));
- build_opts.emplace("#define PAD_RIGHT " + support::cpp11::to_string(conv_info.pad_right()));
- build_opts.emplace("#define PAD_BOTTOM " + support::cpp11::to_string(conv_info.pad_bottom()));
- build_opts.emplace("#define SRC_WIDTH " + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.emplace("#define SRC_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1)));
- build_opts.emplace("#define DILATION_X " + support::cpp11::to_string(dilation.x()));
- build_opts.emplace("#define DILATION_Y " + support::cpp11::to_string(dilation.y()));
-
- _run_func = &GCIm2ColKernel::run_generic;
- }
- else
- {
- build_opts.emplace("#define IM2COL_REDUCED");
- kernel_name = "im2col_reduced";
-
- if(input->info()->data_type() == DataType::F32)
- {
- _num_elems_processed_per_iteration = 4 / input->info()->element_size();
- }
- else if(input->info()->data_type() == DataType::F16)
- {
- int input_width = input->info()->dimension(0);
- int input_height = input->info()->dimension(1);
-
- build_opts.emplace("#define IMAGE_SIZE " + support::cpp11::to_string(input_width * input_height));
- if(input_width % 8 == 0)
- {
- _num_elems_processed_per_iteration = 8;
- build_opts.emplace("#define IM2COL_REDUCED_8X");
- }
- else if(input_width % 4 == 0)
- {
- _num_elems_processed_per_iteration = 4;
- build_opts.emplace("#define IM2COL_REDUCED_4X");
- }
- else if(input_width % 2 == 0)
- {
- _num_elems_processed_per_iteration = 2;
- build_opts.emplace("#define IM2COL_REDUCED_2X");
- }
- else
- {
- _num_elems_processed_per_iteration = 2;
- build_opts.emplace("#define IM2COL_REDUCED_GENERIC");
- }
- }
-
- _run_func = &GCIm2ColKernel::run_reduced;
- }
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name, build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
-
- if(input->info()->data_type() == DataType::F16)
- {
- // Calculate input right and bottom border
- const int input_width = input->info()->dimension(0);
- const int input_height = input->info()->dimension(1);
- int input_total_width = input->info()->padding().left + input_width + input->info()->padding().right;
- int input_padding_right = ceil_to_multiple(input_total_width, _num_elems_processed_per_iteration) - input_total_width;
- input_total_width = input_width + input_padding_right + input->info()->padding().right;
- AccessWindowStatic input_access(input->info(), 0, 0, input_total_width, input_height);
-
- // Calculate output right and bottom border
- const int output_width = output->info()->dimension(0);
- const int output_height = output->info()->dimension(1);
- const int output_padding_right = ceil_to_multiple(output_width, _num_elems_processed_per_iteration) - output_width;
- AccessWindowStatic output_access(output->info(), 0, 0, output_width + output_padding_right, output_height);
-
- update_window_and_padding(win, input_access, output_access);
- }
-
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- if(!run_img2col_reduced)
- {
- // Set the Z dimension's step to the full dimension size so that the window cannot be split across Z
- win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
- }
-
- IGCKernel::configure(win);
-}
-
-Status GCIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
-{
- ARM_COMPUTE_UNUSED(kernel_dims);
- ARM_COMPUTE_UNUSED(conv_info);
- ARM_COMPUTE_UNUSED(has_bias);
- ARM_COMPUTE_UNUSED(dilation);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
- return Status{};
-}
-
-void GCIm2ColKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON(_run_func == nullptr);
- (this->*_run_func)(window);
-}
-
-void GCIm2ColKernel::run_generic(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
-
- // Get initial windows
- Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
-
- // Change the Z dimension's step back to 1
- window_collapsed.set_dimension_step(Window::DimZ, 1);
-
- Window slice = window_collapsed.first_slice_window_3D();
- Window slice_in = window_collapsed.first_slice_window_3D();
- Window slice_out = window_collapsed.first_slice_window_3D();
-
- // Setup slice
- slice.set(Window::DimX, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1));
- slice.set(Window::DimY, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
-
- // Setup output slice
- slice_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _num_elems_processed_per_iteration));
- slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
- slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
-
- // We need the top/left pad to be included in the valid region
- if(_input->info()->data_type() == DataType::F16)
- {
- (dynamic_cast<TensorInfo *>(_input->info()))->init(_input->info()->tensor_shape(), _input->info()->num_channels(), _input->info()->data_type(), _input->info()->strides_in_bytes(), 0,
- _input->info()->total_size());
- }
-
- _kernel.use();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, 1, slice_in);
- add_2D_tensor_argument(idx, _output, 2, slice_out);
- _kernel.set_argument(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
- _kernel.set_argument(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
- _kernel.update_shader_params();
-
- enqueue(*this, slice);
- }
- while(window_collapsed.slide_window_slice_3D(slice) && window_collapsed.slide_window_slice_3D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
-}
-
-void GCIm2ColKernel::run_reduced(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
-
- Window out_window;
- out_window.use_tensor_dimensions(_output->info()->tensor_shape());
-
- Window out_slice = out_window.first_slice_window_1D();
- Window in_slice = window.first_slice_window_3D();
-
- _kernel.use();
-
- // Run kernel
- do
- {
- // Set arguments
- unsigned int idx = 0;
-
- add_3D_tensor_argument(idx, _input, 1, in_slice);
- add_1D_tensor_argument(idx, _output, 2, out_slice);
- _kernel.set_argument(idx++, _input->info()->dimension(0));
- _kernel.set_argument(idx++, _input->info()->dimension(1));
- _kernel.update_shader_params();
-
- enqueue(*this, in_slice);
- }
- while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice));
-}
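
im2col_generic unrolls each convolution window so that the convolution becomes a matrix multiply. A compact single-channel CPU reference with zero padding, uniform stride and no dilation (a simplification of the kernel above, for illustration):

#include <cstddef>
#include <vector>

// Each output window contributes kw * kh consecutive values; windows are
// emitted left-to-right, top-to-bottom, with zeros outside the image.
std::vector<float> im2col(const std::vector<float> &src, int w, int h, int kw, int kh, int stride, int pad)
{
    const int conv_w = (w + 2 * pad - kw) / stride + 1;
    const int conv_h = (h + 2 * pad - kh) / stride + 1;

    std::vector<float> col(static_cast<std::size_t>(conv_w) * conv_h * kw * kh, 0.f);
    std::size_t        idx = 0;
    for(int oy = 0; oy < conv_h; ++oy)
    {
        for(int ox = 0; ox < conv_w; ++ox)
        {
            for(int ky = 0; ky < kh; ++ky)
            {
                for(int kx = 0; kx < kw; ++kx)
                {
                    const int ix = ox * stride - pad + kx;
                    const int iy = oy * stride - pad + ky;
                    if(ix >= 0 && ix < w && iy >= 0 && iy < h)
                    {
                        col[idx] = src[static_cast<std::size_t>(iy) * w + ix];
                    }
                    ++idx;
                }
            }
        }
    }
    return col;
}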
diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
deleted file mode 100644
index c29d9fc4d5..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h"
-
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <string>
-
-using namespace arm_compute;
-
-GCNormalizationLayerKernel::GCNormalizationLayerKernel()
- : _input(nullptr), _squared_input(nullptr), _output(nullptr), _border_size(0)
-{
-}
-
-BorderSize GCNormalizationLayerKernel::border_size() const
-{
- return _border_size;
-}
-
-void GCNormalizationLayerKernel::configure(const IGCTensor *input, const IGCTensor *squared_input, IGCTensor *output, NormalizationLayerInfo norm_info)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
- ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented");
-
- // Set build options
- std::set<std::string> build_opts;
-
- _input = input;
- _squared_input = squared_input;
- _output = output;
-
- const bool is_in_map = norm_info.is_in_map();
- const unsigned int border_width = is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0;
- _border_size = BorderSize(0, border_width);
-
- // Set kernel static arguments
- std::string func_name = ((norm_info.type() == NormType::IN_MAP_1D) ? "IN_MAP_1D" : "CROSS_MAP");
- build_opts.emplace(("#define " + func_name));
- build_opts.emplace(("#define COEFF " + float_to_string_with_full_precision(norm_info.scale_coeff())));
- build_opts.emplace(("#define BETA " + float_to_string_with_full_precision(norm_info.beta())));
- build_opts.emplace(("#define KAPPA " + float_to_string_with_full_precision(norm_info.kappa())));
- build_opts.emplace(("#define RADIUS " + support::cpp11::to_string(norm_info.norm_size() / 2)));
- build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)));
- build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)));
- build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)));
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("normalization_layer", build_opts));
-
- // Configure kernel window
- const unsigned int num_elems_processed_per_iteration = 1;
- const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2);
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input->info(), -_border_size.left, num_elems_read_per_iteration);
- AccessWindowHorizontal squared_input_access(squared_input->info(), -_border_size.left, num_elems_read_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, squared_input_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region());
-
- IGCKernel::configure(win);
-}
-
-void GCNormalizationLayerKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- _kernel.use();
-
- Window slice = window.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- unsigned int binding = 1;
- add_3D_tensor_argument(idx, _input, binding++, slice);
- add_3D_tensor_argument(idx, _squared_input, binding++, slice);
- add_3D_tensor_argument(idx, _output, binding++, slice);
-
- _kernel.update_shader_params();
-
- enqueue(*this, slice);
- }
- while(window.slide_window_slice_3D(slice));
-}
diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
deleted file mode 100644
index 971b540a83..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h"
-
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors");
-
- const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0));
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *mean, ITensorInfo *std)
-{
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, *input->clone());
-
- const unsigned int num_elems_processed_per_iteration = 4;
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
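- // mean and std are loaded in vectors of 4 elements, so pad them up to the next multiple of 4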
- const int mean_padding = ceil_to_multiple(mean->dimension(0), num_elems_processed_per_iteration) - mean->dimension(0);
- const int std_padding = ceil_to_multiple(std->dimension(0), num_elems_processed_per_iteration) - std->dimension(0);
- AccessWindowStatic mean_access(mean, 0, 0, mean->dimension(0) + mean_padding, mean->dimension(1));
- AccessWindowStatic std_access(std, 0, 0, std->dimension(0) + std_padding, std->dimension(1));
-
- const bool window_changed = update_window_and_padding(win, input_access, output_access, mean_access, std_access);
- output_access.set_valid_region(win, input->valid_region());
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-} // namespace
-
-GCNormalizePlanarYUVLayerKernel::GCNormalizePlanarYUVLayerKernel()
- : _input(nullptr), _output(nullptr), _mean(nullptr), _std(nullptr)
-{
-}
-
-void GCNormalizePlanarYUVLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *std)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), *input->info()->clone());
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), mean->info(), std->info()));
-
- _input = input;
- _output = output;
- _mean = mean;
- _std = std;
-
- // Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)));
- build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)));
- build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)));
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("normalize_planar_yuv_layer", build_opts));
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), mean->info(), std->info());
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- IGCKernel::configure(std::get<1>(win_config));
-}
-
-Status GCNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), mean->clone().get(), std->clone().get())));
- return Status{};
-}
-
-void GCNormalizePlanarYUVLayerKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- _kernel.use();
-
- _output->set_needs_shifting(true);
-
- Window slice = window.first_slice_window_3D();
-
- Window slice_in;
- slice_in = window.first_slice_window_1D();
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-
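- // mean and std do not slide with the 3D input slices, so bind them once before the loop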
- unsigned int idx = 2 * num_arguments_per_3D_tensor();
- add_1D_tensor_argument(idx, _mean, 3, slice_in);
- add_1D_tensor_argument(idx, _std, 4, slice_in);
-
- slice_in = window.first_slice_window_3D();
-
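- // Shift the output window into the left padding; the tensor shift pass realigns it later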
- slice.shift(Window::DimX, -(_output->info()->padding()).left);
-
- do
- {
- idx = 0;
- add_3D_tensor_argument(idx, _input, 1, slice_in);
- add_3D_tensor_argument(idx, _output, 2, slice);
-
- _kernel.update_shader_params();
-
- enqueue(*this, slice);
- }
- while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
-}
diff --git a/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
deleted file mode 100644
index 76559146ae..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <cmath>
-#include <cstdlib>
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-GCPixelWiseMultiplicationKernel::GCPixelWiseMultiplicationKernel()
- : _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void GCPixelWiseMultiplicationKernel::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
- ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
-
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
- set_format_if_unknown(*output->info(), Format::F32);
- }
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output);
-
- _input1 = input1;
- _input2 = input2;
- _output = output;
-
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
-
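- // SCALE is baked into the shader as a compile-time constant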
- build_opts.emplace("#define SCALE " + support::cpp11::to_string(scale));
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("pixelwise_mul_float", build_opts));
-
- // Configure kernel window
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
- Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input1_access, input2_access, output_access);
-
- ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
- input2->info()->valid_region());
- output_access.set_valid_region(win, valid_region);
-
- IGCKernel::configure(win);
-}
-
-void GCPixelWiseMultiplicationKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
-
- _kernel.use();
-
- Window slice = window.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- unsigned int binding = 1;
- add_3D_tensor_argument(idx, _input1, binding++, slice);
- add_3D_tensor_argument(idx, _input2, binding++, slice);
- add_3D_tensor_argument(idx, _output, binding++, slice);
-
- _kernel.update_shader_params();
- enqueue(*this, slice);
- }
- while(window.slide_window_slice_3D(slice));
-}
diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
deleted file mode 100644
index 13efd10532..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
-
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-#include <tuple>
-
-using namespace arm_compute;
-
-namespace
-{
-// Internal window config info
-using GCPoolingConfig = std::pair<unsigned int, BorderSize>; // num_elems_processed_per_iteration, border_size
-
-void auto_init(const ITensorInfo *input, ITensorInfo *output, unsigned int pooled_w, unsigned int pooled_h)
-{
- TensorShape output_shape{ input->tensor_shape() };
- output_shape.set(0, pooled_w);
- output_shape.set(1, pooled_h);
-
- auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices, "Indices not supported in GLES backend");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type == PoolingType::L2),
- "Unsupported combination of parameters!");
- ARM_COMPUTE_RETURN_ERROR_ON(!pool_info.pad_stride_info.padding_is_symmetric());
-
- const bool is_global_pooling = pool_info.is_global_pooling;
- const unsigned int pool_size = is_global_pooling ? input->tensor_shape().x() : pool_info.pool_size.width;
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_global_pooling && (input->tensor_shape().x() != input->tensor_shape().y()),
- "Global pooling is supported only with rectangular inputs!");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!is_global_pooling && ((pool_info.pad_stride_info.pad().first >= pool_size) || (pool_info.pad_stride_info.pad().second >= pool_size)),
- "Invalid pool size and pool pad combination!");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_size.width != pool_info.pool_size.height, "Invalid pool size: width and height must be equal!");
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- unsigned int pooled_w = 0;
- unsigned int pooled_h = 0;
- std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
- input->dimension(1),
- pool_size,
- pool_size,
- pool_info.pad_stride_info);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != pooled_w) || (output->dimension(1) != pooled_h),
- "Invalid output pooling dimensions!");
- }
-
- return Status{};
-}
-
-std::tuple<Status, Window, GCPoolingConfig> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PoolingLayerInfo &pool_info)
-{
- int pool_pad_x = 0;
- int pool_pad_y = 0;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- unsigned int pooled_w = 0;
- unsigned int pooled_h = 0;
- int pool_size = pool_info.pool_size.width;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
- std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Update pool size in case of global pooling
- pool_size = pool_info.is_global_pooling ? input->dimension(0) : pool_size;
-
- // Check output dimensions
- std::tie(pooled_w, pooled_h) = scaled_dimensions(input->dimension(0),
- input->dimension(1),
- pool_size,
- pool_size,
- pad_stride_info);
-
- auto_init(input, output, pooled_w, pooled_h);
-
- BorderSize border_size = BorderSize(pool_pad_y, pool_pad_x);
-
- const int input_width = input->dimension(0);
- const int input_height = input->dimension(1);
-
- unsigned int num_elems_processed_per_iteration = 1;
-
- // Create kernel
- if(pool_size == 3)
- {
- // Check if we have pool3x3 with stride_x less than or equal to 3. In these cases, run an optimized OpenGLES kernel where
- // each thread computes 4 output elements
- const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3);
-
- int num_elems_read_per_iteration = pool_size;
-
- if(input->data_type() == DataType::F32)
- {
- if(is_pool3x3_stride_le3)
- {
- // Change the number of elements processed and number of elements read per iteration for pooling 3x3 with stride less than or equal to 3
- num_elems_processed_per_iteration = 4;
- num_elems_read_per_iteration = pool_size * (pool_stride_x + 1);
- }
- }
- else
- {
- if(is_pool3x3_stride_le3)
- {
- num_elems_processed_per_iteration = 4;
- }
- else
- {
- num_elems_processed_per_iteration = 2;
- }
- }
-
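- // The rightmost/bottom pooling windows can read past the input edge; grow the border so
- // those reads stay inside the allocated padding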
- const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
-
- border_size.right = std::max(upper_bound_w, pool_pad_x);
- border_size.bottom = std::max(upper_bound_h, pool_pad_y);
- }
- else // Run general case
- {
- if(input->data_type() == DataType::F32)
- {
- num_elems_processed_per_iteration = 1;
- }
- else
- {
- num_elems_processed_per_iteration = 2;
- }
-
- const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
- const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
-
- border_size.right = std::max(upper_bound_w, pool_pad_x);
- border_size.bottom = std::max(upper_bound_h, pool_pad_y);
- }
- // Configure kernel window
- Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
- if(input->data_type() == DataType::F32)
- {
- AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + border_size.right, input_height + border_size.bottom);
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_tuple(err, win, GCPoolingConfig(num_elems_processed_per_iteration, border_size));
- }
- else
- {
- // Calculate output right and bottom border
- const int output_width = output->dimension(0);
- const int output_height = output->dimension(1);
- const int output_padding_right = ceil_to_multiple(output_width, num_elems_processed_per_iteration) - output_width;
- const int output_padding_bottom = ceil_to_multiple(output_height, 1) - output_height;
-
- const int input_total_width = std::max(int(input->padding().left), int(pool_pad_x)) + input_width + std::max(int(input->padding().right), int(pool_pad_x));
- const int input_padding_right = ceil_to_multiple(input_total_width, num_elems_processed_per_iteration) - input_width - pool_pad_x;
- const int input_total_height = std::max(int(input->padding().top), int(pool_pad_y)) + input_height + std::max(int(input->padding().bottom), int(pool_pad_y));
- const int input_padding_bottom = input_total_height - input_height - pool_pad_y;
-
- // Configure kernel window
- AccessWindowStatic input_access(input, -pool_pad_x, -pool_pad_y, input_width + input_padding_right, input_height + input_padding_bottom);
- AccessWindowStatic output_access(output, 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
- bool window_changed = update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_tuple(err, win, GCPoolingConfig(num_elems_processed_per_iteration, border_size));
- }
-}
-} // namespace
-
-GCPoolingLayerKernel::GCPoolingLayerKernel()
- : _input(nullptr), _output(nullptr), _indices(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1)
-{
-}
-
-BorderSize GCPoolingLayerKernel::border_size() const
-{
- return _border_size;
-}
-
-void GCPoolingLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info, IGCTensor *indices)
-{
- int pool_pad_x = 0;
- int pool_pad_y = 0;
- int pool_stride_x = 0;
- int pool_stride_y = 0;
- unsigned int pooled_w = 0;
- unsigned int pooled_h = 0;
- const PoolingType pool_type = pool_info.pool_type;
- int pool_size = pool_info.pool_size.width;
- const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
- const bool exclude_padding = pool_info.exclude_padding;
- std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
- std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Update pool size in case of global pooling
- pool_size = pool_info.is_global_pooling ? input->info()->dimension(0) : pool_size;
-
- // Check output dimensions
- std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
- input->info()->dimension(1),
- pool_size,
- pool_size,
- pad_stride_info);
-
- auto_init(input->info(), output->info(), pooled_w, pooled_h);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr));
-
- // Set instance variables
- _input = input;
- _output = output;
- _pool_info = pool_info;
- _indices = indices;
- // Set build options
- std::set<std::string> build_opts;
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
- if(input->info()->data_type() == DataType::F32)
- {
- build_opts.insert("#define DATA_TYPE_FP32");
- }
- else
- {
- build_opts.insert("#define DATA_TYPE_FP16");
- }
- if(exclude_padding)
- {
- build_opts.emplace("#define EXCLUDE_PADDING");
- }
- build_opts.emplace(("#define POOL_" + string_from_pooling_type(pool_type)));
- build_opts.emplace(("#define STRIDE_X " + support::cpp11::to_string(pool_stride_x)));
- build_opts.emplace(("#define MAX_WIDTH " + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x))));
- build_opts.emplace(("#define MAX_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y))));
- build_opts.emplace(("#define STRIDE_Y " + support::cpp11::to_string(pool_stride_y)));
- build_opts.emplace(("#define PAD_X " + support::cpp11::to_string(pool_pad_x)));
- build_opts.emplace(("#define PAD_Y " + support::cpp11::to_string(pool_pad_y)));
-
- // Create kernel
- if((pool_size == 2) || (pool_size == 3) || (pool_size == 7))
- {
- // Check if we have pool3x3 with stride_x less than or equal to 3. In these cases, run an optimized OpenGLES kernel where
- // each thread computes 4 output elements
- const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3);
-
- std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size);
- if(is_pool3x3_stride_le3)
- {
- build_opts.insert("#define POOLING_LAYER_3_OPTIMIZED");
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name + "_optimized", build_opts));
- }
- else
- {
- build_opts.insert("#define POOLING_LAYER_" + support::cpp11::to_string(pool_size));
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name, build_opts));
- }
- }
- else // Run general case
- {
- build_opts.emplace(("#define POOL_SIZE " + support::cpp11::to_string(pool_size)));
-
- build_opts.insert("#define POOLING_LAYER_N");
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("pooling_layer_n", build_opts));
- }
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info(), pool_info);
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- IGCKernel::configure(std::get<1>(win_config));
- GCPoolingConfig pooling_config = std::get<2>(win_config);
- _num_elems_processed_per_iteration = pooling_config.first;
- _border_size = pooling_config.second;
-}
-
-Status GCPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices));
- ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get(), pool_info)));
-
- return Status{};
-}
-
-void GCPoolingLayerKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- unsigned int pool_pad_x;
- unsigned int pool_pad_y;
- unsigned int pool_stride_x;
- unsigned int pool_stride_y;
- std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info.pad();
- std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
-
- _kernel.use();
-
- _output->set_needs_shifting(true);
-
- Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
-
- Window slice = window_collapsed.first_slice_window_3D();
- Window slice_in_orig = window_collapsed.first_slice_window_3D();
-
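- // Write into the left-padded region; GCTensorShiftKernel realigns the output afterwards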
- slice.shift(Window::DimX, -(_output->info()->padding()).left);
-
- do
- {
- // Map the output slice onto the input coordinate space using the pooling stride and pad
- Window in_slice(slice_in_orig); // NOLINT
- in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x * _num_elems_processed_per_iteration));
- in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
-
- // Set inputs
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, 1, in_slice);
- add_3D_tensor_argument(idx, _output, 2, slice);
-
- _kernel.update_shader_params();
- enqueue(*this, slice);
- }
- while(window_collapsed.slide_window_slice_3D(slice) && window_collapsed.slide_window_slice_3D(slice_in_orig));
-}
diff --git a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
deleted file mode 100644
index a0795c668f..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-BorderSize GCScaleKernel::border_size() const
-{
- return BorderSize(1);
-}
-
-void GCScaleKernel::configure(const IGCTensor *input, IGCTensor *output, const ScaleKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON(output == input);
- ARM_COMPUTE_ERROR_ON(info.interpolation_policy != InterpolationPolicy::NEAREST_NEIGHBOR);
-
- _input = input;
- _output = output;
-
- // Compute the ratio between source width/height and destination width/height
- const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
- const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
-
- // Compute actual border size
- const bool border_undefined = info.border_mode == BorderMode::UNDEFINED;
- BorderSize border = border_undefined ? BorderSize(0) : border_size();
-
- // Area interpolation behaves as Nearest Neighbour in case of up-sampling
- auto interpolation_policy_to_use = info.interpolation_policy;
- if(interpolation_policy_to_use == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
- {
- interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR;
- }
- else
- {
- ARM_COMPUTE_ERROR_ON(interpolation_policy_to_use == InterpolationPolicy::AREA);
- }
-
- // Create kernel
- std::set<std::string> build_opts;
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
-
- build_opts.emplace("#define DATA_TYPE_FP16");
- build_opts.emplace("#define BORDER_SIZE " + support::cpp11::to_string(border.right));
- if(info.sampling_policy == SamplingPolicy::TOP_LEFT)
- {
- build_opts.emplace("#define SAMPLING_POLICY_TOP_LEFT");
- }
- else
- {
- build_opts.emplace("#define SAMPLING_POLICY_CENTER");
- }
-
- // Configure kernel window
- unsigned int num_elems_processed_per_iteration = 4;
- unsigned int input_width_alignment = 2;
-
- // Performance optimization for 2x nearest-neighbour upscaling with an undefined border
- if((fabs(wr - 0.5) < 1e-6) && (fabs(hr - 0.5) < 1e-6) && border_undefined)
- {
- num_elems_processed_per_iteration = 8;
- input_width_alignment = 4;
- build_opts.emplace("#define SCALE_NEAREST_8X");
- }
- else
- {
- build_opts.emplace("#define SCALE_NEAREST_GENERIC");
- }
-
- std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use); // NOLINT
- std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
- std::string kernel_name = "scale_" + interpolation_name;
- _kernel = GCKernelLibrary::get().create_kernel(kernel_name, build_opts);
-
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
- const ValidRegion &input_valid_region = input->info()->valid_region();
-
- const int total_width = border.left + input_valid_region.anchor[0] + input_valid_region.shape[0] + border.right;
- const int padding_right = ceil_to_multiple(total_width, input_width_alignment) - border.left - input_valid_region.anchor[0] - input_valid_region.shape[0];
-
- // Reads can occur within the valid region of the input
- AccessWindowStatic input_access(input->info(),
- input_valid_region.anchor[0] - border.left, input_valid_region.anchor[1] - border.top,
- input_valid_region.anchor[0] + input_valid_region.shape[0] + padding_right,
- input_valid_region.anchor[1] + input_valid_region.shape[1] + border.bottom);
-
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, calculate_valid_region_scale(*(input->info()),
- output->info()->tensor_shape(),
- info.interpolation_policy,
- info.sampling_policy,
- border_undefined));
-
- IGCKernel::configure(win);
-
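- // Pass the input dimensions and the width/height scaling ratios to the shader as fixed arguments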
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the tensor parameters
- _kernel.set_argument<float>(idx++, static_cast<float>(input->info()->dimension(0)));
- _kernel.set_argument<float>(idx++, static_cast<float>(input->info()->dimension(1)));
- _kernel.set_argument<float>(idx++, wr);
- _kernel.set_argument<float>(idx++, hr);
-}
-
-void GCScaleKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- _kernel.use();
-
- _output->set_needs_shifting(true);
-
- Window slice = window.first_slice_window_3D();
- Window slice_in = window.first_slice_window_3D();
-
- slice.shift(Window::DimX, -(_output->info()->padding()).left);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, 1, slice_in);
- add_3D_tensor_argument(idx, _output, 2, slice);
- _kernel.update_shader_params();
- enqueue(*this, slice);
- }
- while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_in));
-}
diff --git a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
deleted file mode 100644
index 39d586da72..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h"
-
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-void GCLogits1DMaxKernel::configure(const IGCTensor *input, IGCTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
- // Softmax across the x dimension
- TensorShape output_shape{ input->info()->tensor_shape() };
- output_shape.set(0, 1);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-
- _input = input;
- _output = output;
-
- // Set build options
- std::set<std::string> build_opts;
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- build_opts.insert("#define " + dt_name);
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
- build_opts.insert("#define SOFTMAX_LAYER_MAX");
-
- // Tell the kernel that the width is not a multiple of 8
- if((input->info()->dimension(0) % 8) != 0)
- {
- build_opts.insert("#define NON_MULTIPLE_OF_8");
- }
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("softmax_layer_max", build_opts));
-
- // Set fixed arguments
- unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the input and output tensor parameters
- _kernel.set_argument(idx++, input->info()->dimension(0));
-
- // Configure kernel window
- // The kernel loops over all elements in steps of 8
- const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 8);
- unsigned int num_elems_written_per_iteration = 1;
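- // FP16 outputs are written two at a time (the shader packs two half values per 32-bit word)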
- if(input->info()->data_type() == DataType::F16)
- {
- num_elems_written_per_iteration = 2;
- }
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_access, output_access);
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- IGCKernel::configure(win);
-}
-
-GCLogits1DShiftExpSumKernel::GCLogits1DShiftExpSumKernel()
- : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
-{
-}
-
-void GCLogits1DShiftExpSumKernel::configure(const IGCTensor *input, const IGCTensor *max, IGCTensor *output, IGCTensor *sum)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type());
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum);
-
- _input = input;
- _max = max;
- _output = output;
- _sum = sum;
-
- // Set build options
- std::set<std::string> build_opts;
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- build_opts.insert("#define " + dt_name);
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
- build_opts.insert("#define SOFTMAX_LAYER_SHIFT_EXP_SUM");
-
- // Tell the kernel that the width is not a multiple of 8
- if((input->info()->dimension(0) % 8) != 0)
- {
- build_opts.insert("#define NON_MULTIPLE_OF_8");
- }
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
-
- // Set fixed arguments
- unsigned int idx = 4 * num_arguments_per_3D_tensor(); // Skip the four tensor parameters (input, max, output, sum)
- _kernel.set_argument(idx++, input->info()->dimension(0));
-
- // Configure window
- // The kernel loops over all elements in steps of 8
- const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 8);
- unsigned int num_elems_written_per_iteration = 1;
- if(input->info()->data_type() == DataType::F16)
- {
- num_elems_written_per_iteration = 2;
- }
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal max_access(max->info(), 0, num_elems_written_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal sum_access(sum->info(), 0, num_elems_written_per_iteration);
-
- update_window_and_padding(win, input_access, max_access, output_access, sum_access);
-
- output_access.set_valid_region(win, input->info()->valid_region());
- sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
-
- IGCKernel::configure(win);
-}
-
-void GCLogits1DShiftExpSumKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
- Window slice = window_collapsed.first_slice_window_3D();
-
- _kernel.use();
-
- do
- {
- unsigned int idx = 0;
- unsigned int binding = 1; // SSBO binding starts from 1.
- // Set inputs
- add_3D_tensor_argument(idx, _input, binding++, slice);
- add_3D_tensor_argument(idx, _max, binding++, slice);
- add_3D_tensor_argument(idx, _output, binding++, slice);
- add_3D_tensor_argument(idx, _sum, binding++, slice);
- _kernel.update_shader_params();
- enqueue(*this, slice);
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
-
-GCLogits1DNormKernel::GCLogits1DNormKernel()
- : _input(nullptr), _sum(nullptr), _output(nullptr)
-{
-}
-
-void GCLogits1DNormKernel::configure(const IGCTensor *input, const IGCTensor *sum, IGCTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(sum, output);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
- _input = input;
- _sum = sum;
- _output = output;
-
- // Set build options
- std::set<std::string> build_opts;
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- build_opts.insert("#define " + dt_name);
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
- build_opts.insert("#define SOFTMAX_LAYER_NORM");
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts));
-
- // Configure window
- constexpr unsigned int num_elems_processed_per_iteration = 8;
- unsigned int num_elems_written_per_iteration = 1;
- if(input->info()->data_type() == DataType::F16)
- {
- num_elems_written_per_iteration = 2;
- }
-
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowStatic sum_access(sum->info(), 0, 0, num_elems_written_per_iteration, sum->info()->dimension(1));
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win, input_access, sum_access, output_access);
-
- output_access.set_valid_region(win, input->info()->valid_region());
-
- IGCKernel::configure(win);
-}
-
-void GCLogits1DNormKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ);
- Window slice = window_collapsed.first_slice_window_3D();
-
- _kernel.use();
-
- do
- {
- Window sum_slice = slice;
- sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- unsigned int idx = 0;
- unsigned int binding = 1; // SSBO binding starts from 1.
- // Set inputs
- add_3D_tensor_argument(idx, _input, binding++, slice);
- add_3D_tensor_argument(idx, _sum, binding++, slice);
- add_3D_tensor_argument(idx, _output, binding++, slice);
-
- _kernel.update_shader_params();
- enqueue(*this, slice);
- }
- while(window_collapsed.slide_window_slice_3D(slice));
-}
diff --git a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
deleted file mode 100644
index 78b008484e..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-
-GCTensorShiftKernel::GCTensorShiftKernel()
- : _input(nullptr), _lws(gles::NDRange(1U, 1U, 1U)), _left_padding(0)
-{
-}
-
-void GCTensorShiftKernel::configure(IGCTensor *input)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-
- _input = input;
-
- std::set<std::string> options;
- options.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
- options.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
- options.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
- options.emplace("#define WIDTH " + support::cpp11::to_string(input->info()->dimension(0)));
-
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- options.emplace(("#define " + dt_name));
-
- unsigned int num_elems_written_per_iteration_x = input->info()->dimension(0) + input->info()->padding().left + input->info()->padding().right;
-
- std::stringstream kernel_name;
- kernel_name << "tensorshift";
-
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name.str(), options));
-
- Window win;
- win.set(Window::DimX, Window::Dimension(0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_x));
- win.use_tensor_dimensions(input->info()->tensor_shape(), Window::DimY);
- win.use_tensor_dimensions(input->info()->tensor_shape(), Window::DimZ);
-
- _left_padding = _input->info()->padding().left;
-
- IGCKernel::configure(win);
-}
-
-void GCTensorShiftKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
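- // Nothing to do when there is no left padding or the tensor was not flagged for shifting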
- if(int(_left_padding) == 0 || !_input->needs_shifting())
- {
- return;
- }
-
- _kernel.use();
-
- // Get initial windows
- Window slice = window.first_slice_window_3D();
- slice.shift(Window::DimX, -(_input->info()->padding()).left);
-
- do
- {
- unsigned int idx = 0;
-
- add_3D_tensor_argument(idx, _input, 1, slice);
-
- _kernel.set_argument(idx++, static_cast<unsigned int>(_left_padding));
-
- _kernel.update_shader_params();
- enqueue(*this, slice, _lws);
- }
- while(window.slide_window_slice_3D(slice));
-}
diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
deleted file mode 100644
index 3bec05b5f1..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-#include <set>
-#include <string>
-
-using namespace arm_compute;
-
-void GCTransposeKernel::configure(const IGCTensor *input, IGCTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
- TensorShape output_shape{ input->info()->tensor_shape() };
- const size_t w_out = input->info()->dimension(1);
- const size_t h_out = input->info()->dimension(0);
- output_shape.set(0, w_out);
- output_shape.set(1, h_out);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- _input = input;
- _output = output;
-
- // Pick the local workgroup size hint that performs better for the given output size
- if(w_out < 512 && h_out < 512)
- {
- _lws_hint = gles::NDRange(8U, 1U, 1U);
- }
- else
- {
- _lws_hint = gles::NDRange(1U, 8U, 1U);
- }
-
- std::set<std::string> build_opts;
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- build_opts.emplace(("#define " + dt_name));
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws_hint[0]));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws_hint[1]));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws_hint[2]));
-
- // Configure kernel window
- unsigned int num_elems_processed_per_iteration = 4;
-
- if(input->info()->data_type() == DataType::F16)
- {
-#define TRANSPOSE_8X8
-
-#if defined(TRANSPOSE_4X4)
- build_opts.emplace(("#define TRANSPOSE_4X4"));
- num_elems_processed_per_iteration = 4;
-#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_4X4 */
- if(w_out != h_out)
- {
- build_opts.emplace("#define TRANSPOSE_8X8");
- num_elems_processed_per_iteration = 8;
- }
- else
- {
- build_opts.emplace("#define TRANSPOSE_8X8_SQUARE");
- num_elems_processed_per_iteration = 8;
- }
-#endif /* TRANSPOSE_4X4 */
- }
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("transpose", build_opts));
-
- const unsigned int width_aligned = num_elems_processed_per_iteration * static_cast<unsigned int>(_lws_hint[0]);
- const unsigned int height_aligned = num_elems_processed_per_iteration * static_cast<unsigned int>(_lws_hint[1]);
-
- AccessWindowStatic input_access(input->info(), 0, 0,
- ceil_to_multiple(input->info()->dimension(0), width_aligned),
- ceil_to_multiple(input->info()->dimension(1), height_aligned));
- AccessWindowStatic output_access(output->info(), 0, 0,
- ceil_to_multiple(output->info()->dimension(0), height_aligned),
- ceil_to_multiple(output->info()->dimension(1), width_aligned));
-
- Window win = calculate_max_window(*input->info(), Steps(width_aligned, height_aligned));
- win.set_dimension_step(Window::DimX, num_elems_processed_per_iteration);
- win.set_dimension_step(Window::DimY, num_elems_processed_per_iteration);
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, output->info()->valid_region());
-
- IGCKernel::configure(win);
-}
-
-void GCTransposeKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window);
-
- _kernel.use();
-
- Window slice = window.first_slice_window_2D();
-
- do
- {
- unsigned int idx = 0;
-
- add_2D_tensor_argument(idx, _input, 1, slice);
- add_2D_tensor_argument(idx, _output, 2, slice);
- _kernel.update_shader_params();
- enqueue(*this, slice, _lws_hint);
- }
- while(window.slide_window_slice_2D(slice));
-}
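
For context: every GC kernel removed in this patch is configured the same way, by collecting literal "#define" strings into a std::set and handing them to GCKernelLibrary::create_kernel. Since each option reads as a complete preprocessor line, the library presumably injects them ahead of the shader body. A minimal sketch of that preamble-injection pattern; make_preamble is an illustrative name, not library API:

    #include <set>
    #include <string>

    // Build a GLSL preamble from a set of "#define ..." build options.
    std::string make_preamble(const std::set<std::string> &build_opts)
    {
        std::string preamble = "#version 310 es\n";
        for(const std::string &opt : build_opts)
        {
            preamble += opt + "\n"; // each option is a complete preprocessor line
        }
        return preamble; // the shader body is appended after this
    }
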
diff --git a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
deleted file mode 100644
index bcdbfb60dc..0000000000
--- a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/StringSupport.h"
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-GCWeightsReshapeKernel::GCWeightsReshapeKernel()
- : _input(nullptr), _biases(nullptr), _output(nullptr)
-{
-}
-
-void GCWeightsReshapeKernel::configure(const IGCTensor *input, const IGCTensor *biases, IGCTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_weights_reshaped_shape(*input->info(), (biases != nullptr))));
-
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->num_dimensions() != 1));
- ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->num_dimensions() != 2));
- ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 4) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3]));
- ARM_COMPUTE_ERROR_ON((input->info()->num_dimensions() == 5) && (biases->info()->dimension(0) != input->info()->tensor_shape()[3] || biases->info()->dimension(1) != input->info()->tensor_shape()[4]));
- }
-
- _biases = biases;
- _output = output;
- _input = input;
-
- // Create build options
- std::set<std::string> build_opts;
- std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
- build_opts.emplace("#define " + dt_name);
- build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
- build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
- build_opts.emplace("#define RESHAPE_TO_COLUMNS");
- if(biases != nullptr)
- {
- build_opts.emplace("#define HAS_BIAS");
- }
-
- // Create kernel
- _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
-
- // Set static arguments
- unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
- idx += (biases != nullptr) ? num_arguments_per_1D_tensor() : 0;
- _kernel.set_argument(idx++, _input->info()->dimension(0));
- _kernel.set_argument(idx++, _input->info()->dimension(1));
- _kernel.set_argument(idx++, _input->info()->dimension(2));
- _kernel.set_argument(idx++, _input->info()->dimension(3));
-
- // Configure window
- Window win = calculate_max_window(*input->info(), Steps());
-
- // The GCWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
- IGCKernel::configure(win);
-}
-
-void GCWeightsReshapeKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window);
-
- Window out_window;
- out_window.use_tensor_dimensions(_output->info()->tensor_shape());
-
- Window in_slice = window.first_slice_window_3D();
- Window out_slice = out_window.first_slice_window_2D();
-
- Window biases_window;
- Window biases_slice;
-
- if(_biases != nullptr)
- {
- biases_window.use_tensor_dimensions(_biases->info()->tensor_shape());
- biases_slice = biases_window.first_slice_window_1D();
- }
-
- _kernel.use();
-
- do
- {
- // Set arguments
- unsigned idx = 0;
- add_3D_tensor_argument(idx, _input, 1, in_slice);
- add_2D_tensor_argument(idx, _output, 2, out_slice);
- if(_biases != nullptr)
- {
- add_1D_tensor_argument(idx, _biases, 3, biases_slice);
- biases_window.slide_window_slice_1D(biases_slice);
- }
-
- _kernel.update_shader_params();
- // Run kernel
- enqueue(*this, in_slice);
- }
- while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
-}
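
What the removed "reshape_to_columns" shader computes, restated on the CPU: each of the N kernels of a [W, H, C, N] weights tensor is flattened into one column of a matrix with W*H*C rows, plus one bias row when biases are present. A sketch under that layout assumption; this is not library code:

    #include <cstddef>
    #include <vector>

    std::vector<float> reshape_to_columns(const std::vector<float> &weights,
                                          const std::vector<float> *biases,
                                          std::size_t w, std::size_t h, std::size_t c, std::size_t n)
    {
        const std::size_t len  = w * h * c;                        // elements per kernel
        const std::size_t rows = len + (biases != nullptr ? 1 : 0);
        std::vector<float> out(rows * n);
        for(std::size_t k = 0; k < n; ++k)
        {
            for(std::size_t i = 0; i < len; ++i)
            {
                out[i * n + k] = weights[k * len + i];             // kernel k becomes column k
            }
            if(biases != nullptr)
            {
                out[len * n + k] = (*biases)[k];                   // bias occupies the last row
            }
        }
        return out;
    }
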
diff --git a/src/graph/backends/GLES/GCDeviceBackend.cpp b/src/graph/backends/GLES/GCDeviceBackend.cpp
deleted file mode 100644
index dcab2a5697..0000000000
--- a/src/graph/backends/GLES/GCDeviceBackend.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/backends/GLES/GCDeviceBackend.h"
-
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/GraphContext.h"
-#include "arm_compute/graph/INode.h"
-#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Tensor.h"
-#include "arm_compute/graph/backends/BackendRegistrar.h"
-#include "arm_compute/graph/backends/GLES/GCFunctionFactory.h"
-#include "arm_compute/graph/backends/GLES/GCNodeValidator.h"
-#include "arm_compute/graph/backends/GLES/GCTensorHandle.h"
-
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/BlobLifetimeManager.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/MemoryManagerOnDemand.h"
-#include "arm_compute/runtime/PoolManager.h"
-
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-namespace graph
-{
-namespace backends
-{
-/** Register GLES backend */
-static detail::BackendRegistrar<GCDeviceBackend> GCDeviceBackend_registrar(Target::GC);
-
-GCDeviceBackend::GCDeviceBackend()
- : _initialized(false), _allocator()
-{
-}
-
-void GCDeviceBackend::initialize_backend()
-{
- // Setup Scheduler
- GCScheduler::get().default_init();
-}
-
-void GCDeviceBackend::release_backend_context(GraphContext &ctx)
-{
- // Nothing to do
- ARM_COMPUTE_UNUSED(ctx);
-}
-
-void GCDeviceBackend::setup_backend_context(GraphContext &ctx)
-{
- // Force backend initialization
- if(!_initialized)
- {
- initialize_backend();
- _initialized = true;
- }
-
- // Setup a management backend
- if(ctx.memory_management_ctx(Target::GC) == nullptr)
- {
- MemoryManagerContext mm_ctx;
- mm_ctx.target = Target::GC;
- mm_ctx.intra_mm = create_memory_manager(MemoryManagerAffinity::Buffer);
- mm_ctx.cross_mm = create_memory_manager(MemoryManagerAffinity::Buffer);
- mm_ctx.cross_group = std::make_shared<MemoryGroup>(mm_ctx.cross_mm);
- mm_ctx.allocator = &_allocator;
-
- ctx.insert_memory_management_ctx(std::move(mm_ctx));
- }
-}
-
-bool GCDeviceBackend::is_backend_supported()
-{
- return arm_compute::opengles31_is_available();
-}
-
-IAllocator *GCDeviceBackend::backend_allocator()
-{
- return &_allocator;
-}
-
-std::unique_ptr<ITensorHandle> GCDeviceBackend::create_tensor(const Tensor &tensor)
-{
- // Get tensor descriptor
- const TensorDescriptor &tensor_desc = tensor.desc();
- ARM_COMPUTE_ERROR_ON(tensor_desc.target != Target::GC);
-
- // Create backend tensor handle
- TensorInfo info(tensor_desc.shape, 1, tensor_desc.data_type, tensor_desc.quant_info);
- info.set_data_layout(tensor_desc.layout);
-
- return std::make_unique<GCTensorHandle>(info);
-}
-
-std::unique_ptr<ITensorHandle> GCDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
-{
- ARM_COMPUTE_UNUSED(parent, shape, coords, extend_parent);
- ARM_COMPUTE_ERROR("GLES backend has no sub-tensor support!");
- return nullptr;
-}
-
-std::unique_ptr<arm_compute::IFunction> GCDeviceBackend::configure_node(INode &node, GraphContext &ctx)
-{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Configuring GC node with ID : " << node.id() << std::endl);
- ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::GC);
-
- // Configure node
- return GCFunctionFactory::create(&node, ctx);
-}
-
-arm_compute::Status GCDeviceBackend::validate_node(INode &node)
-{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating GC node with ID : " << node.id() << std::endl);
- ARM_COMPUTE_ERROR_ON(node.assigned_target() != Target::GC);
-
- return GCNodeValidator::validate(&node);
-}
-
-std::shared_ptr<arm_compute::IMemoryManager> GCDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity)
-{
- if(affinity == MemoryManagerAffinity::Offset)
- {
- ARM_COMPUTE_LOG_GRAPH_WARNING("GC Backend does not support offset affinity memory management!");
- return nullptr;
- }
-
- auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
- auto pool_mgr = std::make_shared<PoolManager>();
- auto mm = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
-
- return mm;
-}
-
-std::shared_ptr<arm_compute::IWeightsManager> GCDeviceBackend::create_weights_manager()
-{
- return nullptr;
-}
-} // namespace backends
-} // namespace graph
-} // namespace arm_compute
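
The single static object at the top of this file is what wired the backend into the graph API: constructing a file-scope BackendRegistrar<GCDeviceBackend> registers the backend before main() runs. A minimal sketch of that self-registration pattern, with illustrative stand-in types rather than the real graph API:

    #include <map>
    #include <memory>

    enum class Target { GC };
    struct IDeviceBackend { virtual ~IDeviceBackend() = default; };

    // Meyers-style registry: constructed on first use, so it is safe to touch
    // from static initializers in other translation units.
    std::map<Target, std::unique_ptr<IDeviceBackend>> &registry()
    {
        static std::map<Target, std::unique_ptr<IDeviceBackend>> r;
        return r;
    }

    template <typename BackendType>
    struct BackendRegistrar
    {
        explicit BackendRegistrar(Target target)
        {
            registry()[target] = std::make_unique<BackendType>(); // side effect of construction
        }
    };
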
diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp
deleted file mode 100644
index ac14425ad4..0000000000
--- a/src/graph/backends/GLES/GCFunctionsFactory.cpp
+++ /dev/null
@@ -1,275 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/backends/GLES/GCFunctionFactory.h"
-
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/GraphContext.h"
-#include "arm_compute/graph/backends/FunctionHelpers.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h"
-#include "support/Cast.h"
-
-using namespace arm_compute::utils::cast;
-
-namespace arm_compute
-{
-namespace graph
-{
-namespace backends
-{
-/** Target-specific information structure used to pass information to the layer templates */
-struct GCTargetInfo
-{
- using TensorType = arm_compute::IGCTensor;
- using SrcTensorType = TensorType;
- static Target TargetType;
-};
-
-Target GCTargetInfo::TargetType = Target::GC;
-
-/** Collection of GC convolution functions */
-struct GCConvolutionLayerFunctions
-{
- using GenericConvolutionLayer = GCConvolutionLayer;
- using GEMMConvolutionLayer = GCConvolutionLayer;
- using DirectConvolutionLayer = GCDirectConvolutionLayer;
-};
-
-/** Collection of GC depthwise convolution functions */
-struct GCDepthwiseConvolutionLayerFunctions
-{
- using DepthwiseConvolutionLayer3x3 = GCDepthwiseConvolutionLayer3x3;
-};
-
-/** Collection of GC element-wise functions */
-struct GCEltwiseFunctions
-{
- using Addition = GCArithmeticAddition;
- using Multiplication = GCPixelWiseMultiplication;
-};
-
-namespace detail
-{
-template <>
-std::unique_ptr<IFunction> create_convolution_layer<GCConvolutionLayerFunctions, GCTargetInfo>(ConvolutionLayerNode &node, GraphContext &ctx)
-{
- validate_node<GCTargetInfo>(node, 3 /* expected inputs */, 1 /* expected outputs */);
-
- // Extract IO and info
- GCTargetInfo::TensorType *input = get_backing_tensor<GCTargetInfo>(node.input(0));
- GCTargetInfo::TensorType *weights = get_backing_tensor<GCTargetInfo>(node.input(1));
- GCTargetInfo::TensorType *biases = get_backing_tensor<GCTargetInfo>(node.input(2));
- GCTargetInfo::TensorType *output = get_backing_tensor<GCTargetInfo>(node.output(0));
-
- if(is_data_type_quantized_asymmetric(input->info()->data_type()))
- {
- biases->info()->set_data_type(DataType::S32);
- }
-
- const PadStrideInfo conv_info = node.convolution_info();
- const ConvolutionMethod conv_algorithm = node.convolution_method();
- const ActivationLayerInfo fused_act = node.fused_activation();
-
- // Create and configure function (we assume that functions have been validated before creation)
- std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, GCTargetInfo::TargetType);
- std::unique_ptr<IFunction> func;
- std::string func_name;
-
- if(conv_algorithm == ConvolutionMethod::Direct)
- {
- std::tie(func, func_name) = create_named_function<GCConvolutionLayerFunctions::DirectConvolutionLayer>(
- std::string("DirectConvolutionLayer"),
- input, weights, biases, output, conv_info, fused_act);
- }
- else
- {
- std::tie(func, func_name) = create_named_memory_managed_function<GCConvolutionLayerFunctions::GenericConvolutionLayer>(
- std::string("ConvolutionLayer"), mm,
- input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1U, 1U), fused_act);
- }
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << func_name
- << " Data Type: " << input->info()->data_type()
- << " Input QuantInfo: " << input->info()->quantization_info()
- << " Weights QuantInfo: " << weights->info()->quantization_info()
- << " Input shape: " << input->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
- << std::endl);
- return func;
-}
-
-template <>
-std::unique_ptr<IFunction> create_depthwise_convolution_layer<GCDepthwiseConvolutionLayerFunctions, GCTargetInfo>(DepthwiseConvolutionLayerNode &node)
-{
- validate_node<GCTargetInfo>(node, 3 /* expected inputs */, 1 /* expected outputs */);
-
- // Extract IO and info
- GCTargetInfo::TensorType *input = get_backing_tensor<GCTargetInfo>(node.input(0));
- GCTargetInfo::TensorType *weights = get_backing_tensor<GCTargetInfo>(node.input(1));
- GCTargetInfo::TensorType *biases = get_backing_tensor<GCTargetInfo>(node.input(2));
- GCTargetInfo::TensorType *output = get_backing_tensor<GCTargetInfo>(node.output(0));
-
- if(is_data_type_quantized_asymmetric(input->info()->data_type()))
- {
- biases->info()->set_data_type(DataType::S32);
- }
-
- const PadStrideInfo conv_info = node.convolution_info();
- const DepthwiseConvolutionMethod dwc_algorithm = node.depthwise_convolution_method();
- const ActivationLayerInfo fused_act = node.fused_activation();
- const int depth_multiplier = node.depth_multiplier();
-
- // Create and configure function (we assume that functions have been validated before creation)
- std::unique_ptr<IFunction> func;
- std::string func_name;
- if(dwc_algorithm == DepthwiseConvolutionMethod::Optimized3x3)
- {
- std::tie(func, func_name) = create_named_function<GCDepthwiseConvolutionLayerFunctions::DepthwiseConvolutionLayer3x3>(
- std::string("DepthwiseConvolutionLayer3x3"),
- input, weights, biases, output, conv_info, depth_multiplier, fused_act);
- }
- else
- {
- ARM_COMPUTE_ERROR("Generic DepthwiseConvolutionLayer is not supported in GLES backend");
- }
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << func_name
- << " Target " << GCTargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input QuantInfo: " << input->info()->quantization_info()
- << " Weights QuantInfo: " << weights->info()->quantization_info()
- << " Input shape: " << input->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Depth multiplier: " << depth_multiplier
- << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
- << std::endl);
- return func;
-}
-
-template <>
-std::unique_ptr<IFunction> create_eltwise_layer<GCEltwiseFunctions, GCTargetInfo>(EltwiseLayerNode &node)
-{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE(
- "Creating GC EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
- ARM_COMPUTE_ERROR_ON(node.num_inputs() != 2);
- ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
-
- // Extract IO and info
- GCTargetInfo::TensorType *input1 = get_backing_tensor<GCTargetInfo>(node.input(0));
- GCTargetInfo::TensorType *input2 = get_backing_tensor<GCTargetInfo>(node.input(1));
- GCTargetInfo::TensorType *output = get_backing_tensor<GCTargetInfo>(node.output(0));
- const EltwiseOperation eltwise_op = node.eltwise_operation();
- const ConvertPolicy convert_policy = node.convert_policy();
- ARM_COMPUTE_ERROR_ON(input1 == nullptr);
- ARM_COMPUTE_ERROR_ON(input2 == nullptr);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
-
- std::unique_ptr<IFunction> func = nullptr;
- std::string func_name;
- if(eltwise_op == EltwiseOperation::Add)
- {
- std::tie(func, func_name) = create_named_function<GCEltwiseFunctions::Addition>(
- std::string("GCArithmeticAddition"),
- input1, input2, output, convert_policy);
- }
- else if(eltwise_op == EltwiseOperation::Sub)
- {
- ARM_COMPUTE_ERROR("Arithmetic subtraction is not supported in GLES backend");
- }
- else if(eltwise_op == EltwiseOperation::Mul)
- {
- std::tie(func, func_name) = create_named_function<GCEltwiseFunctions::Multiplication>(
- std::string("PixelWiseMultiplication"),
- input1, input2, output, 1.f);
- }
- else
- {
- ARM_COMPUTE_ERROR("Unsupported element-wise operation!");
- }
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << GCTargetInfo::TargetType
- << " Operation: " << func_name
- << " Data Type: " << input1->info()->data_type()
- << " Shape: " << input1->info()->tensor_shape()
- << std::endl);
-
- return func;
-}
-} // namespace detail
-
-std::unique_ptr<IFunction> GCFunctionFactory::create(INode *node, GraphContext &ctx)
-{
- if(node == nullptr)
- {
- return nullptr;
- }
-
- NodeType type = node->type();
- switch(type)
- {
- case NodeType::ActivationLayer:
- return detail::create_activation_layer<GCActivationLayer, GCTargetInfo>(*polymorphic_downcast<ActivationLayerNode *>(node));
- case NodeType::BatchNormalizationLayer:
- return detail::create_batch_normalization_layer<GCBatchNormalizationLayer, GCTargetInfo>(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
- case NodeType::ConvolutionLayer:
- return detail::create_convolution_layer<GCConvolutionLayerFunctions, GCTargetInfo>(*polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
- case NodeType::ConcatenateLayer:
- return detail::create_concatenate_layer<GCConcatenateLayer, GCTargetInfo>(*polymorphic_downcast<ConcatenateLayerNode *>(node));
- case NodeType::DepthwiseConvolutionLayer:
- return detail::create_depthwise_convolution_layer<GCDepthwiseConvolutionLayerFunctions, GCTargetInfo>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
- case NodeType::EltwiseLayer:
- return detail::create_eltwise_layer<GCEltwiseFunctions, GCTargetInfo>(*polymorphic_downcast<EltwiseLayerNode *>(node));
- case NodeType::FullyConnectedLayer:
- return detail::create_fully_connected_layer<GCFullyConnectedLayer, GCTargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
- case NodeType::NormalizationLayer:
- return detail::create_normalization_layer<GCNormalizationLayer, GCTargetInfo>(*polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
- case NodeType::NormalizePlanarYUVLayer:
- return detail::create_normalize_planar_yuv_layer<GCNormalizePlanarYUVLayer, GCTargetInfo>(*polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
- case NodeType::PoolingLayer:
- return detail::create_pooling_layer<GCPoolingLayer, GCTargetInfo>(*polymorphic_downcast<PoolingLayerNode *>(node));
- case NodeType::PrintLayer:
- return detail::create_print_layer<GCTargetInfo>(*polymorphic_downcast<PrintLayerNode *>(node));
- case NodeType::ResizeLayer:
- return detail::create_resize_layer<GCScale, GCTargetInfo>(*polymorphic_downcast<ResizeLayerNode *>(node));
- case NodeType::SoftmaxLayer:
- return detail::create_softmax_layer<GCSoftmaxLayer, GCTargetInfo>(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
- default:
- return nullptr;
- }
-}
-} // namespace backends
-} // namespace graph
-} // namespace arm_compute
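
A sketch of the create_named_function helper the factory leans on: build a concrete function object, configure it with the forwarded arguments, and return it together with a printable name for the log lines above. IFunction here is an illustrative stand-in; FunctionType is expected to derive from it and expose a configure() method:

    #include <memory>
    #include <string>
    #include <tuple>
    #include <utility>

    struct IFunction { virtual ~IFunction() = default; };

    template <typename FunctionType, typename... Args>
    std::tuple<std::unique_ptr<IFunction>, std::string> create_named_function(std::string name, Args &&... args)
    {
        auto func = std::make_unique<FunctionType>();
        func->configure(std::forward<Args>(args)...); // forward IO tensors and layer info
        return std::make_tuple(std::move(func), std::move(name));
    }
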
diff --git a/src/graph/backends/GLES/GCNodeValidator.cpp b/src/graph/backends/GLES/GCNodeValidator.cpp
deleted file mode 100644
index a83c1a3506..0000000000
--- a/src/graph/backends/GLES/GCNodeValidator.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/backends/GLES/GCNodeValidator.h"
-
-#include "arm_compute/graph/backends/ValidateHelpers.h"
-#include "arm_compute/graph/nodes/Nodes.h"
-
-#include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h"
-#include "support/Cast.h"
-
-using namespace arm_compute::utils::cast;
-
-namespace arm_compute
-{
-namespace graph
-{
-namespace backends
-{
-namespace
-{
-/** Validates a Depthwise Convolution layer node
- *
- * @param[in] node Node to validate
- *
- * @return Status
- */
-Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
-{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating GCDepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
- ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
- ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
-
- // Extract IO and info
- arm_compute::ITensorInfo *weights = detail::get_backing_tensor_info(node.input(1));
- ARM_COMPUTE_ERROR_ON(weights == nullptr);
-
- // TODO (geopin01) : Switch when validation is implemented
- // Validate function
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->tensor_shape().x() != 3 || weights->tensor_shape().y() != 3, "Unsupported depthwise convolution");
-
- return Status{};
-}
-/** Validates a Convolution layer node
- *
- * @param[in] node Node to validate
- *
- * @return Status
- */
-Status validate_convolution_layer(ConvolutionLayerNode &node)
-{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
- ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
- ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
-
- // Extract IO and info
- arm_compute::ITensorInfo *weights = detail::get_backing_tensor_info(node.input(1));
- const PadStrideInfo conv_info = node.convolution_info();
- const ConvolutionMethod conv_algorithm = node.convolution_method();
-
- // Validate function
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(node.num_groups() != 1, "Grouping is not supported by ConvolutionLayer!");
- if(conv_algorithm == ConvolutionMethod::Direct)
- {
- bool is_square = weights->tensor_shape().x() == weights->tensor_shape().y();
- bool is_direct = (weights->tensor_shape().x() == 1) || (weights->tensor_shape().x() == 3) || (weights->tensor_shape().x() == 5);
- bool is_correct_stride = (conv_info.stride().first) <= 2 && (conv_info.stride().second <= 2);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(is_square && is_direct && is_correct_stride), "Direct convolution is not supported for given configuration");
- }
-
- return Status{};
-}
-} // namespace
-
-Status GCNodeValidator::validate(INode *node)
-{
- if(node == nullptr)
- {
- return Status{};
- }
-
- NodeType type = node->type();
- switch(type)
- {
- case NodeType::BoundingBoxTransformLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : BoundingBoxTransformLayer");
- case NodeType::ChannelShuffleLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ChannelShuffleLayer");
- case NodeType::ConvolutionLayer:
- return validate_convolution_layer(*polymorphic_downcast<ConvolutionLayerNode *>(node));
- case NodeType::DepthwiseConvolutionLayer:
- return validate_depthwise_convolution_layer(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
- case NodeType::DequantizationLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : DequantizationLayer");
- case NodeType::DetectionOutputLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : DetectionOutputLayer");
- case NodeType::DetectionPostProcessLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : DetectionPostProcessLayer");
- case NodeType::FlattenLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : FlattenLayer");
- case NodeType::GenerateProposalsLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : GenerateProposalsLayer");
- case NodeType::NormalizePlanarYUVLayer:
- return detail::validate_normalize_planar_yuv_layer<GCNormalizePlanarYUVLayer>(*polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
- case NodeType::PadLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : PadLayer");
- case NodeType::PermuteLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : PermuteLayer");
- case NodeType::PriorBoxLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : PriorBoxLayer");
- case NodeType::QuantizationLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : QuantizationLayer");
- case NodeType::ReorgLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ReorgLayer");
- case NodeType::ReshapeLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ReshapeLayer");
- case NodeType::ROIAlignLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ROIAlignLayer");
- case NodeType::SliceLayer:
- return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : SliceLayer");
- default:
- return Status{};
- }
-}
-} // namespace backends
-} // namespace graph
-} // namespace arm_compute
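
The direct-convolution constraint enforced above, restated as a standalone predicate: square kernels of width 1, 3 or 5, with both strides at most 2. A sketch assuming the kernel dimensions and strides have already been pulled out of the node:

    #include <cstddef>

    bool direct_convolution_supported(std::size_t kernel_w, std::size_t kernel_h,
                                      unsigned int stride_x, unsigned int stride_y)
    {
        const bool is_square        = kernel_w == kernel_h;
        const bool is_supported_dim = kernel_w == 1 || kernel_w == 3 || kernel_w == 5;
        const bool is_valid_stride  = stride_x <= 2 && stride_y <= 2;
        return is_square && is_supported_dim && is_valid_stride;
    }
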
diff --git a/src/graph/backends/GLES/GCTensorHandle.cpp b/src/graph/backends/GLES/GCTensorHandle.cpp
deleted file mode 100644
index 94e8813246..0000000000
--- a/src/graph/backends/GLES/GCTensorHandle.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/graph/backends/GLES/GCTensorHandle.h"
-
-#include "arm_compute/runtime/IMemoryGroup.h"
-
-namespace arm_compute
-{
-namespace graph
-{
-namespace backends
-{
-GCTensorHandle::GCTensorHandle(const ITensorInfo &info)
- : _tensor()
-{
- _tensor.allocator()->init(info);
-}
-
-void GCTensorHandle::allocate()
-{
- _tensor.allocator()->allocate();
-}
-
-void GCTensorHandle::free()
-{
- _tensor.allocator()->free();
-}
-
-void GCTensorHandle::manage(IMemoryGroup *mg)
-{
- if(mg != nullptr)
- {
- mg->manage(&_tensor);
- }
-}
-
-void GCTensorHandle::map(bool blocking)
-{
- _tensor.map(blocking);
-}
-
-void GCTensorHandle::unmap()
-{
- _tensor.unmap();
-}
-
-void GCTensorHandle::release_if_unused()
-{
- // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used
- if(!_tensor.is_used())
- {
- _tensor.allocator()->free();
- }
-}
-
-const arm_compute::ITensor &GCTensorHandle::tensor() const
-{
- return _tensor;
-}
-
-arm_compute::ITensor &GCTensorHandle::tensor()
-{
- return _tensor;
-}
-
-ITensorHandle *GCTensorHandle::parent_handle()
-{
- return this;
-}
-
-bool GCTensorHandle::is_subtensor() const
-{
- return false;
-}
-
-Target GCTensorHandle::target() const
-{
- return Target::GC;
-}
-} // namespace backends
-} // namespace graph
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp b/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
deleted file mode 100644
index 695331d743..0000000000
--- a/src/runtime/GLES_COMPUTE/GCBufferAllocator.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-void *GCBufferAllocator::allocate(size_t size, size_t alignment)
-{
- ARM_COMPUTE_UNUSED(alignment);
-
- auto *gl_ssbo_name = new GLuint;
- ARM_COMPUTE_GL_CHECK(glGenBuffers(1, gl_ssbo_name));
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, *gl_ssbo_name));
- ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(size), nullptr, GL_STATIC_DRAW));
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
-
- return reinterpret_cast<void *>(gl_ssbo_name);
-}
-
-void GCBufferAllocator::free(void *ptr)
-{
- ARM_COMPUTE_ERROR_ON(ptr == nullptr);
- auto *gl_ssbo_name = reinterpret_cast<GLuint *>(ptr);
- ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, gl_ssbo_name));
- delete gl_ssbo_name;
-}
-
-std::unique_ptr<IMemoryRegion> GCBufferAllocator::make_region(size_t size, size_t alignment)
-{
- ARM_COMPUTE_UNUSED(alignment);
- return std::make_unique<GCBufferMemoryRegion>(size);
-}
-} // namespace arm_compute
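
The raw GLES calls the allocator wraps, as a standalone sketch: generate a buffer name, bind it as a shader storage buffer, allocate backing storage, unbind. Assumes a current OpenGL ES 3.1 context; error checking is omitted for brevity:

    #include <GLES3/gl31.h>

    GLuint make_ssbo(GLsizeiptr size)
    {
        GLuint name = 0;
        glGenBuffers(1, &name);                                                // reserve a buffer name
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, name);
        glBufferData(GL_SHADER_STORAGE_BUFFER, size, nullptr, GL_STATIC_DRAW); // allocate storage
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
        return name;
    }

    void free_ssbo(GLuint name)
    {
        glDeleteBuffers(1, &name);
    }
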
diff --git a/src/runtime/GLES_COMPUTE/GCHelpers.cpp b/src/runtime/GLES_COMPUTE/GCHelpers.cpp
deleted file mode 100644
index f4378d0fbf..0000000000
--- a/src/runtime/GLES_COMPUTE/GCHelpers.cpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/GCHelpers.h"
-
-#include "arm_compute/core/Error.h"
-
-namespace arm_compute
-{
-std::tuple<EGLDisplay, EGLContext, EGLBoolean> create_opengl_display_and_context()
-{
- EGLBoolean res;
- EGLDisplay display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(display == EGL_NO_DISPLAY, "Failed to get display: 0x%x.", eglGetError());
-
- res = eglInitialize(display, nullptr, nullptr);
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to initialize egl: 0x%x.", eglGetError());
- ARM_COMPUTE_UNUSED(res);
-
- const char *egl_extension_st = eglQueryString(display, EGL_EXTENSIONS);
- ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_create_context") == nullptr), "Failed to query EGL_KHR_create_context");
- ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_surfaceless_context") == nullptr), "Failed to query EGL_KHR_surfaceless_context");
- ARM_COMPUTE_UNUSED(egl_extension_st);
-
- const std::array<EGLint, 3> config_attribs =
- {
- EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR,
- EGL_NONE
- };
- EGLConfig cfg;
- EGLint count;
-
- res = eglChooseConfig(display, config_attribs.data(), &cfg, 1, &count);
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to choose config: 0x%x.", eglGetError());
- ARM_COMPUTE_UNUSED(res);
-
- res = eglBindAPI(EGL_OPENGL_ES_API);
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to bind api: 0x%x.", eglGetError());
-
- const std::array<EGLint, 3> attribs =
- {
- EGL_CONTEXT_CLIENT_VERSION, 3,
- EGL_NONE
- };
- EGLContext context = eglCreateContext(display,
- cfg,
- EGL_NO_CONTEXT,
- attribs.data());
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(context == EGL_NO_CONTEXT, "Failed to create context: 0x%x.", eglGetError());
- ARM_COMPUTE_UNUSED(res);
-
- res = eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, context);
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to make current: 0x%x.", eglGetError());
- ARM_COMPUTE_UNUSED(res);
-
- return std::make_tuple(display, context, res);
-}
-} // namespace arm_compute
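
How the helper above was meant to be consumed, sketched with the matching teardown in reverse order; this mirrors what GCScheduler's destructor does further down and assumes the removed GCHelpers.h header were still on the include path:

    #include "arm_compute/runtime/GLES_COMPUTE/GCHelpers.h" // removed by this patch

    #include <EGL/egl.h>
    #include <tuple>

    void with_surfaceless_context()
    {
        EGLDisplay display = EGL_NO_DISPLAY;
        EGLContext context = EGL_NO_CONTEXT;
        EGLBoolean res     = EGL_FALSE;
        std::tie(display, context, res) = arm_compute::create_opengl_display_and_context();

        // ... dispatch compute work while the context is current ...

        eglMakeCurrent(display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT); // detach
        eglDestroyContext(display, context);
        eglTerminate(display);
    }
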
diff --git a/src/runtime/GLES_COMPUTE/GCMemory.cpp b/src/runtime/GLES_COMPUTE/GCMemory.cpp
deleted file mode 100644
index 4d74555f4e..0000000000
--- a/src/runtime/GLES_COMPUTE/GCMemory.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/GCMemory.h"
-
-#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
-#include "support/Cast.h"
-
-namespace arm_compute
-{
-GCMemory::GCMemory()
- : _region(nullptr), _region_owned(nullptr)
-{
-}
-
-GCMemory::GCMemory(const std::shared_ptr<IGCMemoryRegion> &memory)
- : _region(nullptr), _region_owned(memory)
-{
- _region = _region_owned.get();
-}
-
-GCMemory::GCMemory(IGCMemoryRegion *memory)
- : _region(memory), _region_owned(nullptr)
-{
-}
-
-IGCMemoryRegion *GCMemory::gc_region()
-{
- return _region;
-}
-
-IGCMemoryRegion *GCMemory::gc_region() const
-{
- return _region;
-}
-
-IMemoryRegion *GCMemory::region()
-{
- return _region;
-}
-
-IMemoryRegion *GCMemory::region() const
-{
- return _region;
-}
-
-void GCMemory::set_region(IMemoryRegion *region)
-{
- auto gc_region = utils::cast::polymorphic_downcast<IGCMemoryRegion *>(region);
- _region_owned = nullptr;
- _region = gc_region;
-}
-
-void GCMemory::set_owned_region(std::unique_ptr<IMemoryRegion> region)
-{
- _region_owned = utils::cast::polymorphic_downcast_unique_ptr<IGCMemoryRegion>(std::move(region));
- _region = _region_owned.get();
-}
-} // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp b/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp
deleted file mode 100644
index 562854fb2b..0000000000
--- a/src/runtime/GLES_COMPUTE/GCMemoryRegion.cpp
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
-
-#include "arm_compute/core/Error.h"
-
-namespace arm_compute
-{
-IGCMemoryRegion::IGCMemoryRegion(size_t size)
- : IMemoryRegion(size), _mapping(nullptr), _ssbo_name(0)
-{
-}
-
-const GLuint &IGCMemoryRegion::gc_ssbo_name() const
-{
- return _ssbo_name;
-}
-
-void *IGCMemoryRegion::buffer()
-{
- return _mapping;
-}
-
-const void *IGCMemoryRegion::buffer() const
-{
- return _mapping;
-}
-
-GCBufferMemoryRegion::GCBufferMemoryRegion(size_t size)
- : IGCMemoryRegion(size)
-{
- ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_ssbo_name));
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name));
- ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast<GLsizeiptr>(size), nullptr, GL_STATIC_DRAW));
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
-}
-
-GCBufferMemoryRegion::~GCBufferMemoryRegion()
-{
- ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, &_ssbo_name));
-}
-
-void *GCBufferMemoryRegion::ptr()
-{
- return nullptr;
-}
-
-void *GCBufferMemoryRegion::map(bool blocking)
-{
- ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
- ARM_COMPUTE_UNUSED(blocking);
-
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name));
- void *p = ARM_COMPUTE_GL_CHECK(glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, static_cast<GLsizeiptr>(size()), GL_MAP_READ_BIT | GL_MAP_WRITE_BIT));
- _mapping = reinterpret_cast<uint8_t *>(p);
-
- return _mapping;
-}
-
-void GCBufferMemoryRegion::unmap()
-{
- ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
-
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _ssbo_name));
- ARM_COMPUTE_GL_CHECK(glUnmapBuffer(GL_SHADER_STORAGE_BUFFER));
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0));
- _mapping = nullptr;
-}
-
-std::unique_ptr<IMemoryRegion> GCBufferMemoryRegion::extract_subregion(size_t offset, size_t size)
-{
- ARM_COMPUTE_UNUSED(offset, size);
- return nullptr;
-}
-} // namespace arm_compute
\ No newline at end of file
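
The mapping contract the region implements, sketched against raw GLES: map the SSBO into host memory, touch the bytes, unmap before the GPU uses the buffer again. Assumes a current GLES 3.1 context and a buffer created as in the allocator sketch above:

    #include <GLES3/gl31.h>

    #include <cstring>

    void fill_ssbo(GLuint ssbo, const void *src, GLsizeiptr bytes)
    {
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
        void *p = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, bytes,
                                   GL_MAP_READ_BIT | GL_MAP_WRITE_BIT); // host-visible window
        if(p != nullptr)
        {
            std::memcpy(p, src, static_cast<std::size_t>(bytes));
            glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); // release before GPU use
        }
        glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
    }
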
diff --git a/src/runtime/GLES_COMPUTE/GCRuntimeContext.cpp b/src/runtime/GLES_COMPUTE/GCRuntimeContext.cpp
deleted file mode 100644
index 2ed78fe099..0000000000
--- a/src/runtime/GLES_COMPUTE/GCRuntimeContext.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/GCRuntimeContext.h"
-
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-namespace arm_compute
-{
-GCRuntimeContext::GCRuntimeContext()
- : _gpu_owned_scheduler(std::make_unique<GCScheduler>()),
- _gpu_scheduler(_gpu_owned_scheduler.get()),
- _core_context()
-{
- auto attrs = create_opengl_display_and_context();
- auto display = std::get<0>(attrs);
- auto ctx = std::get<1>(attrs);
-
- _gpu_owned_scheduler->default_init_with_context(display, ctx);
- _kernel_lib.init("./cs_shaders/", display, ctx);
-
- _core_context = GCCoreRuntimeContext(&_kernel_lib);
-}
-
-GCKernelLibrary &GCRuntimeContext::kernel_library()
-{
- return _kernel_lib;
-}
-
-GCCoreRuntimeContext *GCRuntimeContext::core_runtime_context()
-{
- return &_core_context;
-}
-
-void GCRuntimeContext::set_gpu_scheduler(GCScheduler *scheduler)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(scheduler);
- _gpu_scheduler = scheduler;
-}
-
-GCScheduler *GCRuntimeContext::gpu_scheduler()
-{
- return _gpu_scheduler;
-}
-} // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/GCScheduler.cpp b/src/runtime/GLES_COMPUTE/GCScheduler.cpp
deleted file mode 100644
index 946d558e05..0000000000
--- a/src/runtime/GLES_COMPUTE/GCScheduler.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-
-using namespace arm_compute;
-
-std::once_flag GCScheduler::_initialize_symbols;
-
-GCScheduler::GCScheduler()
- : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT), _target(GPUTarget::MIDGARD)
-{
-}
-
-GCScheduler::~GCScheduler()
-{
- eglDestroyContext(_display, _context);
- eglTerminate(_display);
-
- _context = EGL_NO_CONTEXT;
- _display = EGL_NO_DISPLAY;
-}
-
-void GCScheduler::default_init()
-{
- setup_context();
-
- init(_display, _context);
-}
-
-void GCScheduler::default_init_with_context(EGLDisplay display, EGLContext ctx)
-{
- _context = ctx;
- _display = display;
-
- _target = get_target_from_device();
-}
-
-void GCScheduler::init(EGLDisplay dpy, EGLContext ctx)
-{
- _target = get_target_from_device();
-
- GCKernelLibrary::get().init("./cs_shaders/", dpy, ctx);
-}
-
-GCScheduler &GCScheduler::get()
-{
- std::call_once(_initialize_symbols, opengles31_is_available);
- static GCScheduler scheduler;
- return scheduler;
-}
-
-void GCScheduler::dispatch(IGCKernel &kernel, bool flush)
-{
- kernel.run(kernel.window());
- if(flush)
- {
- ARM_COMPUTE_GL_CHECK(glFlush());
- }
-}
-
-void GCScheduler::memory_barrier()
-{
- ARM_COMPUTE_GL_CHECK(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT));
-}
-
-void GCScheduler::setup_context()
-{
- EGLBoolean res;
- _display = eglGetDisplay(EGL_DEFAULT_DISPLAY);
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(_display == EGL_NO_DISPLAY, "Failed to get display: 0x%x.", eglGetError());
-
- res = eglInitialize(_display, nullptr, nullptr);
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to initialize egl: 0x%x.", eglGetError());
- ARM_COMPUTE_UNUSED(res);
-
- const char *egl_extension_st = eglQueryString(_display, EGL_EXTENSIONS);
- ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_create_context") == nullptr), "Failed to query EGL_KHR_create_context");
- ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_surfaceless_context") == nullptr), "Failed to query EGL_KHR_surfaceless_context");
- ARM_COMPUTE_UNUSED(egl_extension_st);
-
- const std::array<EGLint, 3> config_attribs =
- {
- EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR,
- EGL_NONE
- };
- EGLConfig cfg;
- EGLint count;
-
- res = eglChooseConfig(_display, config_attribs.data(), &cfg, 1, &count);
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to choose config: 0x%x.", eglGetError());
- ARM_COMPUTE_UNUSED(res);
-
- res = eglBindAPI(EGL_OPENGL_ES_API);
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to bind api: 0x%x.", eglGetError());
-
- const std::array<EGLint, 3> attribs =
- {
- EGL_CONTEXT_CLIENT_VERSION, 3,
- EGL_NONE
- };
- _context = eglCreateContext(_display,
- cfg,
- EGL_NO_CONTEXT,
- attribs.data());
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(_context == EGL_NO_CONTEXT, "Failed to create context: 0x%x.", eglGetError());
- ARM_COMPUTE_UNUSED(res);
-
- res = eglMakeCurrent(_display, EGL_NO_SURFACE, EGL_NO_SURFACE, _context);
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(res == EGL_FALSE, "Failed to make current: 0x%x.", eglGetError());
- ARM_COMPUTE_UNUSED(res);
-}
diff --git a/src/runtime/GLES_COMPUTE/GCTensor.cpp b/src/runtime/GLES_COMPUTE/GCTensor.cpp
deleted file mode 100644
index a73c9950a4..0000000000
--- a/src/runtime/GLES_COMPUTE/GCTensor.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
-
-namespace arm_compute
-{
-GCTensor::GCTensor(IRuntimeContext *)
- : _allocator(this)
-{
-}
-
-ITensorAllocator *GCTensor::allocator()
-{
- return &_allocator;
-}
-
-TensorInfo *GCTensor::info() const
-{
- return &_allocator.info();
-}
-
-TensorInfo *GCTensor::info()
-{
- return &_allocator.info();
-}
-
-uint8_t *GCTensor::buffer() const
-{
- return _allocator.data();
-}
-
-GLuint GCTensor::gc_buffer() const
-{
- return _allocator.get_gl_ssbo_name();
-}
-
-void GCTensor::associate_memory_group(arm_compute::IMemoryGroup *memory_group)
-{
- _allocator.set_associated_memory_group(memory_group);
-}
-
-void GCTensor::map(bool blocking)
-{
- IGCTensor::map(blocking);
-}
-
-void GCTensor::unmap()
-{
- IGCTensor::unmap();
-}
-
-uint8_t *GCTensor::do_map(bool blocking)
-{
- return _allocator.map(blocking);
-}
-
-void GCTensor::do_unmap()
-{
- _allocator.unmap();
-}
-} // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
deleted file mode 100644
index b3344d8ecb..0000000000
--- a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-using namespace arm_compute;
-
-GCTensorAllocator::GCTensorAllocator(IMemoryManageable *owner)
- : _owner(owner), _associated_memory_group(nullptr), _memory(), _mapping(nullptr)
-{
-}
-
-uint8_t *GCTensorAllocator::data()
-{
- return _mapping;
-}
-
-void GCTensorAllocator::allocate()
-{
- if(_associated_memory_group == nullptr)
- {
- _memory.set_owned_region(std::make_unique<GCBufferMemoryRegion>(info().total_size()));
- }
- else
- {
- _associated_memory_group->finalize_memory(_owner, _memory, info().total_size(), alignment());
- }
- info().set_is_resizable(false);
-}
-
-void GCTensorAllocator::free()
-{
- _mapping = nullptr;
- _memory.set_region(nullptr);
- info().set_is_resizable(true);
-}
-
-void GCTensorAllocator::set_associated_memory_group(IMemoryGroup *associated_memory_group)
-{
- ARM_COMPUTE_ERROR_ON(associated_memory_group == nullptr);
- ARM_COMPUTE_ERROR_ON(_associated_memory_group != nullptr && _associated_memory_group != associated_memory_group);
- ARM_COMPUTE_ERROR_ON(_memory.region() != nullptr && _memory.gc_region()->gc_ssbo_name() != 0);
-
- _associated_memory_group = associated_memory_group;
-}
-
-uint8_t *GCTensorAllocator::lock()
-{
- return map(true);
-}
-
-void GCTensorAllocator::unlock()
-{
- unmap();
-}
-
-GLuint GCTensorAllocator::get_gl_ssbo_name() const
-{
- return (_memory.region() == nullptr) ? static_cast<GLuint>(0) : _memory.gc_region()->gc_ssbo_name();
-}
-
-uint8_t *GCTensorAllocator::map(bool blocking)
-{
- ARM_COMPUTE_ERROR_ON(_mapping != nullptr);
- ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
-
- _mapping = reinterpret_cast<uint8_t *>(_memory.gc_region()->map(blocking));
- return _mapping;
-}
-
-void GCTensorAllocator::unmap()
-{
- ARM_COMPUTE_ERROR_ON(_mapping == nullptr);
- ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
-
- _memory.gc_region()->unmap();
- _mapping = nullptr;
-}
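
The allocator above delegates to a GCBufferMemoryRegion; in raw GLES 3.1 terms its allocate/map/unmap trio boils down to the SSBO pattern sketched below. This is an illustrative sketch, not library code: it assumes a current GLES 3.1 context, and the function names are hypothetical.

#include <GLES3/gl31.h>
#include <cstdint>

// Analogue of allocate(): back the tensor with an SSBO of the requested size.
GLuint create_ssbo(GLsizeiptr size_in_bytes)
{
    GLuint ssbo = 0;
    glGenBuffers(1, &ssbo);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
    glBufferData(GL_SHADER_STORAGE_BUFFER, size_in_bytes, nullptr, GL_STATIC_DRAW);
    return ssbo;
}

// Analogue of map(blocking): expose the buffer contents to the host.
uint8_t *map_ssbo(GLuint ssbo, GLsizeiptr size_in_bytes)
{
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
    void *ptr = glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, size_in_bytes,
                                 GL_MAP_READ_BIT | GL_MAP_WRITE_BIT);
    return static_cast<uint8_t *>(ptr);
}

// Analogue of unmap(): release the mapping before dispatching shaders.
void unmap_ssbo(GLuint ssbo)
{
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, ssbo);
    glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
    glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0);
}
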
diff --git a/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp b/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp
deleted file mode 100644
index 4bb6a9994f..0000000000
--- a/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-using namespace arm_compute;
-
-IGCSimpleFunction::IGCSimpleFunction(GCRuntimeContext *ctx) //NOLINT
- : _kernel(),
- _border_handler(),
- _ctx(ctx)
-{
-}
-
-void IGCSimpleFunction::run()
-{
- ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The child class didn't set the GLES kernel or function isn't configured");
- GCScheduler *scheduler = (_ctx != nullptr) ? _ctx->gpu_scheduler() : &GCScheduler::get();
- ARM_COMPUTE_ERROR_ON(scheduler == nullptr);
-
- scheduler->dispatch(_border_handler, false);
- scheduler->memory_barrier();
- scheduler->dispatch(*_kernel);
-}
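
The border-fill, barrier, kernel sequence in run() maps onto two compute dispatches separated by a storage barrier. A raw-GLES sketch of that pattern, with hypothetical program handles, looks like this:

#include <GLES3/gl31.h>

// border_program and kernel_program are assumed to be linked compute programs.
void run_two_pass(GLuint border_program, GLuint kernel_program,
                  GLuint groups_x, GLuint groups_y)
{
    glUseProgram(border_program);
    glDispatchCompute(groups_x, groups_y, 1);

    // Make the first pass's SSBO writes visible to the second pass;
    // this is the call that GCScheduler::memory_barrier() wraps.
    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

    glUseProgram(kernel_program);
    glDispatchCompute(groups_x, groups_y, 1);
}
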
diff --git a/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp b/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp
deleted file mode 100644
index 29630c8981..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h"
-#include "arm_compute/core/Helpers.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void GCAbsoluteDifference::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output)
-{
- auto k = std::make_unique<GCAbsoluteDifferenceKernel>();
- k->configure(input1, input2, output);
- _kernel = std::move(k);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp
deleted file mode 100644
index b3815f1625..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
-
-namespace arm_compute
-{
-GCActivationLayer::GCActivationLayer(GCRuntimeContext *ctx)
- : IGCSimpleFunction(ctx)
-{
-}
-
-void GCActivationLayer::configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info)
-{
- auto core_ctx = _ctx ? _ctx->core_runtime_context() : /* Legacy */ nullptr;
-
- auto k = std::make_unique<GCActivationLayerKernel>(core_ctx);
- k->configure(input, output, act_info);
- _kernel = std::move(k);
-}
-} // namespace arm_compute
diff --git a/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp b/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp
deleted file mode 100755
index 5661a9bfdd..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void GCArithmeticAddition::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_ERROR_ON(act_info.enabled()); // Fused activation is not supported, matching validate()
- ARM_COMPUTE_UNUSED(act_info);
- auto k = std::make_unique<GCArithmeticAdditionKernel>();
- k->configure(input1, input2, output, policy);
- _kernel = std::move(k);
-}
-
-Status GCArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
- return GCArithmeticAdditionKernel::validate(input1, input2, output, policy);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp
deleted file mode 100755
index 7ec0e4224f..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-using namespace arm_compute;
-
-GCBatchNormalizationLayer::GCBatchNormalizationLayer()
- : _norm_kernel()
-{
-}
-
-void GCBatchNormalizationLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma, float epsilon,
- ActivationLayerInfo act_info)
-{
- _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon, act_info);
-}
-
-void GCBatchNormalizationLayer::run()
-{
- GCScheduler::get().dispatch(_norm_kernel, true);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
deleted file mode 100644
index 2c21d81e17..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "src/core/helpers/AutoConfiguration.h"
-
-namespace arm_compute
-{
-GCConcatenateLayer::GCConcatenateLayer()
- : _concat_kernels(),
- _num_inputs(0),
- _axis(Window::DimZ)
-{
-}
-
-void GCConcatenateLayer::configure(std::vector<IGCTensor *> inputs_vector, IGCTensor *output, size_t axis)
-{
- ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2);
-
- _num_inputs = inputs_vector.size();
- _axis = axis;
-
- TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, axis);
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, inputs_vector[0]->info()->data_type());
-
- unsigned int offset = 0;
- switch(axis)
- {
- case Window::DimZ:
- {
- for(unsigned int i = 0; i < _num_inputs; ++i)
- {
- auto kernel = std::make_unique<GCDepthConcatenateLayerKernel>();
- kernel->configure(inputs_vector.at(i), offset, output);
- offset += inputs_vector.at(i)->info()->dimension(axis);
- _concat_kernels.emplace_back(std::move(kernel));
- }
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Axis not supported");
- }
-}
-
-void GCConcatenateLayer::run()
-{
- for(auto &kernel : _concat_kernels)
- {
- GCScheduler::get().dispatch(*kernel, true);
- }
-}
-} // namespace arm_compute
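
The configure() loop above assigns each input a running offset along the concatenation axis. A small standalone sketch of that bookkeeping, with illustrative input depths:

#include <cstdio>
#include <vector>

int main()
{
    // Depths of three inputs concatenated along DimZ.
    const std::vector<unsigned int> depths = { 16, 32, 16 };

    unsigned int offset = 0;
    for(size_t i = 0; i < depths.size(); ++i)
    {
        // Kernel i writes its slice starting at 'offset' in the output.
        std::printf("input %zu -> output offset %u\n", i, offset);
        offset += depths[i];
    }
    // Total output depth: 64.
    std::printf("output depth %u\n", offset);
    return 0;
}
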
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
deleted file mode 100644
index 93a66f012e..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
-
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-#include <cmath>
-#include <tuple>
-
-using namespace arm_compute;
-
-GCConvolutionLayerReshapeWeights::GCConvolutionLayerReshapeWeights()
- : _weights_reshape_kernel()
-{
-}
-
-void GCConvolutionLayerReshapeWeights::configure(const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(weights->info()->data_type()));
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
- }
-
- const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
- const IGCTensor *biases_to_use = (append_biases) ? biases : nullptr;
-
- _weights_reshape_kernel.configure(weights, biases_to_use, output);
-}
-
-void GCConvolutionLayerReshapeWeights::run()
-{
- GCScheduler::get().dispatch(_weights_reshape_kernel);
-}
-
-GCConvolutionLayer::GCConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _mm_gemm(), _output_col2im_kernel(), _fill_border(), _activationlayer_function(), _original_weights(nullptr),
- _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_activationlayer_enabled(false), _is_prepared(false)
-{
-}
-
-void GCConvolutionLayer::configure_mm(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), output->info()));
-
- _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */));
-}
-
-Status GCConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output)
-{
- // Perform validation step on the Matrix multiply function
- ARM_COMPUTE_RETURN_ON_ERROR(GCGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */)));
- return Status{};
-}
-
-void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
- ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
- ARM_COMPUTE_ERROR_ON(num_groups > 1);
- ARM_COMPUTE_UNUSED(num_groups);
-
- _is_prepared = false;
- _original_weights = weights;
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != weights->info()->dimension(3));
- ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
- }
-
- const DataType dt = input->info()->data_type();
-
- // Set the GPU target for im2col and col2im
- _input_im2col_kernel.set_target(GCScheduler::get().get_target());
- _output_col2im_kernel.set_target(GCScheduler::get().get_target());
-
- const bool append_bias = (biases != nullptr);
- const unsigned bias_element = (append_bias) ? 1 : 0;
- const IGCTensor *biases_to_use = (append_bias) ? biases : nullptr;
-
- // Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
-
- const unsigned int kernel_width = weights->info()->dimension(0);
- const unsigned int kernel_height = weights->info()->dimension(1);
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_width, kernel_height,
- conv_info, dilation);
-
- unsigned int mat_weights_cols = weights->info()->dimension(3);
- unsigned int mat_weights_rows = weights->info()->dimension(0) * weights->info()->dimension(1) * weights->info()->dimension(2) + bias_element;
-
- // _weights_reshaped will be auto configured in the kernel.
- // Just append biases and do not transpose 1xW as it will be reshaped in GCGEMM
- _reshape_weights.configure(weights, biases_to_use, &_weights_reshaped);
-
- weights = &_weights_reshaped;
-
- // Create tensor to store im2col reshaped inputs
- const unsigned int mat_input_cols = mat_weights_rows;
- const unsigned int mat_input_rows = conv_w * conv_h;
- TensorShape shape_im2col = input->info()->tensor_shape();
- shape_im2col.set(0, mat_input_cols);
- shape_im2col.set(1, mat_input_rows);
- shape_im2col.set(2, 1);
-
- // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
- TensorInfo im2col_reshaped_info(shape_im2col, 1, dt);
- _input_im2col_reshaped.allocator()->init(im2col_reshaped_info);
- _memory_group.manage(&_input_im2col_reshaped);
-
- // Create GEMM output tensor
- TensorShape shape_gemm = _input_im2col_reshaped.info()->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, mat_input_rows);
- const DataType gemm_data_type = dt;
-
- // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
- TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
- _gemm_output.allocator()->init(info_gemm);
- _memory_group.manage(&_gemm_output);
-
- if(dt == DataType::F16)
- {
- BorderSize border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
- input->info()->extend_padding(border_size);
- _fill_border.configure(input, border_size, BorderMode::CONSTANT, PixelValue()); // Treat the FP16 im2col padding as a constant border
- }
- // Configure im2col
- _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation);
-
- // Configure GEMM
- configure_mm(&_input_im2col_reshaped, weights, &_gemm_output);
-
- _input_im2col_reshaped.allocator()->allocate();
-
- // Configure Col2Im
- _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
- _gemm_output.allocator()->allocate();
-
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
-
- // Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled();
-
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
- }
-
- ARM_COMPUTE_UNUSED(weights_info);
-}
-
-void GCConvolutionLayer::run()
-{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Run im2col
- GCScheduler::get().dispatch(_fill_border);
- GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(_input_im2col_kernel);
-
- // Run gemm on reshaped matrices
- _mm_gemm.run();
- GCScheduler::get().memory_barrier();
-
- // Reshape output matrix
- GCScheduler::get().dispatch(_output_col2im_kernel, false);
- GCScheduler::get().memory_barrier();
-
- // Run Activation Layer
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.run();
- }
-}
-
-void GCConvolutionLayer::prepare()
-{
- if(!_is_prepared)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- // Run weights reshaping and mark as unused
- _weights_reshaped.allocator()->allocate();
- _reshape_weights.run();
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
-
- _is_prepared = true;
- }
-}
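
The im2col/GEMM shape plumbing in configure() is easiest to follow with concrete numbers. Below is a standalone arithmetic sketch for a hypothetical 3x3, stride-1, pad-1 convolution; the shapes are illustrative, the formulas are the ones used above.

#include <cstdio>

int main()
{
    const unsigned int in_w = 32, in_h = 32, in_c = 16; // input W x H x C
    const unsigned int k_w = 3, k_h = 3, n_filters = 64;
    const unsigned int stride = 1, pad = 1;
    const unsigned int bias_element = 1; // bias appended to the reshaped weights

    // scaled_dimensions() for this configuration keeps the spatial size.
    const unsigned int conv_w = (in_w + 2 * pad - k_w) / stride + 1; // 32
    const unsigned int conv_h = (in_h + 2 * pad - k_h) / stride + 1; // 32

    const unsigned int mat_weights_rows = k_w * k_h * in_c + bias_element; // 145
    const unsigned int mat_weights_cols = n_filters;                       // 64
    const unsigned int mat_input_cols   = mat_weights_rows;                // im2col width
    const unsigned int mat_input_rows   = conv_w * conv_h;                 // 1024

    std::printf("im2col: %u x %u, reshaped weights: %u x %u, GEMM output: %u x %u\n",
                mat_input_cols, mat_input_rows,
                mat_weights_rows, mat_weights_cols,
                mat_weights_cols, mat_input_rows);
    return 0;
}
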
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
deleted file mode 100644
index 46d5cc40d9..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-using namespace arm_compute;
-
-GCDepthwiseConvolutionLayer3x3::GCDepthwiseConvolutionLayer3x3()
- : _kernel(nullptr), _border_handler(), _shift_handler(), _activationlayer_function(), _is_activationlayer_enabled(false)
-{
-}
-
-void GCDepthwiseConvolutionLayer3x3::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
-{
- ARM_COMPUTE_ERROR_ON(dilation.x() != 1 || dilation.y() != 1);
- ARM_COMPUTE_UNUSED(dilation);
- auto k = std::make_unique<GCDepthwiseConvolutionLayer3x3Kernel>();
- k->configure(input, weights, biases, output, conv_info, depth_multiplier);
- _kernel = std::move(k);
-
- // Configure border handler
- _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
-
- _shift_handler.configure(input);
-
- // Configure Activation Layer
- _is_activationlayer_enabled = act_info.enabled();
-
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.configure(output, nullptr, act_info);
- }
-}
-
-void GCDepthwiseConvolutionLayer3x3::run()
-{
- GCScheduler::get().dispatch(_shift_handler, false);
- GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(_border_handler, false);
- GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(*_kernel);
-
- // Run Activation Layer
- if(_is_activationlayer_enabled)
- {
- _activationlayer_function.run();
- }
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
deleted file mode 100644
index 63c963196a..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-using namespace arm_compute;
-
-GCDirectConvolutionLayer::GCDirectConvolutionLayer()
- : _kernel(nullptr), _border_handler(), _shift_handler()
-{
-}
-
-void GCDirectConvolutionLayer::configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info)
-{
- int kernel_size = weights->info()->dimension(0);
-
- if(kernel_size == 1)
- {
- auto k = std::make_unique<GCDirectConvolutionLayer1x1Kernel>();
- k->configure(input, weights, biases, output, conv_info, act_info);
- _kernel = std::move(k);
- }
- else if(kernel_size == 3)
- {
- auto k = std::make_unique<GCDirectConvolutionLayer3x3Kernel>();
- k->configure(input, weights, biases, output, conv_info, act_info);
- _kernel = std::move(k);
- }
- else if(kernel_size == 5)
- {
- auto k = std::make_unique<GCDirectConvolutionLayer5x5Kernel>();
- k->configure(input, weights, biases, output, conv_info, act_info);
- _kernel = std::move(k);
- }
- else
- {
- ARM_COMPUTE_ERROR("kernel size unsupported!");
- return;
- }
-
- _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue());
-
- _shift_handler.configure(input);
-}
-
-void GCDirectConvolutionLayer::run()
-{
- GCScheduler::get().dispatch(_shift_handler, false);
- GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(_border_handler, false);
- GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(*_kernel);
- GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(_shift_handler);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
deleted file mode 100644
index 661bf5f9d6..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h"
-
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
-
-using namespace arm_compute;
-
-GCDropoutLayer::GCDropoutLayer()
- : _dropout_kernel()
-{
-}
-
-void GCDropoutLayer::configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, mask, output);
-
- // Configure kernel
- _dropout_kernel.configure(input, mask, output, ratio, forward);
-}
-
-void GCDropoutLayer::run()
-{
- GCScheduler::get().dispatch(_dropout_kernel);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp b/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp
deleted file mode 100644
index 97b4fd946c..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
-#include "arm_compute/core/Helpers.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void GCFillBorder::configure(IGCTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
-{
- auto k = std::make_unique<GCFillBorderKernel>();
- k->configure(tensor, BorderSize(border_width), border_mode, constant_border_value);
- _kernel = std::move(k);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
deleted file mode 100644
index 299a027b42..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h"
-
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-#include <algorithm>
-
-using namespace arm_compute;
-
-void GCFullyConnectedLayerReshapeWeights::configure(const IGCTensor *input, IGCTensor *output)
-{
- auto k = std::make_unique<GCTransposeKernel>();
- k->configure(input, output);
- _kernel = std::move(k);
-}
-
-GCFullyConnectedLayer::GCFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
- : _memory_group(std::move(memory_manager)), _weights_manager(std::move(weights_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(),
- _reshape_weights_output(), _original_weights(nullptr), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
-{
-}
-
-void GCFullyConnectedLayer::configure_conv_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output)
-{
- ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))));
-
- const DataType dt = input->info()->data_type();
-
- // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
-
- // Initialize output tensor for im2col
- TensorShape shape_im2col;
- shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2));
- shape_im2col.set(1, input->info()->dimension(3));
- shape_im2col.set(2, input->info()->dimension(4));
- shape_im2col.set(3, input->info()->dimension(5));
- _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt));
-
- // Configure im2col kernel
- _memory_group.manage(&_im2col_output);
- _im2col_kernel.configure(input, &_im2col_output, Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false);
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false);
-
- // Allocate the output tensor for im2col once all the configure methods have been called
- _im2col_output.allocator()->allocate();
-}
-
-void GCFullyConnectedLayer::configure_fc_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output)
-{
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1));
-
- // Configure matrix multiply kernel
- _mm_kernel.configure(input, weights, output, 1.0f, false);
-}
-
-void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output,
- FullyConnectedLayerInfo fc_info)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
- ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
-
- _original_weights = weights;
- _are_weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
- _is_fc_after_conv = true;
- _accumulate_biases = false;
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-
- _accumulate_biases = true;
-
- // Configure accumulate biases kernel
- _accumulate_biases_kernel.configure(output, biases);
- }
-
- // With the Fully Connected layer we can have 4 different cases:
- // 1) Convolution layer -> Fully Connected layer without batches
- // 2) Fully Connected layer -> Fully Connected layer without batches
- // 3) Convolution layer -> Fully Connected layer with batches
- // 4) Fully Connected layer -> Fully Connected layer with batches
-
- const IGCTensor *weights_to_use = weights;
-
- if(!_are_weights_reshaped)
- {
- weights_to_use = &_reshape_weights_output;
-
- // Reshape the weights
- _reshape_weights_kernel.configure(weights, &_reshape_weights_output);
- }
-
- // Check if we have a fully connected layer with batches
- const bool is_batched_fc_layer = output->info()->dimension(1) > 1;
-
- if(is_batched_fc_layer)
- {
- _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3,
- input->info()->tensor_shape().cend(),
- output->info()->tensor_shape().cbegin() + 1));
- }
- else
- {
- _is_fc_after_conv = input->info()->num_dimensions() > 1;
- }
-
- if(_is_fc_after_conv)
- {
- // Fully Connected layer after a Convolution Layer without batches
- configure_conv_fc(input, weights_to_use, output);
- }
- else
- {
- // Fully Connected layer after a Fully Connected Layer without batches
- configure_fc_fc(input, weights_to_use, output);
- }
-
- ARM_COMPUTE_ERROR_ON(fc_info.retain_internal_weights && _reshape_weights_output.gc_buffer() == 0);
- _are_weights_reshaped = _are_weights_reshaped || fc_info.retain_internal_weights;
-}
-
-void GCFullyConnectedLayer::run()
-{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Linearize input if it comes from a convolutional layer
- if(_is_fc_after_conv)
- {
- GCScheduler::get().dispatch(_im2col_kernel, false);
- }
-
- if(!_are_weights_reshaped || _is_fc_after_conv)
- {
- GCScheduler::get().memory_barrier();
- }
-
- // Run matrix multiply
- GCScheduler::get().dispatch(_mm_kernel, !_accumulate_biases);
-
- // Accumulate biases if provided
- if(_accumulate_biases)
- {
- GCScheduler::get().memory_barrier();
-
- GCScheduler::get().dispatch(_accumulate_biases_kernel);
- }
-}
-
-void GCFullyConnectedLayer::prepare()
-{
- // Reshape of the weights (happens only once)
- if(!_are_weights_reshaped)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- // Run reshape weights kernel and mark weights as unused
- _reshape_weights_output.allocator()->allocate();
- _reshape_weights_kernel.run();
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
-
- _are_weights_reshaped = true;
- }
-}
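
Of the four cases listed in configure(), the batched detection reduces to comparing the trailing input dimensions with the trailing output dimensions. A simplified standalone sketch of that decision follows; the shapes are illustrative and the dimension comparison is collapsed to the single batch axis.

#include <cstdio>

int main()
{
    // Input (W, H, C, N) coming from a convolution, batch size 4; output (classes, N).
    const unsigned int input_shape[4]  = { 7, 7, 64, 4 };
    const unsigned int output_shape[2] = { 1000, 4 };

    const bool is_batched_fc_layer = output_shape[1] > 1;

    bool is_fc_after_conv = false;
    if(is_batched_fc_layer)
    {
        // Batched: input dims beyond the first three must match the output batches.
        is_fc_after_conv = (input_shape[3] == output_shape[1]);
    }
    else
    {
        // Unbatched: any multi-dimensional input must be linearized first.
        is_fc_after_conv = true; // here the input has four dimensions
    }

    std::printf("batched=%d, fc_after_conv=%d\n", is_batched_fc_layer, is_fc_after_conv);
    return 0;
}
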
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
deleted file mode 100644
index a5a26f4bb9..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const IGCTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info = GEMMInfo())
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, output);
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-
- if(c != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c->info());
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B");
- }
-
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != output->dimension(0), "The output matrix must have the same number of columns as the matrix B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != output->dimension(1), "The output matrix must have the same number of rows as the matrix A");
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-
- ARM_COMPUTE_UNUSED(alpha);
- ARM_COMPUTE_UNUSED(beta);
- ARM_COMPUTE_UNUSED(gemm_info);
- return Status{};
-}
-} // namespace
-
-GCGEMM::GCGEMM(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
- _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
-{
-}
-
-void GCGEMM::configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *c, IGCTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(a->info(), b->info(), c, output->info(), alpha, beta, gemm_info));
-
- // Check if we need to reshape the matrix B only on the first run
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
- _is_prepared = false;
- _original_b = b;
-
- const IGCTensor *matrix_a = a;
- const IGCTensor *matrix_b = b;
-
- // Get the GPU target
- const GPUTarget gpu_target = GCScheduler::get().get_target();
-
- // Set the target for the kernels
- _interleave_kernel.set_target(gpu_target);
- _mm_kernel.set_target(gpu_target);
-
- // Arguments used by GEMMReshapeInfo
- // If we pass the matrix A and matrix B reshaped to GCGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
- // in order to know how the matrices have been reshaped
- const int m = a->info()->dimension(1);
- const int n = b->info()->dimension(0);
- const int k = a->info()->dimension(0);
- int mult_transpose1xW_width = 1;
- int mult_interleave4x4_height = 1;
-
- // If matrix A has 16 rows or fewer, we run a special version of GEMM without reshaping the input tensors
- _is_interleaved_transposed = a->info()->dimension(1) > 16;
-
- if(_is_interleaved_transposed)
- {
- matrix_a = &_tmp_a;
- matrix_b = &_tmp_b;
-
- // Manage intermediate buffers
- _memory_group.manage(&_tmp_a);
- if(!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_tmp_b);
- }
- // _tmp_a and _tmp_b will be auto configured in _interleave_kernel and in _transpose_kernel
-
- // Configure interleave kernel
- _interleave_kernel.configure(a, &_tmp_a);
-
- // Configure transpose kernel
- _transpose_kernel.configure(b, &_tmp_b);
- }
-
- _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed, GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height));
-
- if(_is_interleaved_transposed)
- {
- // Allocate intermediate tensors
- _tmp_a.allocator()->allocate();
- if(!_reshape_b_only_on_first_run)
- {
- _tmp_b.allocator()->allocate();
- }
- }
-
- // Configure matrix addition kernel
- if(beta != 0 && c != nullptr)
- {
- _ma_kernel.configure(c, output, beta);
- _run_addition = true;
- }
-}
-
-Status GCGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const IGCTensor *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(a, b, c, output, alpha, beta, gemm_info));
- return Status{};
-}
-
-void GCGEMM::run()
-{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- if(_is_interleaved_transposed)
- {
- // Run interleave kernel
- GCScheduler::get().dispatch(_interleave_kernel, false);
-
- if(!_reshape_b_only_on_first_run)
- {
- // Run transpose kernel
- GCScheduler::get().dispatch(_transpose_kernel, false);
- }
-
- GCScheduler::get().memory_barrier();
- }
-
- // Run matrix multiply kernel
- GCScheduler::get().dispatch(_mm_kernel, !_run_addition);
-
- // Run matrix addition kernel
- if(_run_addition)
- {
- GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(_ma_kernel);
- }
-}
-
-void GCGEMM::prepare()
-{
- if(!_is_prepared)
- {
- if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
- {
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
-
- // Run transpose kernel
- _tmp_b.allocator()->allocate();
- GCScheduler::get().dispatch(_transpose_kernel, false);
- GCScheduler::get().memory_barrier();
-
- // Mark original weights tensor as unused
- _original_b->mark_as_unused();
- }
-
- _is_prepared = true;
- }
-}
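
The shape rules enforced by validate_arguments() can be checked in isolation. Below is a minimal sketch using ACL's convention that dimension(0) is columns and dimension(1) is rows; the struct and helper are hypothetical, the rules mirror the checks above for output = alpha*A*B + beta*C.

#include <cassert>

struct Shape2D { int cols, rows; }; // dimension(0) = cols, dimension(1) = rows

// Mirrors the shape checks in validate_arguments().
bool gemm_shapes_ok(Shape2D a, Shape2D b, Shape2D c, Shape2D out)
{
    if(a.cols != b.rows) return false;                          // AB defined: K matches
    if(c.rows != a.rows || c.cols != b.cols) return false;      // C is M x N
    if(out.rows != a.rows || out.cols != b.cols) return false;  // output is M x N
    return true;
}

int main()
{
    // A: K=32 cols, M=24 rows; B: N=8 cols, K=32 rows; C/output: 8 x 24 -- valid.
    assert(gemm_shapes_ok({ 32, 24 }, { 8, 32 }, { 8, 24 }, { 8, 24 }));
    // Inner dimensions disagree: rejected.
    assert(!gemm_shapes_ok({ 32, 24 }, { 8, 16 }, { 8, 24 }, { 8, 24 }));
    return 0;
}
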
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp
deleted file mode 100644
index c1287f7e9c..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
-
-using namespace arm_compute;
-
-void GCGEMMInterleave4x4::configure(const IGCTensor *input, IGCTensor *output)
-{
- auto k = std::make_unique<GCGEMMInterleave4x4Kernel>();
- k->configure(input, output);
- _kernel = std::move(k);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp
deleted file mode 100644
index d085357eaa..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/Types.h"
-
-using namespace arm_compute;
-
-void GCGEMMTranspose1xW::configure(const IGCTensor *input, IGCTensor *output)
-{
- auto k = std::make_unique<GCGEMMTranspose1xWKernel>();
- k->configure(input, output);
- _kernel = std::move(k);
-}
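Transpose1xW is the companion reshape for the right-hand operand: the matrix is transposed in 1xW chunks, with W sized so that a chunk fills a 16-byte vector (W = 4 for F32). A scalar sketch, assuming the column count is a multiple of W:

void transpose1x4(const float *src, float *dst, int H, int W_cols)
{
    const int blocks = W_cols / 4; // Number of 1x4 chunks per source row
    for(int y = 0; y < H; ++y)
    {
        for(int b = 0; b < blocks; ++b)
        {
            for(int i = 0; i < 4; ++i)
            {
                // Chunk (y, b) of the source lands at block position (b, y)
                dst[(b * H + y) * 4 + i] = src[y * W_cols + b * 4 + i];
            }
        }
    }
}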
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
deleted file mode 100644
index c4bf141446..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-using namespace arm_compute;
-
-GCNormalizationLayer::GCNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler()
-{
-}
-
-void GCNormalizationLayer::configure(const IGCTensor *input, IGCTensor *output, const NormalizationLayerInfo &norm_info)
-{
- ARM_COMPUTE_ERROR_ON(input == nullptr);
-
- _squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type()));
- _memory_group.manage(&_squared_input);
-
- _norm_kernel.configure(input, &_squared_input, output, norm_info);
- _multiply_kernel.configure(input, input, &_squared_input, 1.0f);
- // Fill the border with 3 elements, since the IN_MAP normalization kernel reads with vload4
- _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue());
-
- // Allocate intermediate buffers
- _squared_input.allocator()->allocate();
-}
-
-void GCNormalizationLayer::run()
-{
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- GCScheduler::get().dispatch(_multiply_kernel, false);
- GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(_border_handler, false);
- GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(_norm_kernel, true);
-}
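The three dispatches implement local response normalization: the multiply kernel squares the input, the border handler pads the squared tensor so the IN_MAP kernel can safely read with vload4 near the edges, and the norm kernel combines input and squared input. A scalar sketch of the standard cross-map formula, with kappa, alpha, beta and norm_size mirroring the fields of NormalizationLayerInfo:

#include <algorithm>
#include <cmath>

// out[c] = in[c] / (kappa + alpha/norm_size * sum of squares in window)^beta
void lrn_cross_map(const float *in, float *out, int C,
                   int norm_size, float alpha, float beta, float kappa)
{
    const int half = norm_size / 2;
    for(int c = 0; c < C; ++c)
    {
        float sum = 0.f;
        for(int j = std::max(0, c - half); j <= std::min(C - 1, c + half); ++j)
        {
            sum += in[j] * in[j];
        }
        out[c] = in[c] / std::pow(kappa + (alpha / norm_size) * sum, beta);
    }
}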
diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
deleted file mode 100755
index 3e677b5d97..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.cpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-using namespace arm_compute;
-
-GCNormalizePlanarYUVLayer::GCNormalizePlanarYUVLayer()
- : _norm_kernel()
-{
-}
-
-void GCNormalizePlanarYUVLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *std)
-{
- _norm_kernel.configure(input, output, mean, std);
-}
-
-Status GCNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *std)
-{
- return GCNormalizePlanarYUVLayerKernel::validate(input, output, mean, std);
-}
-
-void GCNormalizePlanarYUVLayer::run()
-{
- GCScheduler::get().dispatch(_norm_kernel, true);
-}
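The single kernel performs a per-channel standardization, out = (in - mean) / std. A scalar sketch, assuming a planar CHW layout:

// mean and std_dev hold one value per channel
void normalize_planar_yuv(const float *in, float *out, int C, int HW,
                          const float *mean, const float *std_dev)
{
    for(int c = 0; c < C; ++c)
    {
        for(int i = 0; i < HW; ++i)
        {
            out[c * HW + i] = (in[c * HW + i] - mean[c]) / std_dev[c];
        }
    }
}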
diff --git a/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp b/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp
deleted file mode 100755
index ce50a63e53..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void GCPixelWiseMultiplication::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale, const ActivationLayerInfo &act_info)
-{
- ARM_COMPUTE_UNUSED(act_info);
- auto k = std::make_unique<GCPixelWiseMultiplicationKernel>();
- k->configure(input1, input2, output, scale);
- _kernel = std::move(k);
-}
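The wrapped kernel computes an elementwise product with a scale factor; act_info is accepted for interface compatibility but unused, as the ARM_COMPUTE_UNUSED above makes explicit. A scalar sketch:

// out[i] = in1[i] * in2[i] * scale
void pixelwise_mul(const float *in1, const float *in2, float *out, int n, float scale)
{
    for(int i = 0; i < n; ++i)
    {
        out[i] = in1[i] * in2[i] * scale;
    }
}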
diff --git a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
deleted file mode 100644
index 6a71fbebe7..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-namespace arm_compute
-{
-GCPoolingLayer::GCPoolingLayer()
- : _kernel(nullptr), _border_handler(), _shift_handler()
-{
-}
-
-void GCPoolingLayer::configure(IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info, IGCTensor *indices)
-{
- // Configure pooling kernel
- auto k = std::make_unique<GCPoolingLayerKernel>();
- k->configure(input, output, pool_info, indices);
- _kernel = std::move(k);
-
- // Configure border depending on operation required
- BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0.0f));
-
- _shift_handler.configure(input);
-}
-
-Status GCPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
-{
- return GCPoolingLayerKernel::validate(input, output, pool_info, indices);
-}
-
-void GCPoolingLayer::run()
-{
- GCScheduler::get().dispatch(_shift_handler, false);
- GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(_border_handler, false);
- GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(*_kernel);
-}
-} // namespace arm_compute
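The border-mode choice above is a correctness detail: max pooling replicates the edge pixel, so padded reads can never exceed the true window maximum, while average pooling pads with constant zero. A scalar sketch of max pooling with replicate semantics at one output position:

#include <algorithm>
#include <limits>

float max_pool_at(const float *in, int H, int W, int y0, int x0, int pool)
{
    float m = -std::numeric_limits<float>::infinity();
    for(int dy = 0; dy < pool; ++dy)
    {
        for(int dx = 0; dx < pool; ++dx)
        {
            // Clamp out-of-range coordinates to the nearest valid pixel
            const int y = std::clamp(y0 + dy, 0, H - 1);
            const int x = std::clamp(x0 + dx, 0, W - 1);
            m           = std::max(m, in[y * W + x]);
        }
    }
    return m;
}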
diff --git a/src/runtime/GLES_COMPUTE/functions/GCScale.cpp b/src/runtime/GLES_COMPUTE/functions/GCScale.cpp
deleted file mode 100644
index 225bb4131f..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCScale.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h"
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-void GCScale::configure(IGCTensor *input, IGCTensor *output, const ScaleKernelInfo &info)
-{
- auto k = std::make_unique<GCScaleKernel>();
- k->configure(input, output, info);
- _kernel = std::move(k);
- _border_handler.configure(input, _kernel->border_size(), info.border_mode, info.constant_border_value);
-}
-} // namespace arm_compute
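configure() delegates the work to GCScaleKernel plus a border handler driven by ScaleKernelInfo. As a reminder of the simplest interpolation policy, a nearest-neighbour sketch (which policies the real kernel accepted is determined by ScaleKernelInfo, not by this sketch):

#include <algorithm>

void scale_nearest(const float *src, float *dst, int H, int W, int oH, int oW)
{
    for(int y = 0; y < oH; ++y)
    {
        for(int x = 0; x < oW; ++x)
        {
            // Map each output pixel back to its nearest source pixel
            const int sy = std::min(static_cast<int>(y * H / static_cast<float>(oH)), H - 1);
            const int sx = std::min(static_cast<int>(x * W / static_cast<float>(oW)), W - 1);
            dst[y * oW + x] = src[sy * W + sx];
        }
    }
}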
diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
deleted file mode 100644
index fdb9a42f13..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-namespace arm_compute
-{
-GCSoftmaxLayer::GCSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp()
-{
-}
-
-void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output, float beta, int32_t axis)
-{
- ARM_COMPUTE_UNUSED(beta, axis);
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON(beta != 1.0f);
- ARM_COMPUTE_ERROR_ON_MSG(axis != 0, "axis must be 0 for GLES");
-
- // Create intermediate tensor shapes
- _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type()));
-
- TensorShape shape = input->info()->tensor_shape();
- shape.set(0, 1);
- TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type());
- _max.allocator()->init(tensor_info_max_sum);
- _sum.allocator()->init(tensor_info_max_sum);
-
- // Manage intermediate buffers
- _memory_group.manage(&_tmp);
- _memory_group.manage(&_max);
- _memory_group.manage(&_sum);
-
- // Configure Kernels
- _max_kernel.configure(input, &_max);
- _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum);
- _norm_kernel.configure(&_tmp, &_sum, output);
-
- // Allocate intermediate buffers
- _tmp.allocator()->allocate();
- _max.allocator()->allocate();
- _sum.allocator()->allocate();
-}
-
-void GCSoftmaxLayer::run()
-{
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- GCScheduler::get().dispatch(_max_kernel, false);
- GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(_shift_exp_sum_kernel, false);
- GCScheduler::get().memory_barrier();
- GCScheduler::get().dispatch(_norm_kernel);
-}
-
-} // namespace arm_compute
\ No newline at end of file
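The three kernels implement the standard numerically stable softmax: subtracting the row maximum before exponentiating keeps exp() in range without changing the result. A scalar sketch of the same three phases along one row (assumes n >= 1):

#include <algorithm>
#include <cmath>

void softmax_1d(const float *in, float *out, int n)
{
    float max_val = in[0];
    for(int i = 1; i < n; ++i)
    {
        max_val = std::max(max_val, in[i]); // Phase 1: row maximum
    }
    float sum = 0.f;
    for(int i = 0; i < n; ++i)
    {
        out[i] = std::exp(in[i] - max_val); // Phase 2: shifted exponentials and their sum
        sum += out[i];
    }
    for(int i = 0; i < n; ++i)
    {
        out[i] /= sum; // Phase 3: normalization
    }
}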
diff --git a/src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp b/src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp
deleted file mode 100644
index 050dc7e9f5..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCTensorShift.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h"
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Utils.h"
-
-using namespace arm_compute;
-
-void GCTensorShift::configure(IGCTensor *input)
-{
- auto k = std::make_unique<GCTensorShiftKernel>();
- k->configure(input);
- _kernel = std::move(k);
-}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp b/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp
deleted file mode 100644
index 14125e9db2..0000000000
--- a/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void GCTranspose::configure(const IGCTensor *input, IGCTensor *output)
-{
- auto k = std::make_unique<GCTransposeKernel>();
- k->configure(input, output);
- _kernel = std::move(k);
-}
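For completeness, the plain 2D transpose the wrapped kernel performs, as a scalar sketch:

void transpose(const float *src, float *dst, int H, int W)
{
    for(int y = 0; y < H; ++y)
    {
        for(int x = 0; x < W; ++x)
        {
            dst[x * H + y] = src[y * W + x];
        }
    }
}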