From 7068f9900d136312318ff430aef588b14e0c87ad Mon Sep 17 00:00:00 2001 From: Anthony Barbier Date: Thu, 26 Oct 2017 15:23:08 +0100 Subject: COMPMID-631: Merge branches/gles_compute branch Last commit: commit b25c5f68042b0c81bf611d59a1bb8535e1c42497 Author: Xinghang Zhou Date: Wed Oct 25 18:48:10 2017 +0800 Synced validation's tolerances of GCSoftmax from cl side Change-Id: Ibe72054205c1c8721845d679a31af7ed0a7c5cf6 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/93283 Reviewed-by: Anthony Barbier Tested-by: Kaizen --- src/core/CL/cl_kernels/direct_convolution1x1.cl | 4 +- src/core/CL/cl_kernels/direct_convolution3x3.cl | 4 +- src/core/CL/cl_kernels/direct_convolution5x5.cl | 4 +- src/core/Error.cpp | 18 +- src/core/GLES_COMPUTE/GCKernelLibrary.cpp | 716 +++++++++ src/core/GLES_COMPUTE/IGCKernel.cpp | 157 ++ src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp | 51 + src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp | 52 + src/core/GLES_COMPUTE/IGCSimpleKernel.cpp | 54 + src/core/GLES_COMPUTE/IGCTensor.cpp | 54 + src/core/GLES_COMPUTE/OpenGLES.cpp | 820 ++++++++++ src/core/GLES_COMPUTE/cs_shaders/absdiff.cs | 71 + .../GLES_COMPUTE/cs_shaders/activation_layer.cs | 262 ++++ .../cs_shaders/batchnormalization_layer.cs | 222 +++ src/core/GLES_COMPUTE/cs_shaders/concatenate.cs | 106 ++ .../GLES_COMPUTE/cs_shaders/convolution_layer.cs | 302 ++++ .../cs_shaders/direct_convolution1x1.cs | 275 ++++ .../cs_shaders/direct_convolution3x3.cs | 1583 ++++++++++++++++++++ .../cs_shaders/direct_convolution5x5.cs | 313 ++++ src/core/GLES_COMPUTE/cs_shaders/dropout.cs | 204 +++ src/core/GLES_COMPUTE/cs_shaders/fill_border.cs | 553 +++++++ src/core/GLES_COMPUTE/cs_shaders/gemm.cs | 623 ++++++++ src/core/GLES_COMPUTE/cs_shaders/helpers.h | 582 +++++++ .../GLES_COMPUTE/cs_shaders/normalization_layer.cs | 157 ++ .../GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs | 75 + src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs | 1444 ++++++++++++++++++ src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs | 541 +++++++ src/core/GLES_COMPUTE/cs_shaders/transpose.cs | 187 +++ src/core/GLES_COMPUTE/egl_entries.in | 35 + src/core/GLES_COMPUTE/gl_entries.in | 63 + .../kernels/GCAbsoluteDifferenceKernel.cpp | 112 ++ .../kernels/GCActivationLayerKernel.cpp | 128 ++ .../kernels/GCBatchNormalizationLayerKernel.cpp | 129 ++ src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp | 101 ++ .../kernels/GCDepthConcatenateKernel.cpp | 145 ++ .../kernels/GCDirectConvolutionLayerKernel.cpp | 394 +++++ src/core/GLES_COMPUTE/kernels/GCDropoutKernel.cpp | 110 ++ .../GLES_COMPUTE/kernels/GCFillBorderKernel.cpp | 169 +++ .../kernels/GCGEMMInterleave4x4Kernel.cpp | 129 ++ .../kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp | 123 ++ .../kernels/GCGEMMMatrixAdditionKernel.cpp | 104 ++ .../kernels/GCGEMMMatrixMultiplyKernel.cpp | 210 +++ .../kernels/GCGEMMTranspose1xWKernel.cpp | 128 ++ src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp | 230 +++ .../kernels/GCNormalizationLayerKernel.cpp | 124 ++ .../kernels/GCPixelWiseMultiplicationKernel.cpp | 127 ++ .../GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp | 254 ++++ .../GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp | 353 +++++ .../GLES_COMPUTE/kernels/GCTransposeKernel.cpp | 116 ++ src/core/Helpers.cpp | 7 + src/core/Utils.cpp | 3 +- src/runtime/CL/functions/CLNormalizationLayer.cpp | 2 +- src/runtime/GLES_COMPUTE/GCScheduler.cpp | 61 + src/runtime/GLES_COMPUTE/GCTensor.cpp | 77 + src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp | 94 ++ src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp | 45 + .../functions/GCAbsoluteDifference.cpp | 
40 + .../GLES_COMPUTE/functions/GCActivationLayer.cpp | 37 + .../functions/GCBatchNormalizationLayer.cpp | 48 + .../GLES_COMPUTE/functions/GCDepthConcatenate.cpp | 69 + .../functions/GCDirectConvolutionLayer.cpp | 64 + .../GLES_COMPUTE/functions/GCDropoutLayer.cpp | 50 + .../GLES_COMPUTE/functions/GCFillBorder.cpp | 40 + .../functions/GCFullyConnectedLayer.cpp | 177 +++ src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp | 133 ++ .../GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp | 36 + .../GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp | 38 + .../functions/GCNormalizationLayer.cpp | 61 + .../functions/GCPixelWiseMultiplication.cpp | 38 + .../GLES_COMPUTE/functions/GCPoolingLayer.cpp | 42 + .../GLES_COMPUTE/functions/GCSoftmaxLayer.cpp | 66 + src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp | 38 + .../NEON/functions/NENormalizationLayer.cpp | 2 +- 73 files changed, 13971 insertions(+), 15 deletions(-) create mode 100644 src/core/GLES_COMPUTE/GCKernelLibrary.cpp create mode 100644 src/core/GLES_COMPUTE/IGCKernel.cpp create mode 100644 src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp create mode 100644 src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp create mode 100644 src/core/GLES_COMPUTE/IGCSimpleKernel.cpp create mode 100644 src/core/GLES_COMPUTE/IGCTensor.cpp create mode 100644 src/core/GLES_COMPUTE/OpenGLES.cpp create mode 100644 src/core/GLES_COMPUTE/cs_shaders/absdiff.cs create mode 100644 src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs create mode 100644 src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs create mode 100644 src/core/GLES_COMPUTE/cs_shaders/concatenate.cs create mode 100644 src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs create mode 100644 src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs create mode 100644 src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs create mode 100644 src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs create mode 100644 src/core/GLES_COMPUTE/cs_shaders/dropout.cs create mode 100644 src/core/GLES_COMPUTE/cs_shaders/fill_border.cs create mode 100755 src/core/GLES_COMPUTE/cs_shaders/gemm.cs create mode 100644 src/core/GLES_COMPUTE/cs_shaders/helpers.h create mode 100755 src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs create mode 100644 src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs create mode 100644 src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs create mode 100644 src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs create mode 100755 src/core/GLES_COMPUTE/cs_shaders/transpose.cs create mode 100644 src/core/GLES_COMPUTE/egl_entries.in create mode 100644 src/core/GLES_COMPUTE/gl_entries.in create mode 100644 src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCDropoutKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp create mode 100644 
src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp create mode 100644 src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp create mode 100644 src/runtime/GLES_COMPUTE/GCScheduler.cpp create mode 100644 src/runtime/GLES_COMPUTE/GCTensor.cpp create mode 100644 src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp create mode 100644 src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp create mode 100755 src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp create mode 100755 src/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp create mode 100755 src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp create mode 100644 src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp (limited to 'src') diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/direct_convolution1x1.cl index 7b73b85eac..484bc35ef1 100644 --- a/src/core/CL/cl_kernels/direct_convolution1x1.cl +++ b/src/core/CL/cl_kernels/direct_convolution1x1.cl @@ -153,7 +153,7 @@ inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_T * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr + * @param[out] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) @@ -241,7 +241,7 @@ __kernel void direct_convolution1x1( * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/direct_convolution3x3.cl index 1420d7c873..e6e3007c95 100644 --- a/src/core/CL/cl_kernels/direct_convolution3x3.cl +++ b/src/core/CL/cl_kernels/direct_convolution3x3.cl @@ -102,7 +102,7 @@ MULQ_SAT_IMPL(qs32x8, qs32x8) * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) @@ -198,7 +198,7 @@ __kernel void direct_convolution3x3( * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr + * @param[out] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/direct_convolution5x5.cl index 6fdd019a14..12cf0fb68e 100644 --- a/src/core/CL/cl_kernels/direct_convolution5x5.cl +++ b/src/core/CL/cl_kernels/direct_convolution5x5.cl @@ -91,7 +91,7 @@ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) @@ -197,7 +197,7 @@ __kernel void direct_convolution5x5( * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) diff --git a/src/core/Error.cpp b/src/core/Error.cpp index 2e699feeb9..3b0a012f5f 100644 --- a/src/core/Error.cpp +++ b/src/core/Error.cpp @@ -30,23 +30,29 @@ using namespace arm_compute; +Error arm_compute::create_error_va_list(ErrorCode error_code, const char *function, const char *file, const int line, const char *msg, va_list args) +{ + char out[512]; + int offset = snprintf(out, sizeof(out), "in %s %s:%d: ", function, file, line); + vsnprintf(out + offset, sizeof(out) - offset, msg, args); + + return Error(error_code, std::string(out)); +} + Error arm_compute::create_error(ErrorCode error_code, const char *function, const char *file, const int line, const char *msg, ...) { - char out[512]; va_list args; va_start(args, msg); - int offset = snprintf(out, sizeof(out), "in %s %s:%d: ", function, file, line); - vsnprintf(out + offset, sizeof(out) - offset, msg, args); + auto err = create_error_va_list(error_code, function, file, line, msg, args); va_end(args); - - return Error(error_code, std::string(out)); + return err; } void arm_compute::error(const char *function, const char *file, const int line, const char *msg, ...) 
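// Note: error() used to pass its va_list directly to the variadic
// create_error(...), which does not forward the caller's arguments correctly;
// the create_error_va_list() overload added above takes the va_list
// explicitly, so error() can forward it safely, mirroring the body below:
//     va_list args;
//     va_start(args, msg);
//     auto err = create_error_va_list(code, function, file, line, msg, args);
//     va_end(args);
// ("code" stands in for the ErrorCode used at each call site.)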
{ va_list args; va_start(args, msg); - auto err = create_error(ErrorCode::RUNTIME_ERROR, function, file, line, msg, args); + auto err = create_error_va_list(ErrorCode::RUNTIME_ERROR, function, file, line, msg, args); va_end(args); throw std::runtime_error(err.description()); } diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp new file mode 100644 index 0000000000..fd362f1665 --- /dev/null +++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp @@ -0,0 +1,716 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Utils.h" + +#include +#include +#include +#include +#include +#include + +using namespace arm_compute; + +GCProgram::GCProgram() + : _name(), _source() +{ +} + +GCProgram::GCProgram(std::string name, std::string source) + : _name(std::move(name)), _source(std::move(source)) +{ +} + +GLuint GCProgram::link_program(GLuint shader) +{ + GLuint program = ARM_COMPUTE_GL_CHECK(glCreateProgram()); + + GLint rvalue; + GLsizei length; + + ARM_COMPUTE_GL_CHECK(glAttachShader(program, shader)); + ARM_COMPUTE_GL_CHECK(glLinkProgram(program)); + ARM_COMPUTE_GL_CHECK(glDetachShader(program, shader)); + ARM_COMPUTE_GL_CHECK(glDeleteShader(shader)); + + // Check if there were some issues when linking the shader. 
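// Note: the shader object is detached and deleted right after glLinkProgram()
// above because the linked program no longer needs it; from here on only the
// program object is used. If GL_LINK_STATUS below reports a failure, the
// program info log is fetched and reported via ARM_COMPUTE_ERROR and 0 is
// returned instead of a usable program.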
+ ARM_COMPUTE_GL_CHECK(glGetProgramiv(program, GL_LINK_STATUS, &rvalue)); + + if(rvalue == 0) + { + ARM_COMPUTE_GL_CHECK(glGetProgramiv(program, GL_INFO_LOG_LENGTH, &length)); + + std::vector log(length); + ARM_COMPUTE_GL_CHECK(glGetProgramInfoLog(program, length, nullptr, log.data())); + ARM_COMPUTE_ERROR("Error: Linker log:\n%s\n", log.data()); + + return 0; + } + + ARM_COMPUTE_GL_CHECK(glUseProgram(program)); + + return program; +} + +GLuint GCProgram::compile_shader(const std::string &build_options) +{ + GLuint shader = ARM_COMPUTE_GL_CHECK(glCreateShader(GL_COMPUTE_SHADER)); + + const char *src[] + { + "#version 310 es\n", + build_options.c_str(), + _source.c_str() + }; + + ARM_COMPUTE_GL_CHECK(glShaderSource(shader, sizeof(src) / sizeof(src[0]), src, nullptr)); + + ARM_COMPUTE_GL_CHECK(glCompileShader(shader)); + + // Check if there were any issues when compiling the shader + GLint rvalue; + GLsizei length; + + ARM_COMPUTE_GL_CHECK(glGetShaderiv(shader, GL_COMPILE_STATUS, &rvalue)); + + if(rvalue == 0) + { + ARM_COMPUTE_GL_CHECK(glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &length)); + + std::vector log(length); + ARM_COMPUTE_GL_CHECK(glGetShaderInfoLog(shader, length, nullptr, log.data())); + +#ifdef ARM_COMPUTE_DEBUG_ENABLED + std::istringstream ss(_source); + std::stringstream output_stream; + std::string line; + size_t line_num = 1; + + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("GLES Shader build options:\n%s\n", build_options.c_str()); + while(std::getline(ss, line, '\n')) + { + output_stream << std::setw(6) << line_num << ": " << line << std::endl; + line_num++; + } + ARM_COMPUTE_LOG_INFO_STREAM_CORE("GLES Shader source code:" << output_stream.rdbuf()); +#endif /* ARM_COMPUTE_DEBUG_ENABLED */ + + ARM_COMPUTE_ERROR("Error: Compiler log:\n%s\n", log.data()); + + return 0; + } + + return shader; +} + +GCKernel::GCKernel() + : _name(), _program(), _params(), _shader_params(), _shader_params_binding_point(), _shader_params_index(), _shader_params_size() +{ +} + +GCKernel::GCKernel(std::string name, GLuint program) + : _name(std::move(name)), + _program(program), + _params(), + _shader_params(0), + _shader_params_binding_point(0), + _shader_params_index(0), + _shader_params_size(0) +{ + _params.clear(); + + ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_shader_params)); + + _shader_params_index = ARM_COMPUTE_GL_CHECK(glGetUniformBlockIndex(_program, _shader_params_name)); + ARM_COMPUTE_ERROR_ON_MSG((_shader_params_index == GL_INVALID_INDEX), "Failed to get index of %s", _shader_params_name); + ARM_COMPUTE_GL_CHECK(glGetActiveUniformBlockiv(_program, _shader_params_index, GL_UNIFORM_BLOCK_DATA_SIZE, &_shader_params_size)); + ARM_COMPUTE_ERROR_ON_MSG((_shader_params_size == 0), "Failed to get size of %s", _shader_params_name); +} + +void GCKernel::cleanup() +{ + ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, &_shader_params)); + ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, 0)); + ARM_COMPUTE_GL_CHECK(glDeleteProgram(_program)); + ARM_COMPUTE_GL_CHECK(glUseProgram(0)); +} + +void GCKernel::use() +{ + ARM_COMPUTE_GL_CHECK(glUseProgram(_program)); +} + +void GCKernel::unuse() +{ + ARM_COMPUTE_GL_CHECK(glUseProgram(0)); +} + +void GCKernel::update_shader_params() +{ + ARM_COMPUTE_ERROR_ON_MSG((_shader_params_size != (int)(_params.size() * sizeof(_params[0]))), "Params size (%d) is not equal to shader params block size (%d)", _params.size() * sizeof(_params[0]), + _shader_params_size); + + ARM_COMPUTE_GL_CHECK(glUniformBlockBinding(_program, _shader_params_index, _shader_params_binding_point)); + 
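// Note: _params is filled by IGCKernel::add_tensor_argument() later in this
// patch: for each tensor it pushes, per dimension, the stride in bytes and
// stride * window step, followed by the offset of the first element and the
// buffer data type shift. The calls below attach the uniform buffer to the
// block's binding point and re-upload that packed data, while the assert
// above compares its byte count against the GL_UNIFORM_BLOCK_DATA_SIZE
// queried in the GCKernel constructor, so host-side packing and the shader's
// uniform block layout cannot silently drift apart.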
ARM_COMPUTE_GL_CHECK(glBindBufferBase(GL_UNIFORM_BUFFER, _shader_params_binding_point, _shader_params)); + ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, _shader_params)); + ARM_COMPUTE_GL_CHECK(glBufferData(GL_UNIFORM_BUFFER, _shader_params_size, _params.data(), GL_DYNAMIC_DRAW)); + ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, 0)); +} + +const std::map GCKernelLibrary::_shader_program_map = +{ + { "absdiff", "absdiff.cs" }, + { "col2im", "convolution_layer.cs" }, + { "direct_convolution1x1", "direct_convolution1x1.cs" }, + { "direct_convolution3x3", "direct_convolution3x3.cs" }, + { "direct_convolution5x5", "direct_convolution5x5.cs" }, + { "pooling_layer_2", "pooling_layer.cs" }, + { "pooling_layer_3", "pooling_layer.cs" }, + { "pooling_layer_7", "pooling_layer.cs" }, + { "pooling_layer_3_optimized", "pooling_layer.cs" }, + { "pooling_layer_n", "pooling_layer.cs" }, + { "fill_image_borders_replicate", "fill_border.cs" }, + { "fill_image_borders_constant", "fill_border.cs" }, + { "gemm_accumulate_biases", "gemm.cs" }, + { "gemm_interleave4x4", "gemm.cs" }, + { "gemm_ma", "gemm.cs" }, + { "gemm_mm_interleaved_transposed", "gemm.cs" }, + { "gemm_mm_floating_point", "gemm.cs" }, + { "gemm_transpose1x4", "gemm.cs" }, + { "im2col_kernel3x3_padx0_pady0", "convolution_layer.cs" }, + { "im2col_generic", "convolution_layer.cs" }, + { "im2col_reduced", "convolution_layer.cs" }, + { "transpose", "transpose.cs" }, + { "activation_layer", "activation_layer.cs" }, + { "softmax_layer_max", "softmax_layer.cs" }, + { "softmax_layer_shift_exp_sum", "softmax_layer.cs" }, + { "softmax_layer_norm", "softmax_layer.cs" }, + { "pixelwise_mul_float", "pixelwise_mul_float.cs" }, + { "normalization_layer", "normalization_layer.cs" }, + { "batchnormalization_layer", "batchnormalization_layer.cs" }, + { "concatenate_depth", "concatenate.cs" }, + { "dropout", "dropout.cs" }, +}; + +const std::map GCKernelLibrary::_program_source_map = +{ +#ifdef EMBEDDED_KERNELS + { + "absdiff.cs", +#include "./cs_shaders/absdiff.csembed" + }, + { + "convolution_layer.cs", +#include "./cs_shaders/convolution_layer.csembed" + }, + { + "direct_convolution1x1.cs", +#include "./cs_shaders/direct_convolution1x1.csembed" + }, + { + "direct_convolution3x3.cs", +#include "./cs_shaders/direct_convolution3x3.csembed" + }, + { + "direct_convolution5x5.cs", +#include "./cs_shaders/direct_convolution5x5.csembed" + }, + { + "pooling_layer.cs", +#include "./cs_shaders/pooling_layer.csembed" + }, + { + "fill_border.cs", +#include "./cs_shaders/fill_border.csembed" + }, + { + "gemm.cs", +#include "./cs_shaders/gemm.csembed" + }, + { + "transpose.cs", +#include "./cs_shaders/transpose.csembed" + }, + { + "activation_layer.cs", +#include "./cs_shaders/activation_layer.csembed" + }, + { + "softmax_layer.cs", +#include "./cs_shaders/softmax_layer.csembed" + }, + { + "pixelwise_mul_float.cs", +#include "./cs_shaders/pixelwise_mul_float.csembed" + }, + { + "normalization_layer.cs", +#include "./cs_shaders/normalization_layer.csembed" + }, + { + "batchnormalization_layer.cs", +#include "./cs_shaders/batchnormalization_layer.csembed" + }, + { + "concatenate.cs", +#include "./cs_shaders/concatenate.csembed" + }, + { + "dropout.cs", +#include "./cs_shaders/dropout.csembed" + }, +#endif /* EMBEDDED_KERNELS */ +}; + +GCKernelLibrary::GCKernelLibrary() + : _display(EGL_NO_DISPLAY), _context(EGL_NO_CONTEXT), _frame_buffer(0), _tex_rt(0), _own_context(false), _shader_path("./"), _programs_map(), _built_programs_map() +{ +} + +GCKernelLibrary 
&GCKernelLibrary::get() +{ + static GCKernelLibrary _kernel_library; + return _kernel_library; +} + +GCKernel GCKernelLibrary::create_kernel(const std::string &shader_name, const StringSet &build_options_set) const +{ + // Find which program contains the kernel + auto shader_program_it = _shader_program_map.find(shader_name); + + if(_shader_program_map.end() == shader_program_it) + { + ARM_COMPUTE_ERROR("Shader %s not found in the GCKernelLibrary", shader_name.c_str()); + } + + // Check if the program has been built before with same build options. + const std::string program_name = shader_program_it->second; + const std::string build_options = stringify_set(build_options_set); + const std::string built_program_name = program_name + "_" + build_options; + auto built_program_it = _built_programs_map.find(built_program_name); + + GCKernel kernel; + + if(_built_programs_map.end() != built_program_it) + { + // If program has been built, retrieve to create kernel from it + kernel = built_program_it->second; + kernel.use(); + } + else + { + GCProgram program = load_program(program_name); + + std::string source_name = _shader_path + shader_program_it->second; + + // load shader + GLuint shader = program.compile_shader(build_options); + + // Build program + GLuint gles_program = program.link_program(shader); + + // Create GCKernel + kernel = GCKernel(shader_name, gles_program); + + // Add built program to internal map + _built_programs_map.emplace(built_program_name, kernel); + } + + return kernel; +} + +const std::string GCKernelLibrary::preprocess_shader(const std::string &shader_source) const +{ + enum class ParserStage + { + FIRST, + SKIP_COMMENTS = FIRST, + RESOLVE_INCLUDES, + SKIP_PREPROCESSOR_DIRECTIVES, + SEARCH_MACRO_DEFINITIONS, + EXPAND_MACRO_USES, + LAST + }; + + struct MacroDefinitionInfo + { + const std::vector param_list; + const std::string content; + }; + + // Found macro definitions so far + std::map macro_definitions; + + // Define a GLES compute shader parser function + std::function cs_parser; + cs_parser = [&](const std::string & src, ParserStage stage, int nested_level) -> std::string + { + std::string dst; + + if(stage == ParserStage::LAST || std::regex_match(src, std::regex(R"(\s*)"))) + { + return src; + } + auto next_stage = static_cast(static_cast(stage) + 1); + + std::string search_pattern; + switch(stage) + { + case ParserStage::SKIP_COMMENTS: + search_pattern = R"((/\*([^*]|\n|(\*+([^*/]|\n)))*\*+/)|(//.*))"; + break; + case ParserStage::RESOLVE_INCLUDES: + search_pattern = R"rgx((?:^|\n)[ \t]*#include "(.*)")rgx"; + break; + case ParserStage::SKIP_PREPROCESSOR_DIRECTIVES: + search_pattern = R"((^|\n)[ \t]*(#ifdef|#ifndef|#if)[^\n]+)"; + break; + case ParserStage::SEARCH_MACRO_DEFINITIONS: + search_pattern = R"((?:^|\n)[ \t]*#define[ \t]+(\w+)(?:\((\w+(?:[ \t]*,[ \t]*\w+)*)\))?(?: |\t|\\\n)*((?:(?:[^\\\n]|\\[^\n])*\\+\n)*(?:[ \t]*[^ \t\n]+)*)[ \t]*)"; + break; + case ParserStage::EXPAND_MACRO_USES: + { + if(macro_definitions.empty()) + { + // Nothing to expand + return src; + } + int i = 0; + for(auto &def : macro_definitions) + { + if(i == 0) + { + search_pattern = R"((\b)" + def.first; + } + else + { + search_pattern += R"(\b|\b)" + def.first; + } + i++; + } + search_pattern += R"(\b))"; + break; + } + default: + break; + } + + std::regex search_regex(search_pattern); + std::smatch match; + ptrdiff_t parsed_pos = 0; + if(std::regex_search(src, match, search_regex)) + { + // Pass the content before the match to the next stage + dst.append(cs_parser(src.substr(0, 
match.position()), next_stage, 0)); + parsed_pos = match.position() + match.length(); + + // Deal with the matched content + switch(stage) + { + case ParserStage::RESOLVE_INCLUDES: + { + // Replace with the included file contents + // And parse the content from the first stage + const std::string source_name = _shader_path + match.str(1); + dst.append(cs_parser(read_file(source_name, false), ParserStage::FIRST, 0)); + break; + } + case ParserStage::SEARCH_MACRO_DEFINITIONS: + { + std::regex params_regex(R"(\b\w+\b)"); + const std::string macro_param_str = match.str(2); + const std::vector macro_param_list( + std::sregex_token_iterator(macro_param_str.begin(), + macro_param_str.end(), + params_regex), + std::sregex_token_iterator()); + + const MacroDefinitionInfo info = + { + macro_param_list, + match.str(3) + }; + // Collect the macro definition data and not change the shader source + macro_definitions.insert(std::pair(match.str(1), info)); + dst.append(match.str()); + break; + } + case ParserStage::EXPAND_MACRO_USES: + { + ptrdiff_t args_str_length = 0; + std::vector args_list; + + // Walk through argument list, because the regular expression does NOT support nested parentheses + size_t cur_args_str_pos = match.position() + match.length(); + if(src[cur_args_str_pos++] == '(') + { + int nested_parentheses = 0; + ptrdiff_t cur_arg_pos = cur_args_str_pos; + ptrdiff_t cur_arg_length = 0; + + args_str_length++; + while(src[cur_args_str_pos] != ')' || nested_parentheses != 0) + { + switch(src[cur_args_str_pos++]) + { + case '(': + nested_parentheses++; + cur_arg_length++; + break; + case ',': + if(nested_parentheses == 0) + { + args_list.push_back(src.substr(cur_arg_pos, cur_arg_length)); + cur_arg_pos = cur_args_str_pos; + cur_arg_length = 0; + } + else + { + cur_arg_length++; + } + break; + case ' ': + case '\t': + if(cur_arg_length == 0) + { + cur_arg_pos++; + } + else + { + cur_arg_length++; + } + break; + case ')': + nested_parentheses--; + // no break here! 
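// Note: the ')' case above intentionally falls through to default so that a
// closing parenthesis ending a *nested* group (nested_parentheses was just
// decremented) is still counted as part of the current argument's text via
// cur_arg_length++; the outer closing parenthesis never reaches the switch
// because the while condition stops on it.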
+ default: + cur_arg_length++; + break; + } + args_str_length++; + } + if(src[cur_args_str_pos] == ')' && nested_parentheses == 0) + { + args_list.push_back(src.substr(cur_arg_pos, cur_arg_length)); + } + args_str_length++; + } + + std::string expanded_content = match.str(); + const std::vector macro_param_list = macro_definitions.at(match.str()).param_list; + + if((nested_level != 0 || !macro_param_list.empty()) && macro_param_list.size() == args_list.size()) + { + parsed_pos += args_str_length; + expanded_content = macro_definitions.at(match.str()).content; + size_t i = 0; + for(auto ¶m_name : macro_param_list) + { + std::regex params_regex(R"(\b)" + param_name + R"(\b)"); + expanded_content.assign(std::regex_replace(expanded_content, params_regex, args_list[i])); + ++i; + } + // Expand macro recursively + expanded_content = cs_parser(expanded_content, stage, nested_level + 1); + + if(nested_level == 0) + { + const std::regex token_pasting_rgx = std::regex(R"(\b##\b)"); + if(std::regex_search(expanded_content, token_pasting_rgx)) + { + // Remove token pasting operator "##" + expanded_content.assign(std::regex_replace(expanded_content, std::regex(token_pasting_rgx), "")); + // Trim trailing whitespace + expanded_content.assign(std::regex_replace(expanded_content, std::regex(R"([ \t]*\\\n)"), "\n")); + } + else + { + // Do not expand the macro if the result does not have token pasting operator "##" + expanded_content = src.substr(match.position(), match.length() + args_str_length); + } + } + } + dst.append(expanded_content); + break; + } + case ParserStage::SKIP_COMMENTS: + case ParserStage::SKIP_PREPROCESSOR_DIRECTIVES: + default: + dst.append(match.str()); + break; + } + next_stage = stage; + } + dst.append(cs_parser(src.substr(parsed_pos, src.length() - parsed_pos), next_stage, 0)); + + return dst; + }; + + return cs_parser(shader_source, ParserStage::FIRST, 0); +} + +const GCProgram &GCKernelLibrary::load_program(const std::string &program_name) const +{ + const auto program_it = _programs_map.find(program_name); + + if(program_it != _programs_map.end()) + { + return program_it->second; + } + + GCProgram program; + +#ifdef EMBEDDED_KERNELS + const auto program_source_it = _program_source_map.find(program_name); + + if(_program_source_map.end() == program_source_it) + { + ARM_COMPUTE_ERROR("Embedded program for %s does not exist.", program_name.c_str()); + } + + // TODO(APPBROWSER-298): Do not call shader preprocessor here + // We should do the preprocess at compile time + // The preprocess_shader function is used for support "#include" directive and token pasting operator "##". + // This job could be done at compile time by using a python script in order to get better performance at runtime. + // BTW: We usually defined EMBEDDED_KERNELS in release build. 
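// For example, preprocess_shader() replaces a line such as
//     #include "helpers.h"
// with the contents of <shader path>/helpers.h, and a macro whose body uses
// the token pasting operator, e.g.
//     #define LOAD4(name, offset) name##_data[offset]
//     LOAD4(src, 4)
// is expanded here to src_data[4], since the GLES shader compiler cannot be
// relied on to accept "##". (Illustrative names, not taken from this patch;
// the real macros live in the cs_shaders sources and helpers.h.)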
+ program = GCProgram(program_name, preprocess_shader(program_source_it->second)); +#else /* EMBEDDED_KERNELS */ + // Check for binary + std::string source_name = _shader_path + program_name; + if(std::ifstream(source_name).is_open()) + { + program = GCProgram(program_name, preprocess_shader(read_file(source_name, false))); + } + else + { + ARM_COMPUTE_ERROR("Shader file %s does not exist.", source_name.c_str()); + } +#endif /* EMBEDDED_KERNELS */ + + // Insert program to program map + const auto new_program = _programs_map.emplace(program_name, std::move(program)); + + return new_program.first->second; +} + +void GCKernelLibrary::setup_context() +{ + EGLBoolean res; + _display = eglGetDisplay(EGL_DEFAULT_DISPLAY); + + ARM_COMPUTE_ERROR_ON_MSG(_display == EGL_NO_DISPLAY, "Failed to get display: 0x%x.", eglGetError()); + + res = eglInitialize(_display, nullptr, nullptr); + + ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to initialize egl: 0x%x.", eglGetError()); + ARM_COMPUTE_UNUSED(res); + + const char *egl_extension_st = eglQueryString(_display, EGL_EXTENSIONS); + ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_create_context") == nullptr), "Failed to query EGL_KHR_create_context"); + ARM_COMPUTE_ERROR_ON_MSG((strstr(egl_extension_st, "EGL_KHR_surfaceless_context") == nullptr), "Failed to query EGL_KHR_surfaceless_context"); + ARM_COMPUTE_UNUSED(egl_extension_st); + + const EGLint config_attribs[] = + { + EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR, + EGL_NONE + }; + EGLConfig cfg; + EGLint count; + + res = eglChooseConfig(_display, config_attribs, &cfg, 1, &count); + + ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to choose config: 0x%x.", eglGetError()); + ARM_COMPUTE_UNUSED(res); + + res = eglBindAPI(EGL_OPENGL_ES_API); + + ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to bind api: 0x%x.", eglGetError()); + + const EGLint attribs[] = + { + EGL_CONTEXT_CLIENT_VERSION, 3, + EGL_NONE + }; + _context = eglCreateContext(_display, + cfg, + EGL_NO_CONTEXT, + attribs); + + ARM_COMPUTE_ERROR_ON_MSG(_context == EGL_NO_CONTEXT, "Failed to create context: 0x%x.", eglGetError()); + ARM_COMPUTE_UNUSED(res); + + res = eglMakeCurrent(_display, EGL_NO_SURFACE, EGL_NO_SURFACE, _context); + + ARM_COMPUTE_ERROR_ON_MSG(res == EGL_FALSE, "Failed to make current: 0x%x.", eglGetError()); + ARM_COMPUTE_UNUSED(res); +} + +void GCKernelLibrary::setup_dummy_fbo() +{ + ARM_COMPUTE_GL_CHECK(glGenFramebuffers(1, &_frame_buffer)); + ARM_COMPUTE_GL_CHECK(glBindFramebuffer(GL_FRAMEBUFFER, _frame_buffer)); + ARM_COMPUTE_GL_CHECK(glGenTextures(1, &_tex_rt)); + ARM_COMPUTE_GL_CHECK(glBindTexture(GL_TEXTURE_2D, _tex_rt)); + ARM_COMPUTE_GL_CHECK(glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, 1, 1, 0, GL_RGB, GL_UNSIGNED_BYTE, nullptr)); + ARM_COMPUTE_GL_CHECK(glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, _tex_rt, 0)); +} + +GCKernelLibrary::~GCKernelLibrary() +{ + for(auto &program : _built_programs_map) + { + static_cast(program.second).cleanup(); + } + + ARM_COMPUTE_GL_CHECK(glBindTexture(GL_TEXTURE_2D, 0)); + ARM_COMPUTE_GL_CHECK(glBindFramebuffer(GL_FRAMEBUFFER, 0)); + ARM_COMPUTE_GL_CHECK(glDeleteTextures(1, &_tex_rt)); + ARM_COMPUTE_GL_CHECK(glDeleteFramebuffers(1, &_frame_buffer)); + + if(_own_context) + { + eglDestroyContext(_display, _context); + eglTerminate(_display); + + _context = EGL_NO_CONTEXT; + _display = EGL_NO_DISPLAY; + } +} + +std::string GCKernelLibrary::stringify_set(const StringSet &s) const +{ + std::string concat_set; + + // Concatenate set + 
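// e.g. the set { "#define DATA_TYPE float", "#define PROCESS_X 4" } becomes
// "#define DATA_TYPE float\n#define PROCESS_X 4\n"; compile_shader() places
// this string directly after "#version 310 es\n", so every build option acts
// as a preprocessor line in front of the shader source (illustrative option
// names, not taken from this patch).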
for(const auto &el : s) + { + concat_set += el + "\n"; + } + + return concat_set; +} diff --git a/src/core/GLES_COMPUTE/IGCKernel.cpp b/src/core/GLES_COMPUTE/IGCKernel.cpp new file mode 100644 index 0000000000..154a2c0c66 --- /dev/null +++ b/src/core/GLES_COMPUTE/IGCKernel.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" + +#include +#include + +using namespace arm_compute; + +void arm_compute::enqueue(IGCKernel &kernel, const Window &window, const gles::NDRange &lws) +{ + ARM_COMPUTE_UNUSED(kernel); + + if(kernel.kernel().get_program() == 0) + { + return; + } + + ARM_COMPUTE_ERROR_ON((0 == (window.x().end() - window.x().start())) || (0 == (window.y().end() - window.y().start()))); + + ARM_COMPUTE_ERROR_ON_MSG((((window.x().end() - window.x().start()) % (window.x().step() * lws[0])) != 0), + "window x end =%d, start=%d, step=%d, lws x=%d", window.x().end(), window.x().start(), window.x().step(), lws[0]); + ARM_COMPUTE_ERROR_ON_MSG((((window.y().end() - window.y().start()) % (window.y().step() * lws[1])) != 0), + "window y end =%d, start=%d, step=%d, lws y=%d", window.y().end(), window.y().start(), window.y().step(), lws[1]); + ARM_COMPUTE_ERROR_ON_MSG((((window.z().end() - window.z().start()) % (window.z().step() * lws[2])) != 0), + "window z end =%d, start=%d, step=%d, lws z=%d", window.z().end(), window.z().start(), window.z().step(), lws[2]); + + ARM_COMPUTE_GL_CHECK(glDispatchCompute((window.x().end() - window.x().start()) / (window.x().step() / lws[0]), + (window.y().end() - window.y().start()) / (window.y().step() / lws[1]), + (window.z().end() - window.z().start()) / (window.z().step() / lws[2]))); +} + +IGCKernel::IGCKernel() + : _kernel() +{ +} + +GCKernel &IGCKernel::kernel() +{ + return _kernel; +} + +template +unsigned int IGCKernel::num_arguments_per_tensor() const +{ + return 2 + 2 * dimension_size; +} + +template +void IGCKernel::add_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const 
BufferParam ¶m, const Window &window) +{ + ARM_COMPUTE_ERROR_ON(tensor == nullptr); + + const ITensorInfo *info = tensor->info(); + const Strides &strides = info->strides_in_bytes(); + + // Calculate offset to the start of the window + unsigned int offset_first_element = info->offset_first_element_in_bytes(); + + for(unsigned int n = 0; n < info->num_dimensions(); ++n) + { + offset_first_element += window[n].start() * strides[n]; + } + + unsigned int idx_start = idx; + + for(unsigned int dimension = 0; dimension < dimension_size; dimension++) + { + _kernel.set_params(idx++, strides[dimension]); + _kernel.set_params(idx++, strides[dimension] * window[dimension].step()); + } + + _kernel.set_params(idx++, offset_first_element); + _kernel.set_params(idx++, param.buffer_data_type_shift); + + ARM_COMPUTE_GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, param.binding_point, tensor->gc_buffer())); + + ARM_COMPUTE_ERROR_ON_MSG(idx_start + num_arguments_per_tensor() != idx, + "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor()); + ARM_COMPUTE_UNUSED(idx_start); +} + +void IGCKernel::add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window) +{ + add_tensor_argument<1>(idx, tensor, BufferParam(binding_point, 0), window); +} + +void IGCKernel::add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam ¶m, const Window &window) +{ + add_tensor_argument<1>(idx, tensor, param, window); +} + +void IGCKernel::add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window) +{ + add_tensor_argument<2>(idx, tensor, BufferParam(binding_point, 0), window); +} + +void IGCKernel::add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam ¶m, const Window &window) +{ + add_tensor_argument<2>(idx, tensor, param, window); +} + +void IGCKernel::add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window) +{ + add_tensor_argument<3>(idx, tensor, BufferParam(binding_point, 0), window); +} + +void IGCKernel::add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const BufferParam ¶m, const Window &window) +{ + add_tensor_argument<3>(idx, tensor, param, window); +} + +unsigned int IGCKernel::num_arguments_per_1D_tensor() const +{ + return num_arguments_per_tensor<1>(); +} + +unsigned int IGCKernel::num_arguments_per_2D_tensor() const +{ + return num_arguments_per_tensor<2>(); +} + +unsigned int IGCKernel::num_arguments_per_3D_tensor() const +{ + return num_arguments_per_tensor<3>(); +} diff --git a/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp b/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp new file mode 100644 index 0000000000..5bb479ed24 --- /dev/null +++ b/src/core/GLES_COMPUTE/IGCSimple2DKernel.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h" + +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +void IGCSimple2DKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window); + + _kernel.use(); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, 1, slice); + add_2D_tensor_argument(idx, _output, 2, slice); + _kernel.update_shader_params(); + enqueue(*this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp b/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp new file mode 100644 index 0000000000..61225d8533 --- /dev/null +++ b/src/core/GLES_COMPUTE/IGCSimple3DKernel.cpp @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h" + +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +void IGCSimple3DKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_3D(); + + _kernel.use(); + + do + { + unsigned int idx = 0; + unsigned int binding = 1; // SSBO binding starts from 1. + add_3D_tensor_argument(idx, _input, binding++, slice); + add_3D_tensor_argument(idx, _output, binding++, slice); + _kernel.update_shader_params(); + enqueue(*this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp b/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp new file mode 100644 index 0000000000..459601e68b --- /dev/null +++ b/src/core/GLES_COMPUTE/IGCSimpleKernel.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +IGCSimpleKernel::IGCSimpleKernel() + : _input(nullptr), _output(nullptr) +{ +} + +void IGCSimpleKernel::configure(const IGCTensor *input, IGCTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size) +{ + _input = input; + _output = output; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size); + + IGCKernel::configure(win); +} diff --git a/src/core/GLES_COMPUTE/IGCTensor.cpp b/src/core/GLES_COMPUTE/IGCTensor.cpp new file mode 100644 index 0000000000..5576665243 --- /dev/null +++ b/src/core/GLES_COMPUTE/IGCTensor.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" + +using namespace arm_compute; + +IGCTensor::IGCTensor() + : _mapping(nullptr) +{ +} + +void IGCTensor::map(bool blocking) +{ + _mapping = do_map(blocking); +} + +void IGCTensor::unmap() +{ + do_unmap(); + _mapping = nullptr; +} + +void IGCTensor::clear() +{ + this->map(); + std::memset(static_cast(_mapping), 0, this->info()->total_size()); + this->unmap(); +} + +uint8_t *IGCTensor::buffer() const +{ + return _mapping; +} diff --git a/src/core/GLES_COMPUTE/OpenGLES.cpp b/src/core/GLES_COMPUTE/OpenGLES.cpp new file mode 100644 index 0000000000..fdfc085db2 --- /dev/null +++ b/src/core/GLES_COMPUTE/OpenGLES.cpp @@ -0,0 +1,820 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" + +#include +#include +#include + +using eglGetProcAddress_func = __eglMustCastToProperFunctionPointerType EGLAPIENTRY (*)(const char *procname); +using eglBindAPI_func = EGLBoolean EGLAPIENTRY (*)(EGLenum api); +using eglChooseConfig_func = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, const EGLint *attrib_list, EGLConfig *configs, EGLint config_size, EGLint *num_config); +using eglCreateContext_func = EGLContext EGLAPIENTRY (*)(EGLDisplay dpy, EGLConfig config, EGLContext share_context, const EGLint *attrib_list); +using eglDestroyContext_func = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, EGLContext ctx); +using eglGetDisplay_func = EGLDisplay EGLAPIENTRY (*)(EGLNativeDisplayType display_id); +using eglInitialize_func = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, EGLint *major, EGLint *minor); +using eglMakeCurrent_func = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy, EGLSurface draw, EGLSurface read, EGLContext ctx); +using eglTerminate_func = EGLBoolean EGLAPIENTRY (*)(EGLDisplay dpy); +using eglGetError_func = EGLint EGLAPIENTRY (*)(); +using eglQueryString_func = char const * EGLAPIENTRY (*)(EGLDisplay dpy, EGLint name); +using glAttachShader_func = void GL_APIENTRY (*)(GLuint program, GLuint shader); +using glCompileShader_func = void GL_APIENTRY (*)(GLuint shader); +using glCreateProgram_func = GLuint GL_APIENTRY (*)(); +using glCreateShader_func = GLuint GL_APIENTRY (*)(GLenum type); +using glDeleteProgram_func = void GL_APIENTRY (*)(GLuint program); +using glDeleteShader_func = void GL_APIENTRY (*)(GLuint shader); +using glDetachShader_func = void GL_APIENTRY (*)(GLuint program, GLuint shader); +using glGetProgramInfoLog_func = void GL_APIENTRY (*)(GLuint program, GLsizei bufsize, GLsizei *length, GLchar *infolog); +using glGetProgramiv_func = void GL_APIENTRY (*)(GLuint program, GLenum pname, GLint *params); +using glGetShaderInfoLog_func = void GL_APIENTRY (*)(GLuint shader, GLsizei bufsize, GLsizei *length, GLchar *infolog); +using glGetShaderiv_func = void GL_APIENTRY (*)(GLuint shader, GLenum pname, GLint *params); +using glLinkProgram_func = void GL_APIENTRY (*)(GLuint program); +using glShaderSource_func = void GL_APIENTRY (*)(GLuint shader, GLsizei count, const GLchar *const *string, const GLint *length); +using glUseProgram_func = void GL_APIENTRY (*)(GLuint program); +using glBindBuffer_func = void GL_APIENTRY 
(*)(GLenum target, GLuint buffer); +using glBindBufferBase_func = void GL_APIENTRY (*)(GLenum target, GLuint index, GLuint buffer); +using glBufferData_func = void GL_APIENTRY (*)(GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage); +using glDeleteBuffers_func = void GL_APIENTRY (*)(GLsizei n, const GLuint *buffers); +using glDispatchCompute_func = void GL_APIENTRY (*)(GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z); +using glFlush_func = void GL_APIENTRY (*)(); +using glGenBuffers_func = void GL_APIENTRY (*)(GLsizei n, GLuint *buffers); +using glGetProgramResourceIndex_func = GLuint GL_APIENTRY (*)(GLuint program, GLenum programInterface, const GLchar *name); +using glGetUniformLocation_func = GLint GL_APIENTRY (*)(GLuint program, const GLchar *name); +using glMapBufferRange_func = void *GL_APIENTRY (*)(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access); +using glMemoryBarrier_func = void GL_APIENTRY (*)(GLbitfield barriers); +using glUniform1ui_func = void GL_APIENTRY (*)(GLint location, GLuint v0); +using glUnmapBuffer_func = GLboolean GL_APIENTRY (*)(GLenum target); +using glGetError_func = GLenum GL_APIENTRY (*)(); +using glGetActiveUniformBlockiv_func = void GL_APIENTRY (*)(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params); +using glUniformBlockBinding_func = void GL_APIENTRY (*)(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding); +using glGetUniformBlockIndex_func = GLuint GL_APIENTRY (*)(GLuint program, const GLchar *uniformBlockName); +using glGenTextures_func = void GL_APIENTRY (*)(GLsizei n, GLuint *textures); +using glDeleteTextures_func = void GL_APIENTRY (*)(GLsizei n, const GLuint *textures); +using glBindTexture_func = void GL_APIENTRY (*)(GLenum target, GLuint texture); +using glTexImage2D_func = void GL_APIENTRY (*)(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, + const GLvoid *pixels); +using glGenFramebuffers_func = void GL_APIENTRY (*)(GLsizei n, GLuint *framebuffers); +using glDeleteFramebuffers_func = void GL_APIENTRY (*)(GLsizei n, const GLuint *framebuffers); +using glBindFramebuffer_func = void GL_APIENTRY (*)(GLenum target, GLuint framebuffer); +using glFramebufferTexture2D_func = void GL_APIENTRY (*)(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level); + +class GLESSymbols +{ +private: + void init() + { + void *egl_handle = dlopen("libEGL.so", RTLD_LAZY | RTLD_LOCAL); + void *glesv2_handle = dlopen("libGLESv2.so", RTLD_LAZY | RTLD_LOCAL); + void *glesv3_handle = dlopen("libGLESv3.so", RTLD_LAZY | RTLD_LOCAL); + if(egl_handle == nullptr) + { + std::cerr << "Can't load libEGL.so: " << dlerror() << std::endl; + } + else + { +#undef EGL_ENTRY +#define EGL_ENTRY(_api) _api = reinterpret_cast<_api##_func>(dlsym(egl_handle, #_api)); +#include "./egl_entries.in" +#undef EGL_ENTRY + + if(eglGetProcAddress != nullptr) + { +#undef EGL_ENTRY +#define EGL_ENTRY(_api) \ + if((_api) == nullptr) \ + (_api) = reinterpret_cast<_api##_func>(eglGetProcAddress(#_api)); +#include "./egl_entries.in" +#undef EGL_ENTRY + +#undef GL_ENTRY +#define GL_ENTRY(_api) _api = reinterpret_cast<_api##_func>(eglGetProcAddress(#_api)); +#include "./gl_entries.in" +#undef GL_ENTRY + } + + std::vector handles = { glesv3_handle, glesv2_handle }; + for(auto &handle : handles) + { + if(handle != nullptr) + { +#undef GL_ENTRY +#define GL_ENTRY(_api) \ + if((_api) == nullptr) \ + (_api) = 
reinterpret_cast<_api##_func>(dlsym(handle, #_api)); +#include "./gl_entries.in" +#undef GL_ENTRY + } + } + + if(glesv3_handle != nullptr) + { + dlclose(glesv3_handle); + } + if(glesv2_handle != nullptr) + { + dlclose(glesv2_handle); + } + dlclose(egl_handle); + } + } + bool _initialized = false; + +public: + static GLESSymbols &get() + { + static GLESSymbols symbols = GLESSymbols(); + if(!symbols._initialized) + { + symbols._initialized = true; + symbols.init(); + } + + return symbols; + } + +#undef EGL_ENTRY +#undef GL_ENTRY +#define EGL_ENTRY(_api) _api##_func _api = nullptr; +#define GL_ENTRY(_api) EGL_ENTRY(_api) +#include "./egl_entries.in" +#include "./gl_entries.in" +#undef EGL_ENTRY +#undef GL_ENTRY +}; + +bool arm_compute::opengles31_is_available() +{ + return GLESSymbols::get().glDispatchCompute != nullptr; +} + +__eglMustCastToProperFunctionPointerType EGLAPIENTRY eglGetProcAddress(const char *procname) +{ + auto func = GLESSymbols::get().eglGetProcAddress; + if(func != nullptr) + { + return func(procname); + } + else + { + return nullptr; + } +} + +EGLBoolean EGLAPIENTRY eglBindAPI(EGLenum api) +{ + auto func = GLESSymbols::get().eglBindAPI; + if(func != nullptr) + { + return func(api); + } + else + { + return EGL_FALSE; + } +} + +EGLBoolean EGLAPIENTRY eglChooseConfig(EGLDisplay dpy, const EGLint *attrib_list, EGLConfig *configs, EGLint config_size, EGLint *num_config) +{ + auto func = GLESSymbols::get().eglChooseConfig; + if(func != nullptr) + { + return func(dpy, attrib_list, configs, config_size, num_config); + } + else + { + return EGL_FALSE; + } +} + +EGLContext EGLAPIENTRY eglCreateContext(EGLDisplay dpy, EGLConfig config, EGLContext share_context, const EGLint *attrib_list) +{ + auto func = GLESSymbols::get().eglCreateContext; + if(func != nullptr) + { + return func(dpy, config, share_context, attrib_list); + } + else + { + return nullptr; + } +} + +EGLBoolean EGLAPIENTRY eglDestroyContext(EGLDisplay dpy, EGLContext ctx) +{ + auto func = GLESSymbols::get().eglDestroyContext; + if(func != nullptr) + { + return func(dpy, ctx); + } + else + { + return EGL_FALSE; + } +} + +EGLDisplay EGLAPIENTRY eglGetDisplay(EGLNativeDisplayType display_id) +{ + auto func = GLESSymbols::get().eglGetDisplay; + if(func != nullptr) + { + return func(display_id); + } + else + { + return nullptr; + } +} + +EGLBoolean EGLAPIENTRY eglInitialize(EGLDisplay dpy, EGLint *major, EGLint *minor) +{ + auto func = GLESSymbols::get().eglInitialize; + if(func != nullptr) + { + return func(dpy, major, minor); + } + else + { + return EGL_FALSE; + } +} + +EGLBoolean EGLAPIENTRY eglMakeCurrent(EGLDisplay dpy, EGLSurface draw, EGLSurface read, EGLContext ctx) +{ + auto func = GLESSymbols::get().eglMakeCurrent; + if(func != nullptr) + { + return func(dpy, draw, read, ctx); + } + else + { + return EGL_FALSE; + } +} + +EGLBoolean EGLAPIENTRY eglTerminate(EGLDisplay dpy) +{ + auto func = GLESSymbols::get().eglTerminate; + if(func != nullptr) + { + return func(dpy); + } + else + { + return EGL_FALSE; + } +} + +EGLint EGLAPIENTRY eglGetError() +{ + auto func = GLESSymbols::get().eglGetError; + if(func != nullptr) + { + return func(); + } + else + { + return GL_NO_ERROR; + } +} + +char const *EGLAPIENTRY eglQueryString(EGLDisplay dpy, EGLint name) +{ + auto func = GLESSymbols::get().eglQueryString; + if(func != nullptr) + { + return func(dpy, name); + } + else + { + return nullptr; + } +} + +void GL_APIENTRY glAttachShader(GLuint program, GLuint shader) +{ + auto func = GLESSymbols::get().glAttachShader; + if(func != 
nullptr) + { + return func(program, shader); + } + else + { + return; + } +} + +void GL_APIENTRY glCompileShader(GLuint shader) +{ + auto func = GLESSymbols::get().glCompileShader; + if(func != nullptr) + { + return func(shader); + } + else + { + return; + } +} + +GLuint GL_APIENTRY glCreateProgram() +{ + auto func = GLESSymbols::get().glCreateProgram; + if(func != nullptr) + { + return func(); + } + else + { + return 0; + } +} + +GLuint GL_APIENTRY glCreateShader(GLenum type) +{ + auto func = GLESSymbols::get().glCreateShader; + if(func != nullptr) + { + return func(type); + } + else + { + return 0; + } +} + +void GL_APIENTRY glDeleteProgram(GLuint program) +{ + auto func = GLESSymbols::get().glDeleteProgram; + if(func != nullptr) + { + return func(program); + } + else + { + return; + } +} + +void GL_APIENTRY glDeleteShader(GLuint shader) +{ + auto func = GLESSymbols::get().glDeleteShader; + if(func != nullptr) + { + return func(shader); + } + else + { + return; + } +} + +void GL_APIENTRY glDetachShader(GLuint program, GLuint shader) +{ + auto func = GLESSymbols::get().glDetachShader; + if(func != nullptr) + { + return func(program, shader); + } + else + { + return; + } +} + +void GL_APIENTRY glGetProgramInfoLog(GLuint program, GLsizei bufSize, GLsizei *length, GLchar *infoLog) +{ + auto func = GLESSymbols::get().glGetProgramInfoLog; + if(func != nullptr) + { + return func(program, bufSize, length, infoLog); + } + else + { + return; + } +} + +void GL_APIENTRY glGetProgramiv(GLuint program, GLenum pname, GLint *params) +{ + auto func = GLESSymbols::get().glGetProgramiv; + if(func != nullptr) + { + return func(program, pname, params); + } + else + { + return; + } +} + +void GL_APIENTRY glGetShaderInfoLog(GLuint shader, GLsizei bufSize, GLsizei *length, GLchar *infoLog) +{ + auto func = GLESSymbols::get().glGetShaderInfoLog; + if(func != nullptr) + { + return func(shader, bufSize, length, infoLog); + } + else + { + return; + } +} + +void GL_APIENTRY glGetShaderiv(GLuint shader, GLenum pname, GLint *params) +{ + auto func = GLESSymbols::get().glGetShaderiv; + if(func != nullptr) + { + return func(shader, pname, params); + } + else + { + return; + } +} + +void GL_APIENTRY glLinkProgram(GLuint program) +{ + auto func = GLESSymbols::get().glLinkProgram; + if(func != nullptr) + { + return func(program); + } + else + { + return; + } +} + +void GL_APIENTRY glShaderSource(GLuint shader, GLsizei count, const GLchar *const *string, const GLint *length) +{ + auto func = GLESSymbols::get().glShaderSource; + if(func != nullptr) + { + return func(shader, count, string, length); + } + else + { + return; + } +} + +void GL_APIENTRY glUseProgram(GLuint program) +{ + auto func = GLESSymbols::get().glUseProgram; + if(func != nullptr) + { + return func(program); + } + else + { + return; + } +} + +void GL_APIENTRY glBindBuffer(GLenum target, GLuint buffer) +{ + auto func = GLESSymbols::get().glBindBuffer; + if(func != nullptr) + { + return func(target, buffer); + } + else + { + return; + } +} + +void GL_APIENTRY glBindBufferBase(GLenum target, GLuint index, GLuint buffer) +{ + auto func = GLESSymbols::get().glBindBufferBase; + if(func != nullptr) + { + return func(target, index, buffer); + } + else + { + return; + } +} + +void GL_APIENTRY glBufferData(GLenum target, GLsizeiptr size, const GLvoid *data, GLenum usage) +{ + auto func = GLESSymbols::get().glBufferData; + if(func != nullptr) + { + return func(target, size, data, usage); + } + else + { + return; + } +} + +void GL_APIENTRY glDeleteBuffers(GLsizei n, 
const GLuint *buffers) +{ + auto func = GLESSymbols::get().glDeleteBuffers; + if(func != nullptr) + { + return func(n, buffers); + } + else + { + return; + } +} + +void GL_APIENTRY glDispatchCompute(GLuint num_groups_x, GLuint num_groups_y, GLuint num_groups_z) +{ + auto func = GLESSymbols::get().glDispatchCompute; + if(func != nullptr) + { + return func(num_groups_x, num_groups_y, num_groups_z); + } + else + { + return; + } +} + +void GL_APIENTRY glFlush(void) +{ + auto func = GLESSymbols::get().glFlush; + if(func != nullptr) + { + return func(); + } + else + { + return; + } +} + +void GL_APIENTRY glGenBuffers(GLsizei n, GLuint *buffers) +{ + auto func = GLESSymbols::get().glGenBuffers; + if(func != nullptr) + { + return func(n, buffers); + } + else + { + return; + } +} + +GLuint GL_APIENTRY glGetProgramResourceIndex(GLuint program, GLenum programInterface, const GLchar *name) +{ + auto func = GLESSymbols::get().glGetProgramResourceIndex; + if(func != nullptr) + { + return func(program, programInterface, name); + } + else + { + return GL_INVALID_INDEX; + } +} + +GLint GL_APIENTRY glGetUniformLocation(GLuint program, const GLchar *name) +{ + auto func = GLESSymbols::get().glGetUniformLocation; + if(func != nullptr) + { + return func(program, name); + } + else + { + return -1; + } +} + +void *GL_APIENTRY glMapBufferRange(GLenum target, GLintptr offset, GLsizeiptr length, GLbitfield access) +{ + auto func = GLESSymbols::get().glMapBufferRange; + if(func != nullptr) + { + return func(target, offset, length, access); + } + else + { + return nullptr; + } +} + +void GL_APIENTRY glMemoryBarrier(GLbitfield barriers) +{ + auto func = GLESSymbols::get().glMemoryBarrier; + if(func != nullptr) + { + return func(barriers); + } + else + { + return; + } +} + +void GL_APIENTRY glUniform1ui(GLint location, GLuint v0) +{ + auto func = GLESSymbols::get().glUniform1ui; + if(func != nullptr) + { + return func(location, v0); + } + else + { + return; + } +} + +GLboolean GL_APIENTRY glUnmapBuffer(GLenum target) +{ + auto func = GLESSymbols::get().glUnmapBuffer; + if(func != nullptr) + { + return func(target); + } + else + { + return GL_FALSE; + } +} + +GLenum GL_APIENTRY glGetError(void) +{ + auto func = GLESSymbols::get().glGetError; + if(func != nullptr) + { + return func(); + } + else + { + return GL_NO_ERROR; + } +} + +void GL_APIENTRY glGetActiveUniformBlockiv(GLuint program, GLuint uniformBlockIndex, GLenum pname, GLint *params) +{ + auto func = GLESSymbols::get().glGetActiveUniformBlockiv; + if(func != nullptr) + { + return func(program, uniformBlockIndex, pname, params); + } + else + { + return; + } +} + +void GL_APIENTRY glUniformBlockBinding(GLuint program, GLuint uniformBlockIndex, GLuint uniformBlockBinding) +{ + auto func = GLESSymbols::get().glUniformBlockBinding; + if(func != nullptr) + { + return func(program, uniformBlockIndex, uniformBlockBinding); + } + else + { + return; + } +} + +GLuint GL_APIENTRY glGetUniformBlockIndex(GLuint program, const GLchar *uniformBlockName) +{ + auto func = GLESSymbols::get().glGetUniformBlockIndex; + if(func != nullptr) + { + return func(program, uniformBlockName); + } + else + { + return GL_INVALID_INDEX; + } +} + +void GL_APIENTRY glGenTextures(GLsizei n, GLuint *textures) +{ + auto func = GLESSymbols::get().glGenTextures; + if(func != nullptr) + { + return func(n, textures); + } + else + { + return; + } +} + +void GL_APIENTRY glDeleteTextures(GLsizei n, const GLuint *textures) +{ + auto func = GLESSymbols::get().glDeleteTextures; + if(func != nullptr) + { + 
return func(n, textures); + } + else + { + return; + } +} + +void GL_APIENTRY glBindTexture(GLenum target, GLuint texture) +{ + auto func = GLESSymbols::get().glBindTexture; + if(func != nullptr) + { + return func(target, texture); + } + else + { + return; + } +} + +void GL_APIENTRY glTexImage2D(GLenum target, GLint level, GLint internalformat, GLsizei width, GLsizei height, GLint border, GLenum format, GLenum type, const GLvoid *pixels) +{ + auto func = GLESSymbols::get().glTexImage2D; + if(func != nullptr) + { + return func(target, level, internalformat, width, height, border, format, type, pixels); + } + else + { + return; + } +} + +void GL_APIENTRY glGenFramebuffers(GLsizei n, GLuint *framebuffers) +{ + auto func = GLESSymbols::get().glGenFramebuffers; + if(func != nullptr) + { + return func(n, framebuffers); + } + else + { + return; + } +} + +void GL_APIENTRY glDeleteFramebuffers(GLsizei n, const GLuint *framebuffers) +{ + auto func = GLESSymbols::get().glDeleteFramebuffers; + if(func != nullptr) + { + return func(n, framebuffers); + } + else + { + return; + } +} + +void GL_APIENTRY glBindFramebuffer(GLenum target, GLuint framebuffer) +{ + auto func = GLESSymbols::get().glBindFramebuffer; + if(func != nullptr) + { + return func(target, framebuffer); + } + else + { + return; + } +} + +void GL_APIENTRY glFramebufferTexture2D(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level) +{ + auto func = GLESSymbols::get().glFramebufferTexture2D; + if(func != nullptr) + { + return func(target, attachment, textarget, texture, level); + } + else + { + return; + } +} diff --git a/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs b/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs new file mode 100644 index 0000000000..f6113e13eb --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/absdiff.cs @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; +#include "helpers.h" + +layout(std140) uniform shader_params +{ + IMAGE_PARAM_DECLARATION(src1); + IMAGE_PARAM_DECLARATION(src2); + IMAGE_PARAM_DECLARATION(dst); +}; + +BUFFER_DECLARATION(src1, 1, uint, readonly); +BUFFER_DECLARATION(src2, 2, uint, readonly); +BUFFER_DECLARATION(dst, 3, uint, writeonly); + +/** Calculate the absolute difference of two input images. + * + * @param[in] src1_ptr Pointer to the first source image. 
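// A minimal host-side sketch of how the lazily resolved wrappers above might be used: every
// call goes through GLESSymbols' dlopen/dlsym lookups, so on a platform without libEGL.so or
// libGLESv3.so the calls degrade to EGL_FALSE/nullptr instead of failing to link. The attribute
// lists and the one-shot dispatch are illustrative assumptions, not part of this patch (the
// library's real context handling lives in GCKernelLibrary).

#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
#include <cstdio>

int main()
{
    if(!arm_compute::opengles31_is_available())
    {
        std::printf("GLES 3.1 compute not available; use another backend\n");
        return 0;
    }
    EGLDisplay dpy = eglGetDisplay(EGL_DEFAULT_DISPLAY);
    eglInitialize(dpy, nullptr, nullptr);
    eglBindAPI(EGL_OPENGL_ES_API);
    const EGLint cfg_attrs[] = { EGL_RENDERABLE_TYPE, EGL_OPENGL_ES3_BIT_KHR, EGL_NONE };
    EGLConfig    cfg         = nullptr;
    EGLint       n_cfg       = 0;
    eglChooseConfig(dpy, cfg_attrs, &cfg, 1, &n_cfg);
    const EGLint ctx_attrs[] = { EGL_CONTEXT_CLIENT_VERSION, 3, EGL_NONE };
    EGLContext   ctx         = eglCreateContext(dpy, cfg, EGL_NO_CONTEXT, ctx_attrs);
    eglMakeCurrent(dpy, EGL_NO_SURFACE, EGL_NO_SURFACE, ctx);
    // ... compile a compute shader, bind SSBOs, then dispatch:
    glDispatchCompute(1u, 1u, 1u);
    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
    eglMakeCurrent(dpy, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT);
    eglDestroyContext(dpy, ctx);
    eglTerminate(dpy);
    return 0;
}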
Supported data types: U8 + * @param[in] src1_stride_x Stride of the first source image in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the first source image in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the first source image + * @param[in] src2_ptr Pointer to the second source image. Supported data types: Same as @p in1_ptr + * @param[in] src2_stride_x Stride of the second source image in X dimension (in bytes) + * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src2_stride_y Stride of the second source image in Y dimension (in bytes) + * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the second source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: Same as @p in1_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + Image src1 = CONVERT_TO_IMAGE_STRUCT(src1); + Image src2 = CONVERT_TO_IMAGE_STRUCT(src2); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + uvec4 tmp1 = UNPACK(LOAD4(src1, CURRENT_OFFSET(src1)), uint, uvec4); + uvec4 tmp2 = UNPACK(LOAD4(src2, CURRENT_OFFSET(src2)), uint, uvec4); + uvec4 diff = uvec4(abs(ivec4(tmp1 - tmp2))); + + STORE4(dst, CURRENT_OFFSET(dst), PACK(diff, uvec4, uint)); +} diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs new file mode 100644 index 0000000000..fc9da114f7 --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
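// A scalar reading of one absdiff work item above, under the assumption that LOAD4/UNPACK
// expose four U8 pixels packed into a single 32-bit word. This is a reference sketch for the
// arithmetic only; the helper and function names are not part of the library.

#include <cstdint>
#include <cstdlib>

uint32_t absdiff_u8x4_ref(uint32_t a, uint32_t b)
{
    uint32_t out = 0;
    for(int i = 0; i < 4; ++i)
    {
        const int32_t pa = static_cast<int32_t>((a >> (8 * i)) & 0xffu);
        const int32_t pb = static_cast<int32_t>((b >> (8 * i)) & 0xffu);
        out |= static_cast<uint32_t>(std::abs(pa - pb)) << (8 * i);
    }
    return out;
}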
+ */ +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; + +#include "helpers.h" + +#ifdef DATA_TYPE_FP32 +precision highp float; +#elif defined(DATA_TYPE_FP16) +#if defined(LOGISTIC) || defined(TANH) || defined(SRELU) || defined(SQRT) +precision highp float; +#else /*LOGISTIC_TANH_SRELU_SQRT*/ +precision mediump float; +#endif /*LOGISTIC_TANH_SRELU_SQRT*/ +#endif /*DATA_TYPE_FP32*/ + +#define ABS_OP(a) abs((a)) +#define ADD_OP(a, b) ((a) + (b)) +#define SUB_OP(a, b) ((a) - (b)) +#define MUL_OP(a, b) ((a) * (b)) +#define MLA_OP(a, b, c) ((b) * (c) + (a)) +#define DIV_OP(a, b) ((a) / (b)) +#define EXP_OP(a) exp((a)) +#define LOG_OP(a) log((a)) +#define SQRT_OP(a) sqrt((a)) +#define CONST_ONE (1.f) + +// Logistic Activation +float logistic_op(float x) +{ + return DIV_OP(CONST_ONE, ADD_OP(CONST_ONE, EXP_OP(-x))); +} +// Hyperbolic Tangent Activation +float tanh_op(float x) +{ + float tmp = float(B_VAL) * x; + if(tmp > 10.f) + { + return MUL_OP(float(A_VAL), 1.f); + } + else if(tmp < -10.f) + { + return MUL_OP(float(A_VAL), -1.f); + } + else + { + return MUL_OP(float(A_VAL), tanh(tmp + 0.000001f)); + } +} +// RELU Tangent Activation +float relu_op(float x) +{ + return max(0.f, x); +} +// Bounded RELU Activation +float brelu_op(float x) +{ + return min(float(A_VAL), max(float(0.0), x)); +} +// Lower Upper Bounded RELU Activation +float lu_brelu_op(float x) +{ + return min(max(x, float(B_VAL)), float(A_VAL)); +} +// Leaky RELU Activation +float lrelu_op(float x) +{ + return (x > float(0.0)) ? x : MUL_OP(float(A_VAL), x); +} +// Soft RELU Activation +float srelu_op(float x) +{ + return LOG_OP(ADD_OP(CONST_ONE, EXP_OP(x))); +} +// Absolute Activation +float abs_op(float x) +{ + return ABS_OP(x); +} +// Square Activation +float square_op(float x) +{ + return MUL_OP(x, x); +} +// Square-root Activation +float sqrt_op(float x) +{ + return SQRT_OP(x); +} +// Linear Activation +float linear_op(float x) +{ + return MLA_OP(float(B_VAL), float(A_VAL), x); +} + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(dst); +}; + +#ifdef DATA_TYPE_FP32 +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, writeonly); + +/** This performs an activation function floating point inputs. + * + * @note Activation function should be given as a preprocessor argument using "#define act_name". e.g. "#define TANH" + * @note A, B variables required by some activation functions are set using A_VAL= and B_VAL= respectively. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
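// The activation helpers above are selected with a preprocessor define and parametrised by
// A_VAL/B_VAL. A scalar C++ reference of the same math; act, a and b become ordinary runtime
// parameters here purely for illustration, this is not the shader itself.

#include <algorithm>
#include <cmath>
#include <string>

float activation_ref(float x, float a, float b, const std::string &act)
{
    if(act == "LOGISTIC") return 1.f / (1.f + std::exp(-x));
    if(act == "TANH")     return a * std::tanh(b * x); // the shader clamps |B_VAL * x| > 10 to +/-A_VAL
    if(act == "RELU")     return std::max(0.f, x);
    if(act == "BRELU")    return std::min(a, std::max(0.f, x));
    if(act == "LU_BRELU") return std::min(std::max(x, b), a);
    if(act == "LRELU")    return x > 0.f ? x : a * x;
    if(act == "SRELU")    return std::log(1.f + std::exp(x));
    if(act == "ABS")      return std::fabs(x);
    if(act == "SQUARE")   return x * x;
    if(act == "SQRT")     return std::sqrt(x);
    return a * x + b;     // LINEAR
}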
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + float data = src_ptr[src.current_offset]; + float data_out = 0.f; + // Perform activation + +#ifdef LOGISTIC + data_out = logistic_op(data); +#elif defined(TANH) /*LOGISTIC*/ + data_out = tanh_op(data); +#elif defined(RELU) /*RELU*/ + data_out = relu_op(data); +#elif defined(BRELU) /*BRELU*/ + data_out = brelu_op(data); +#elif defined(LU_BRELU) /*LU_BRELU*/ + data_out = lu_brelu_op(data); +#elif defined(LRELU) /*LRELU*/ + data_out = lrelu_op(data); +#elif defined(SRELU) /*SRELU*/ + data_out = srelu_op(data); +#elif defined(ABS) /*ABS*/ + data_out = abs_op(data); +#elif defined(SQUARE) /*SQUARE*/ + data_out = square_op(data); +#elif defined(SQRT) /*SQRT*/ + data_out = sqrt_op(data); +#elif defined(LINEAR) /*LINEAR*/ + data_out = linear_op(data); +#else /*LOGISTIC*/ +#error Activation function not provided +#endif /*LOGISTIC*/ + + dst_ptr[dst.current_offset] = data_out; +} + +#elif defined(DATA_TYPE_FP16) +BUFFER_DECLARATION(src, 1, uint, readonly); +BUFFER_DECLARATION(dst, 2, uint, writeonly); + +/** This performs an activation function on floating point inputs. + * + * @note Activation function should be given as a preprocessor argument using "#define act_name". e.g. "#define TANH" + * @note A, B variables required by some activation functions are set using A_VAL= and B_VAL= respectively. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
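// In the FP16 variant below, each 32-bit uint of the SSBO carries two half-precision values,
// which is why byte offsets are shifted right by two to index uint words and why values go
// through packHalf2x16/unpackHalf2x16. Equivalent host-side packing of the two 16-bit bit
// patterns, as a sketch (helper names are not part of the library):

#include <cstdint>

inline uint32_t pack_halves(uint16_t lo, uint16_t hi) // lo -> .x (low bits), hi -> .y (high bits)
{
    return static_cast<uint32_t>(lo) | (static_cast<uint32_t>(hi) << 16);
}
inline uint16_t unpack_low(uint32_t word)  { return static_cast<uint16_t>(word & 0xffffu); }
inline uint16_t unpack_high(uint32_t word) { return static_cast<uint16_t>(word >> 16); }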
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y ride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + + uint data = src_ptr[src.current_offset >> 2]; + // Perform activation + float a = unpackHalf2x16(data).x; + float b = unpackHalf2x16(data).y; + vec2 data_out; +#ifdef LOGISTIC /*LOGISTIC*/ + data_out.x = logistic_op(a); + data_out.y = logistic_op(b); +#elif defined(TANH) /*TANH*/ + data_out.x = tanh_op(a); + data_out.y = tanh_op(b); +#elif defined(RELU) /*RELU*/ + data_out.x = relu_op(a); + data_out.y = relu_op(b); +#elif defined(BRELU) /*BRELU*/ + data_out.x = brelu_op(a); + data_out.y = brelu_op(b); +#elif defined(LU_BRELU) /*LU_BRELU*/ + data_out.x = lu_brelu_op(a); + data_out.y = lu_brelu_op(b); +#elif defined(LRELU) /*LRELU*/ + data_out.x = lrelu_op(a); + data_out.y = lrelu_op(b); +#elif defined(SRELU) /*SRELU*/ + data_out.x = srelu_op(a); + data_out.y = srelu_op(b); +#elif defined(ABS) /*ABS*/ + data_out.x = abs_op(a); + data_out.y = abs_op(b); +#elif defined(SQUARE) /*SQUARE*/ + data_out.x = square_op(a); + data_out.y = square_op(b); +#elif defined(SQRT) /*SQRT*/ + data_out.x = sqrt_op(a); + data_out.y = sqrt_op(b); +#elif defined(LINEAR) /*LINEAR*/ + data_out.x = linear_op(a); + data_out.y = linear_op(b); +#else /*LOGISTIC*/ +#error Activation function not provided +#endif /*LOGISTIC*/ + + dst_ptr[dst.current_offset >> 2] = packHalf2x16(data_out); +} +#endif /*DATA_TYPE_FP32*/ diff --git a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs new file mode 100644 index 0000000000..54880926cc --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; + +#include "helpers.h" + +#ifdef DATA_TYPE_FP32 +precision highp float; +#elif defined(DATA_TYPE_FP16) +precision mediump float; +#endif /*DATA_TYPE_FP32*/ + +#define ADD_OP(a, b) ((a) + (b)) +#define SUB_OP(a, b) ((a) - (b)) +#define MUL_OP(a, b) ((a) * (b)) +#define INVSQRT_OP(a) inversesqrt((a)) +#define SQCVT_SAT(a) (a) + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(dst); + VECTOR_PARAM_DECLARATION(mean); + VECTOR_PARAM_DECLARATION(var); + VECTOR_PARAM_DECLARATION(beta); + VECTOR_PARAM_DECLARATION(gamma); +}; + +#ifdef DATA_TYPE_FP32 +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, writeonly); +BUFFER_DECLARATION(mean, 3, float, readonly); +BUFFER_DECLARATION(var, 4, float, readonly); +BUFFER_DECLARATION(beta, 5, float, readonly); +BUFFER_DECLARATION(gamma, 6, float, readonly); + +/** Apply batch normalization. + * + * @note Epsilon parameter in the batch normalization equation should be given as a preprocessor argument using "#define EPSILON". e.g. "#define EPSILON 0.1" + * + * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] var_ptr Pointer to the var tensor. 
Supported data types: same as @p src_ptr + * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) + * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor + * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p src_ptr + * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes) + * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor + * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p src_ptr + * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes) + * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor + */ +void main(void) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); + Vector var = CONVERT_TO_VECTOR_STRUCT(var); + Vector beta = CONVERT_TO_VECTOR_STRUCT(beta); + Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma); + + float input_value = 0.f; + float denominator = 0.f; + float numerator = 0.f; + float x_bar = 0.f; + float gamma_param = 0.f; + float beta_param = 0.f; + + uint current_slice = gl_GlobalInvocationID.z; + + input_value = src_ptr[src.current_offset]; + denominator = var_ptr[var.current_offset + (current_slice * var.stride_x) >> 2]; + denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON)))); + + // Calculate x bar and store results + numerator = mean_ptr[mean.current_offset + (current_slice * mean.stride_x) >> 2]; + numerator = SUB_OP(input_value, numerator); + x_bar = MUL_OP(numerator, denominator); + + gamma_param = gamma_ptr[gamma.current_offset + (current_slice * beta.stride_x) >> 2]; + beta_param = beta_ptr[beta.current_offset + (current_slice * beta.stride_x) >> 2]; + + dst_ptr[dst.current_offset] = ADD_OP(MUL_OP(gamma_param, x_bar), beta_param); +} + +#elif defined(DATA_TYPE_FP16) +BUFFER_DECLARATION(src, 1, uint, ); +BUFFER_DECLARATION(dst, 2, uint, writeonly); +BUFFER_DECLARATION(mean, 3, uint, ); +BUFFER_DECLARATION(var, 4, uint, ); +BUFFER_DECLARATION(beta, 5, uint, ); +BUFFER_DECLARATION(gamma, 6, uint, ); + +/** Apply batch normalization. + * + * @note Epsilon parameter in the batch normalization equation should be given as a preprocessor argument using "#define EPSILON". e.g. "#define EPSILON 0.1" + * + * @param[in] src_ptr Pointer to the first source tensor. 
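// Scalar reference for the per-channel normalization performed above (epsilon corresponds to
// the ESPILON compile-time define passed by the kernel). A sketch of the arithmetic only, one
// input value and one channel at a time:

#include <cmath>

float batch_norm_ref(float x, float mean, float var, float beta, float gamma, float epsilon)
{
    const float inv_std = 1.f / std::sqrt(var + epsilon); // INVSQRT_OP(ADD_OP(var, epsilon))
    const float x_bar   = (x - mean) * inv_std;
    return gamma * x_bar + beta;
}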
Supported data types: F16 + * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p src_ptr + * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) + * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor + * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p src_ptr + * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes) + * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor + * @param[in] gamma_ptr Pointer to the gamma source tensor. 
Supported data types: same as @p src_ptr + * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes) + * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor + */ +void main(void) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + Vector mean = CONVERT_TO_VECTOR_STRUCT_FP16(mean); + Vector var = CONVERT_TO_VECTOR_STRUCT_FP16(var); + Vector beta = CONVERT_TO_VECTOR_STRUCT_FP16(beta); + Vector gamma = CONVERT_TO_VECTOR_STRUCT_FP16(gamma); + + vec2 input_value; + float denominator; + float numerator; + vec2 x_bar; + float gamma_param; + float beta_param; + + uint current_slice = gl_GlobalInvocationID.z; + if((current_slice % uint(2)) == uint(0)) + { + input_value = unpackHalf2x16(src_ptr[src.current_offset >> 2]); + denominator = unpackHalf2x16(var_ptr[(var.current_offset + current_slice * var.stride_x) >> 2]).x; + denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON)))); + + //Calculate x bar and store results + numerator = unpackHalf2x16(mean_ptr[(mean.current_offset + current_slice * mean.stride_x) >> 2]).x; + x_bar = MUL_OP(SUB_OP(input_value, numerator), denominator); + + gamma_param = unpackHalf2x16(gamma_ptr[(gamma.current_offset + current_slice * beta.stride_x) >> 2]).x; + beta_param = unpackHalf2x16(beta_ptr[(beta.current_offset + current_slice * beta.stride_x) >> 2]).x; + + dst_ptr[dst.current_offset >> 2] = packHalf2x16(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param)); + } + else + { + input_value = unpackHalf2x16(src_ptr[src.current_offset >> 2]); + denominator = unpackHalf2x16(var_ptr[(var.current_offset + current_slice * var.stride_x) >> 2]).y; + denominator = INVSQRT_OP(ADD_OP(denominator, SQCVT_SAT(float(ESPILON)))); + + //Calculate x bar and store results + numerator = unpackHalf2x16(mean_ptr[(mean.current_offset + current_slice * mean.stride_x) >> 2]).y; + x_bar = MUL_OP(SUB_OP(input_value, numerator), denominator); + + gamma_param = unpackHalf2x16(gamma_ptr[(gamma.current_offset + current_slice * beta.stride_x) >> 2]).y; + beta_param = unpackHalf2x16(beta_ptr[(beta.current_offset + current_slice * beta.stride_x) >> 2]).y; + + dst_ptr[dst.current_offset >> 2] = packHalf2x16(ADD_OP(MUL_OP(gamma_param, x_bar), beta_param)); + } +} +#endif /*DATA_TYPE_FP32*/ diff --git a/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs new file mode 100644 index 0000000000..65000f2de2 --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/concatenate.cs @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; +#include "helpers.h" + +#ifdef DATA_TYPE_FP32 +precision highp float; + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(dst); +}; + +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, writeonly); + +/** This kernel concatenates the input tensor into the output tensor along the third dimension + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +void main(void) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + dst_ptr[dst.current_offset + uint(OFFSETS_Z >> 2)] = src_ptr[tensor3D_offset(src, -OFFSETS_X, -OFFSETS_Y, 0)]; +} + +#elif defined(DATA_TYPE_FP16) +precision mediump float; + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(dst); +}; + +BUFFER_DECLARATION(src, 1, uvec2, readonly); +BUFFER_DECLARATION(dst, 2, uvec2, writeonly); + +/** This kernel concatenates the input tensor into the output tensor along the third dimension + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +void main(void) +{ + Tensor3D src = GC_CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst); + + uvec2 packed_s; + GC_LOAD1_3D_OFFSET(packed_s, src, -OFFSETS_X, -OFFSETS_Y, 0); + dst_ptr[(dst.current_offset + uint(OFFSETS_Z)) >> 3] = packed_s; +} +#endif /*DATA_TYPE_FP32*/ \ No newline at end of file diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs new file mode 100644 index 0000000000..1a0c9f1d30 --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
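// The depth-concatenation kernels above copy each element into the output shifted by the
// compile-time OFFSETS_X/Y/Z of the sub-tensor. A sketch of that destination indexing in
// element (not byte) terms; the function name and layout assumptions are illustrative only:

#include <cstddef>

inline std::size_t concat_dst_index(std::size_t x, std::size_t y, std::size_t z,
                                    std::size_t off_x, std::size_t off_y, std::size_t off_z,
                                    std::size_t dst_w, std::size_t dst_h)
{
    return (x + off_x) + (y + off_y) * dst_w + (z + off_z) * dst_w * dst_h;
}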
+ */ + +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; +#include "helpers.h" + +#ifdef DATA_TYPE_FP16 +BUFFER_DECLARATION(src, 1, uint, readonly); +BUFFER_DECLARATION(dst, 2, uint, restrict); +#else // DATA_TYPE_FP16 +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, restrict); +#endif // DATA_TYPE_FP16 + +layout(std140) uniform shader_params +{ +#ifdef IM2COL_GENERIC + TENSOR3D_PARAM_DECLARATION(src); + IMAGE_PARAM_DECLARATION(dst); + uint filter_depth; + uint src_stride_w; + uint dst_stride_w; +#endif // IM2COL_GENERIC + +#ifdef IM2COL_REDUCED + TENSOR3D_PARAM_DECLARATION(src); + VECTOR_PARAM_DECLARATION(dst); + uint width; + uint height; +#endif // IM2COL_REDUCED + +#ifdef COL2IM + IMAGE_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(dst); + uint width; +#endif // COL2IM +}; + +#ifdef DATA_TYPE_FP16 + +precision mediump float; + +#ifdef IM2COL_REDUCED +/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" + * @note In case biases will be added in late stage, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] width The width of the input tensor + * @param[in] height The height of the input tensor + */ +void main(void) +{ + uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); + uvec3 size = uvec3(gl_WorkGroupSize.xyz); + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); + Tensor3D src_nostep = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(src); + Vector dst = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(dst); + uint image_size = width * height; + uint element_count = src_step_x / src_stride_x; + uint tmp_out_offset = dst.current_offset + ((pos.x * element_count + pos.y * width + pos.z * image_size) * dst.stride_x); + uint width_fp16 = ((width + uint(1)) >> uint(1)); + uint tmp; + + // odd width + if(width % uint(2) != uint(0)) + { + // even row + if((pos.y + pos.z * height) % uint(2) == uint(0)) + { + LOAD1(tmp, src, src.current_offset >> uint(2)); + STORE1(dst, tmp_out_offset >> uint(2), tmp); + } + else + { + // special op + uint tmpleft = uint(0); + uint tmpright = uint(0); + LOAD1(tmpright, src, src.current_offset >> uint(2)); // right half + if(pos.x == uint(0)) + { + LOAD1(tmpleft, src, tensor3D_offset_fp16(src_nostep, int(width), int(pos.y) - 1, int(pos.z)) >> uint(2)); // left half + tmpright = (tmpleft & uint(0xffff)) + (tmpright << uint(16)); + } + else + { + LOAD1(tmpleft, src, tensor3D_offset_fp16(src_nostep, (int(pos.x) - 1) * int(element_count), int(pos.y), int(pos.z)) >> uint(2)); // left half + tmpright = ((tmpleft >> uint(16)) + (tmpright << uint(16))); + } + STORE1(dst, tmp_out_offset >> uint(2), tmpright); + } + } + else + { + LOAD1(tmp, src, src.current_offset >> uint(2)); + STORE1(dst, tmp_out_offset >> uint(2), tmp); + } + +#ifdef HAS_BIAS + // If it is the last thread in the 3 dimensional workgroup + if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1)) + { + tmp_out_offset += dst.stride_x; + + // FIXME: need odd/even detection for tmp_out_offset? + mediump vec2 bias_vec = vec2(1.0f, 1.0f); + uint bias_u = packHalf2x16(bias_vec); + STORE1(dst, tmp_out_offset >> uint(2), bias_u); + } +#endif // HAS_BIAS +} +#endif // IM2COL_REDUCED + +#elif defined(DATA_TYPE_FP32) + +#ifdef IM2COL_GENERIC +/** This kernel performs a reshaping of the input tensor to a tensor used to perform convolution using GEMM. + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" + * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] filter_depth The depth of the used filter + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). + */ +void main(void) +{ + uint xc = gl_GlobalInvocationID.x; // x coordinate in the convolved tensor + uint yc = gl_GlobalInvocationID.y; // y coordinate in the convolved tensor + uint ch = gl_GlobalInvocationID.z % filter_depth; // input feature map + uint batch = gl_GlobalInvocationID.z / filter_depth; // the batch + + // Calculate input indices + uint xi = xc * uint(STRIDE_X) - uint(PAD_X); + uint yi = yc * uint(STRIDE_Y) - uint(PAD_Y); + uint input_offset = (src_offset_first_element_in_bytes + (ch * src_stride_z) + (batch * src_stride_w)) >> uint(2); + + // Calculate output indices + uint xo = ch * uint(KERNEL_WIDTH) * uint(KERNEL_HEIGHT); + uint yo = xc + yc * uint(CONVOLVED_WIDTH); // Index of the convolution + uint output_offset = (dst_offset_first_element_in_bytes + (yo * dst_stride_y) + (batch * dst_stride_w) + xo) >> uint(2); + + // Linearize convolution elements + for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y) + { + for(uint x = xi, x_e = xi + uint(KERNEL_WIDTH); x < x_e; ++x) + { +#if PAD_X == 0 && PAD_Y == 0 + output_offset = input_offset + ((x * src_stride_x + y * src_stride_y) >> uint(2)); + STORE4(dst, output_offset, LOAD4(src, input_offset)); +#else // PAD_X == 0 && PAD_Y == 0 + if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT) + { + STORE4(dst, output_offset, 0.0f); + } + else + { + output_offset = input_offset + ((x * src_stride_x + y * src_stride_y) >> uint(2)); + STORE4(dst, output_offset, LOAD4(src, input_offset)); + } +#endif // PAD_X == 0 && PAD_Y == 0 + } + } + +#ifdef HAS_BIAS + if(ch == (uint(KERNEL_DEPTH) - 1)) + { + STORE4(dst, output_offset, 1.0f); + } +#endif // HAS_BIAS +} +#endif // IM2COL_GENERIC + +#ifdef IM2COL_REDUCED +/** This kernel reshapes the tensor's low three dimensions to single row for GEMM operation + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" + * @note In case biases will be added in late stage, "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. 
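// CPU reference sketch of the generic im2col transform implemented above: each output row
// gathers one KERNEL_WIDTH x KERNEL_HEIGHT patch per input channel so the convolution can be
// computed as a GEMM. The HAS_BIAS column of ones is omitted and a [channel][y][x] layout with
// zero-filled padding is assumed; names and signature are illustrative, not library API.

#include <cstddef>
#include <vector>

std::vector<float> im2col_ref(const std::vector<float> &src,
                              int src_w, int src_h, int channels,
                              int kernel_w, int kernel_h,
                              int stride_x, int stride_y, int pad_x, int pad_y,
                              int conv_w, int conv_h)
{
    std::vector<float> dst(static_cast<std::size_t>(conv_w) * conv_h * channels * kernel_w * kernel_h, 0.f);
    std::size_t out = 0;
    for(int yc = 0; yc < conv_h; ++yc)             // one output row per convolution position
    {
        for(int xc = 0; xc < conv_w; ++xc)
        {
            for(int ch = 0; ch < channels; ++ch)   // columns: channel-major patch elements
            {
                for(int ky = 0; ky < kernel_h; ++ky)
                {
                    for(int kx = 0; kx < kernel_w; ++kx)
                    {
                        const int x = xc * stride_x - pad_x + kx;
                        const int y = yc * stride_y - pad_y + ky;
                        const bool inside = (x >= 0 && x < src_w && y >= 0 && y < src_h);
                        dst[out++] = inside ? src[(static_cast<std::size_t>(ch) * src_h + y) * src_w + x] : 0.f;
                    }
                }
            }
        }
    }
    return dst;
}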
+ * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] width The width of the input tensor + * @param[in] height The height of the input tensor + */ +void main(void) +{ + uvec3 pos = uvec3(gl_GlobalInvocationID.xyz); + uvec3 size = uvec3(gl_WorkGroupSize.xyz); + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Vector dst = CONVERT_TO_VECTOR_STRUCT_NO_STEP(dst); + uint image_size = width * height; + uint tmp_out_offset = dst.current_offset + (((pos.x + pos.y * width + pos.z * image_size) * dst.stride_x) >> 2); + + STORE4(dst, tmp_out_offset, LOAD4(src, src.current_offset)); + +#ifdef HAS_BIAS + // If it is the last thread in the 3 dimensional workgroup + if(pos.x == (size.x - 1) && pos.y == (size.y - 1) && pos.z == (size.z - 1)) + { + tmp_out_offset += (dst.stride_x >> uint(2)); + STORE4(dst, tmp_out_offset, 1.f); + } +#endif // HAS_BIAS +} +#endif // IM2COL_REDUCED + +#ifdef COL2IM +/** This kernel performs a reshaping of the output of the convolution layer. + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +void main(void) +{ + uvec2 pos = uvec2(gl_GlobalInvocationID.xy); + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + uint idx = pos.x * dst.stride_z + (pos.y / width) * dst.stride_y + (pos.y % width) * dst.stride_x; + uint tmp_out_offset = dst.current_offset + (idx >> 2); + + STORE4(dst, tmp_out_offset, LOAD4(src, src.current_offset)); +} +#endif // COL2IM + +#else // DATA_TYPE_FP16 +#error Data type not supported +#endif // DATA_TYPE_FP16 diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs new file mode 100644 index 0000000000..3a31cb80a7 --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs @@ -0,0 +1,275 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; + +#include "helpers.h" + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(dst); + TENSOR3D_PARAM_DECLARATION(weights); +#ifdef BIAS + VECTOR_PARAM_DECLARATION(biases); +#endif /* BIAS */ + uint weights_stride_w; + uint weights_depth; +}; + +#if defined(DATA_TYPE_FP32) +precision highp float; + +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, writeonly); +BUFFER_DECLARATION(weights, 3, float, readonly); +#ifdef BIAS +BUFFER_DECLARATION(biases, 4, float, readonly); +#endif /* BIAS */ + +/** This kernel performs a direct convolution to convolve the low three dimensions. 
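+ * As a sketch of what the accumulation loop below computes (the notation is illustrative only; z is the output feature
+ * map selected by gl_GlobalInvocationID.z and d runs over the weights_depth input channels):
+ *   dst(x, y, z) = sum over d of src(x * STRIDE_X, y * STRIDE_Y, d) * weights(0, 0, d, z), plus biases(z) when biases are enabled.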
+ * + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" + * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1" + * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + * @param[in] weights_depth The third dimensions of the weights tensors + */ +void main() +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + +#ifdef BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); +#endif /* BIAS */ + + float pixels = CONVERT(0, float); + uint z_index = gl_GlobalInvocationID.z; + weights.current_offset += z_index * weights_stride_w >> 2; + float temp; + float temp_weight; + + for(int d = 0; d < int(weights_depth); ++d) + { + temp = LOAD4(src, CURRENT_OFFSET(src)); + temp_weight = LOAD4(weights, CURRENT_OFFSET(weights)); + pixels += temp * temp_weight; + + src.current_offset += (src_stride_z >> 2); + weights.current_offset += (weights_stride_z >> 2); + } + +#ifdef BIAS + pixels += LOAD4(biases, vector_offset(biases, int(z_index))); +#endif /* BIAS */ + + STORE4(dst, CURRENT_OFFSET(dst), pixels); +} +#elif defined(DATA_TYPE_FP16) +precision mediump float; + +BUFFER_DECLARATION(src, 1, uvec4, readonly); +BUFFER_DECLARATION(dst, 2, uvec4, writeonly); +BUFFER_DECLARATION(weights, 3, uint, readonly); +#ifdef BIAS +BUFFER_DECLARATION(biases, 4, uint, readonly); +#endif /* BIAS */ + +#if STRIDE_X == 2 +#define CONVOLVE(s, w) convolve_stride2(s, w) +#elif STRIDE_X == 1 /* STRIDE_X == 1 */ +#define CONVOLVE(s, w) convolve_stride1(s, w) +#else /* STRIDE_X not equals 1 or 2 */ +#error STRIDE_X larger than 2 is not supported +#endif /* STRIDE_X == 2 */ + +vec4[2] convolve_stride1(Image src, float w) +{ + uvec4 packed_s; + vec4 s[2]; + + GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0); + + s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y)); + s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w)); + + s[0] *= w; + s[1] *= w; + + return s; +} + +vec4[2] convolve_stride2(Image src, float w) +{ + uvec4 packed_s; + vec4 s[2]; + vec4 r[2]; + + GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0); + s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y)); + s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w)); + + r[0] = vec4(s[0].xz, s[1].xz); + + GC_LOAD1_2D_OFFSET(packed_s, src, 8, 0); + s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y)); + s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w)); + + r[1] = vec4(s[0].xz, s[1].xz); + + r[0] *= w; + r[1] *= w; + + return r; +} + +/** This kernel performs a direct convolution to convolve the low three dimensions. + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" + * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1" + * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + * @param[in] weights_depth The third dimensions of the weights tensors + */ +void main() +{ + Image src = GC_CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst); + +#ifdef BIAS + Vector biases = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); +#endif /* BIAS */ + + vec4 pixels[2]; + pixels[0] = vec4(0.f); + pixels[1] = vec4(0.f); + + uint z_index = gl_GlobalInvocationID.z; + + weights.current_offset += z_index * weights_stride_w; + + uint packed_w; + float w; + + for(int d = 0; d < int(weights_depth); ++d) + { + GC_LOAD1_3D_OFFSET(packed_w, weights, 0, 0, 0); + w = unpackHalf2x16(packed_w).x; + + vec4 r[2] = CONVOLVE(src, w); + pixels[0] += r[0]; + pixels[1] += r[1]; + + src.current_offset += src_stride_z; + weights.current_offset += weights_stride_z; + } + +#ifdef BIAS + uint packed_b; + float b; + + GC_LOAD1_1D_OFFSET(packed_b, biases, z_index); + + if(z_index % uint(2) == uint(0)) + { + b = unpackHalf2x16(packed_b).x; + } + else + { + b = unpackHalf2x16(packed_b).y; + } + + pixels[0] += vec4(b); + pixels[1] += vec4(b); +#endif /* BIAS */ + + uvec4 packed_d; + packed_d = uvec4(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw), + packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw)); + GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0); +} +#else /* DATA_TYPE_FP32 */ +#error Data type not supported +#endif /* DATA_TYPE_FP32 */ diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs new file mode 100644 index 0000000000..67b92cb8cf --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs @@ -0,0 +1,1583 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; + +#include "helpers.h" + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(dst); + TENSOR3D_PARAM_DECLARATION(weights); +#ifdef BIAS + VECTOR_PARAM_DECLARATION(biases); +#endif /* BIAS */ + uint weights_stride_w; + uint weights_depth; +}; + +#define LOAD12(r, name, offset) \ + r.x = LOAD4(name, offset); \ + r.y = LOAD4(name, offset + uint(1)); \ + r.z = LOAD4(name, offset + uint(2)) + +#define LOAD3X3(r, name) \ + r[0] = LOAD4(name, tensor3D_offset(name, 0, 0, 0)); \ + r[1] = LOAD4(name, tensor3D_offset(name, 1, 0, 0)); \ + r[2] = LOAD4(name, tensor3D_offset(name, 2, 0, 0)); \ + r[3] = LOAD4(name, tensor3D_offset(name, 0, 1, 0)); \ + r[4] = LOAD4(name, tensor3D_offset(name, 1, 1, 0)); \ + r[5] = LOAD4(name, tensor3D_offset(name, 2, 1, 0)); \ + r[6] = LOAD4(name, tensor3D_offset(name, 0, 2, 0)); \ + r[7] = LOAD4(name, tensor3D_offset(name, 1, 2, 0)); \ + r[8] = LOAD4(name, tensor3D_offset(name, 2, 2, 0)) + +#if defined(PROCESS_1_ELEMENT) +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, writeonly); +BUFFER_DECLARATION(weights, 3, float, readonly); +#ifdef BIAS +BUFFER_DECLARATION(biases, 4, float, readonly); +#endif /* BIAS */ + +/** This kernel performs a direct convolution to convolve the low three dimensions. + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" + * @note If biases are used then "define HAS_BIAS" has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + * @param[in] weights_depth The third dimensions of the weights tensors + */ +void main() +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + +#ifdef BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); +#endif /* BIAS */ + + float pixels = CONVERT(0, float); + + uint z_index = gl_GlobalInvocationID.z; + + weights.current_offset += z_index * weights_stride_w >> 2; + + for(int d = 0; d < int(weights_depth); ++d) + { + vec3 temp; + vec3 w; + + LOAD12(temp, src, offset(src, 0, 0)); + LOAD12(w, weights, tensor3D_offset(weights, 0, 0, 0)); + + pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2]; + + LOAD12(temp, src, offset(src, 0, 1)); + LOAD12(w, weights, tensor3D_offset(weights, 0, 1, 0)); + + pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2]; + + LOAD12(temp, src, offset(src, 0, 2)); + LOAD12(w, weights, tensor3D_offset(weights, 0, 2, 0)); + + pixels += temp.x * w[0] + temp.y * w[1] + temp.z * w[2]; + + src.current_offset += src_stride_z >> 2; + weights.current_offset += weights_stride_z >> 2; + } + +#ifdef BIAS + pixels += LOAD4(biases, vector_offset(biases, int(z_index))); +#endif /* BIAS */ + + STORE4(dst, CURRENT_OFFSET(dst), pixels); +} +#elif defined(PROCESS_8_ELEMENT) +BUFFER_DECLARATION(src, 1, vec4, readonly); +BUFFER_DECLARATION(dst, 2, vec4, writeonly); +BUFFER_DECLARATION(weights, 3, float, readonly); +#ifdef BIAS +BUFFER_DECLARATION(biases, 4, float, readonly); +#endif /* BIAS */ + +#if STRIDE_X == 2 +#define CONVOLVE1x3(offset, w) convolve1x3_stride2(offset, w) +#elif STRIDE_X == 1 /* STRIDE_X == 1 */ +#define CONVOLVE1x3(offset, w) convolve1x3_stride1(offset, w) +#else /* STRIDE_X not equals 1 or 2 */ +#error STRIDE_X larger than 2 is not supported +#endif /* STRIDE_X == 2 */ + +vec4[2] convolve1x3_stride1(uint offset, vec3 w) +{ + vec4 middle; + vec4 right; + vec4 tmp[3]; + vec4 r[2]; + + LOAD3(tmp, src, offset); + + middle = vec4(tmp[0].yzw, tmp[1].x); + right = vec4(tmp[0].zw, tmp[1].xy); + + r[0] = tmp[0] * w[0] + middle * w[1] + right * w[2]; + + middle = vec4(tmp[1].yzw, tmp[2].x); + right = vec4(tmp[1].zw, tmp[2].xy); + + r[1] = tmp[1] * w[0] + middle * w[1] + right * w[2]; + + return r; +} + +vec4[2] convolve1x3_stride2(uint offset, vec3 w) +{ + vec4 left; + vec4 
middle; + vec4 right; + vec4 tmp[3]; + vec4 r[2]; + + LOAD3(tmp, src, offset); + + left = vec4(tmp[0].xz, tmp[1].xz); + middle = vec4(tmp[0].yw, tmp[1].yw); + right = vec4(tmp[0].z, tmp[1].xz, tmp[2].x); + + r[0] = left * w[0] + middle * w[1] + right * w[2]; + + LOAD2(tmp, src, offset + ((uint(3) * src_stride_x) >> 2)); + + left = vec4(tmp[2].xz, tmp[0].xz); + middle = vec4(tmp[2].yw, tmp[0].yw); + right = vec4(tmp[2].z, tmp[0].xz, tmp[1].x); + + r[1] = left * w[0] + middle * w[1] + right * w[2]; + + return r; +} + +/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 8 elements at once + * + * @note This OpenGL ES shader works with stride_x = 1 and 2 + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" + * @note If biases are used then "define HAS_BIAS" has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + * @param[in] weights_depth The third dimensions of the weights tensors + */ +void main() +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + +#ifdef BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); +#endif /* BIAS */ + + vec4 pixels[2]; + pixels[0] = vec4(0); + pixels[1] = vec4(0); + + uint z_index = gl_GlobalInvocationID.z; + + weights.current_offset += z_index * weights_stride_w >> 2; + + for(int d = 0; d < int(weights_depth); ++d) + { + // load 3 weights once + vec3 w; + vec4 r[2]; + + // first line + LOAD3(w, weights, tensor3D_offset(weights, 0, 0, 0)); + + r = CONVOLVE1x3(src.current_offset >> uint(2), w); + pixels[0] += r[0]; + pixels[1] += r[1]; + + // second line + LOAD3(w, weights, tensor3D_offset(weights, 0, 1, 0)); + + r = CONVOLVE1x3((src.current_offset + (src_stride_y >> 2)) >> uint(2), w); + pixels[0] += r[0]; + pixels[1] += r[1]; + + // third line + LOAD3(w, weights, tensor3D_offset(weights, 0, 2, 0)); + + r = CONVOLVE1x3((src.current_offset + (src_stride_y >> 1)) >> uint(2), w); + pixels[0] += r[0]; + pixels[1] += r[1]; + + src.current_offset += src_stride_z >> 2; + weights.current_offset += weights_stride_z >> 2; + } + +#ifdef BIAS + float b; + LOAD1(b, biases, vector_offset(biases, int(z_index))); + pixels[0] += vec4(b); + pixels[1] += vec4(b); +#endif /* BIAS */ + + STORE2(dst, dst.current_offset >> uint(2), pixels); +} +#elif defined(PROCESS_4_ELEMENT) +BUFFER_DECLARATION(src, 1, vec4, readonly); +BUFFER_DECLARATION(dst, 2, vec4, writeonly); +BUFFER_DECLARATION(weights, 3, float, readonly); +#ifdef BIAS +BUFFER_DECLARATION(biases, 4, float, readonly); +#endif /* BIAS */ + +#if STRIDE_X == 2 +#define CONVOLVE1x3(offset, w) convolve1x3_stride2(offset, w) +#elif STRIDE_X == 1 /* STRIDE_X == 1 */ +#define CONVOLVE1x3(offset, w) convolve1x3_stride1(offset, w) +#else /* STRIDE_X not equals 1 or 2 */ +#error STRIDE_X larger than 2 is not supported +#endif /* STRIDE_X == 2 */ + +vec4 convolve1x3_stride1(uint offset, vec3 w) +{ + vec4 tmp[2]; + vec4 middle; + vec4 right; + + LOAD2(tmp, src, offset); + + middle = vec4(tmp[0].yzw, tmp[1].x); + right = vec4(tmp[0].zw, tmp[1].xy); + + tmp[1] = tmp[0] * w[0] + middle * w[1] + right * w[2]; + + return tmp[1]; +} + +vec4 convolve1x3_stride2(uint offset, vec3 w) +{ + vec4 left; + vec4 middle; + vec4 right; + + vec4 tmp[3]; + + LOAD3(tmp, src, offset); + + left = vec4(tmp[0].xz, tmp[1].xz); + middle = vec4(tmp[0].yw, tmp[1].yw); + right = vec4(tmp[0].z, tmp[1].xz, tmp[2].x); + + tmp[0] = left * w[0] + middle * w[1] + right * w[2]; + + return tmp[0]; +} + +/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 4 elements at once + * + * @note This OpenGL ES shader works with stride_x = 1 and 2 + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" + * @note If biases are used then "define HAS_BIAS" has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + * @param[in] weights_depth The third dimensions of the weights tensors + */ +void main() +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + +#ifdef BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); +#endif /* BIAS */ + + vec4 pixels; + pixels = vec4(0); + + uint z_index = gl_GlobalInvocationID.z; + + weights.current_offset += z_index * weights_stride_w >> 2; + + for(int d = 0; d < int(weights_depth); ++d) + { + // load 3 weights once + vec3 w; + + // first line + LOAD3(w, weights, tensor3D_offset(weights, 0, 0, 0)); + + pixels += CONVOLVE1x3(src.current_offset >> uint(2), w); + + // second line + LOAD3(w, weights, tensor3D_offset(weights, 0, 1, 0)); + + pixels += CONVOLVE1x3((src.current_offset + (src_stride_y >> 2)) >> uint(2), w); + + // third line + LOAD3(w, weights, tensor3D_offset(weights, 0, 2, 0)); + + pixels += CONVOLVE1x3((src.current_offset + (src_stride_y >> 1)) >> uint(2), w); + + src.current_offset += src_stride_z >> 2; + weights.current_offset += weights_stride_z >> 2; + } + +#ifdef BIAS + float b; + LOAD1(b, biases, vector_offset(biases, int(z_index))); + pixels += vec4(b); +#endif /* BIAS */ + + STORE1(dst, dst.current_offset >> uint(2), pixels); +} +#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS) +BUFFER_DECLARATION(src, 1, vec4, readonly); +BUFFER_DECLARATION(dst, 2, vec4, writeonly); +BUFFER_DECLARATION(weights, 3, float, readonly); +#ifdef BIAS +BUFFER_DECLARATION(biases, 4, float, readonly); +#endif /* BIAS */ + +#define CONVOLVE1x3(left, middle, right, w) convolve1x3_stride1(left, middle, right, w) + +vec4 convolve1x3_stride1(vec4 left, vec4 middle, vec4 right, vec3 w) +{ + vec4 r; + + r = left * w[0] + middle * w[1] + right * w[2]; + + return r; +} + +/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 4x3 elements at once + * + * @note This OpenGL ES shader works with stride_x = 1 and 2 + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" + * @note If biases are used then "define HAS_BIAS" has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + * @param[in] weights_depth The third dimensions of the weights tensors + */ +void main() +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + +#ifdef BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); +#endif /* BIAS */ + + vec4 pixels[3]; + pixels[0] = vec4(0); + pixels[1] = vec4(0); + pixels[2] = vec4(0); + + uint z_index = gl_GlobalInvocationID.z; + + weights.current_offset += z_index * weights_stride_w >> 2; + + for(int d = 0; d < int(weights_depth); ++d) + { + // load 3 weights once + vec3 w[3]; + + LOAD3(w[0], weights, tensor3D_offset(weights, 0, 0, 0)); + LOAD3(w[1], weights, tensor3D_offset(weights, 0, 1, 0)); + LOAD3(w[2], weights, tensor3D_offset(weights, 0, 2, 0)); + + vec4 s[2]; + vec4 middle; + vec4 right; + // first line + LOAD2(s, src, src.current_offset >> uint(2)); + middle = vec4(s[0].yzw, s[1].x); + right = vec4(s[0].zw, s[1].xy); + pixels[0] += CONVOLVE1x3(s[0], middle, right, w[0]); + + // second line + LOAD2(s, src, (src.current_offset + (src_stride_y >> 2)) >> uint(2)); + middle = vec4(s[0].yzw, s[1].x); + right = vec4(s[0].zw, s[1].xy); + pixels[0] += CONVOLVE1x3(s[0], middle, right, w[1]); + pixels[1] += CONVOLVE1x3(s[0], middle, right, w[0]); + + // third line + LOAD2(s, src, (src.current_offset + (src_stride_y >> 1)) >> uint(2)); + middle = vec4(s[0].yzw, s[1].x); + right = vec4(s[0].zw, s[1].xy); + pixels[0] += CONVOLVE1x3(s[0], middle, right, w[2]); + pixels[1] += CONVOLVE1x3(s[0], middle, right, w[1]); + 
pixels[2] += CONVOLVE1x3(s[0], middle, right, w[0]); + + // forth line + LOAD2(s, src, (src.current_offset + (uint(3) * (src_stride_y >> 2))) >> uint(2)); + middle = vec4(s[0].yzw, s[1].x); + right = vec4(s[0].zw, s[1].xy); + pixels[1] += CONVOLVE1x3(s[0], middle, right, w[2]); + pixels[2] += CONVOLVE1x3(s[0], middle, right, w[1]); + + // fifth line + LOAD2(s, src, (src.current_offset + (src_stride_y)) >> uint(2)); + middle = vec4(s[0].yzw, s[1].x); + right = vec4(s[0].zw, s[1].xy); + pixels[2] += CONVOLVE1x3(s[0], middle, right, w[2]); + + src.current_offset += src_stride_z >> 2; + weights.current_offset += weights_stride_z >> 2; + } + +#ifdef BIAS + float b; + LOAD1(b, biases, vector_offset(biases, int(z_index))); + + pixels[0] += vec4(b); + pixels[1] += vec4(b); + pixels[2] += vec4(b); +#endif /* BIAS */ + + STORE1(dst, dst.current_offset >> uint(2), pixels[0]); + STORE1(dst, (dst.current_offset + (dst_stride_y >> 2)) >> uint(2), pixels[1]); + STORE1(dst, (dst.current_offset + (dst_stride_y >> 1)) >> uint(2), pixels[2]); +} +#elif defined(PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16) +precision mediump float; + +BUFFER_DECLARATION(src, 1, uvec4, readonly); +BUFFER_DECLARATION(dst, 2, uvec4, writeonly); +BUFFER_DECLARATION(weights, 3, uint, readonly); +#ifdef BIAS +BUFFER_DECLARATION(biases, 4, uint, readonly); +#endif /* BIAS */ + +#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w) + +vec4[2] convolve1x3_stride1(vec4 tmp[3], vec3 w) +{ + vec4 middle; + vec4 right; + vec4 r[2]; + + middle = vec4(tmp[0].yzw, tmp[1].x); + right = vec4(tmp[0].zw, tmp[1].xy); + + r[0] = tmp[0] * w[0] + middle * w[1] + right * w[2]; + + middle = vec4(tmp[1].yzw, tmp[2].x); + right = vec4(tmp[1].zw, tmp[2].xy); + + r[1] = tmp[1] * w[0] + middle * w[1] + right * w[2]; + + return r; +} + +vec4[3] load_and_unpack(uint offset) +{ + uvec4 packed_s[2]; + vec4 s[3]; + + LOAD1(packed_s[0], src, offset); + LOAD1(packed_s[1], src, offset + uint(1)); + ; + + s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y)); + s[1] = vec4(unpackHalf2x16(packed_s[0].z), unpackHalf2x16(packed_s[0].w)); + s[2] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y)); + + return s; +} + +/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 8x3 elements at once + * + * @note This OpenGL ES shader works with stride_x = 1 and 2 + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" + * @note If biases are used then "define HAS_BIAS" has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + * @param[in] weights_depth The third dimensions of the weights tensors + */ +void main() +{ + Image src = CONVERT_TO_IMAGE_STRUCT_FP16(src); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + +#ifdef BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases); +#endif /* BIAS */ + + uvec2 packed_d[2]; + uvec4 vd; + + vec4 pixels[3][2]; + int i, j; + for(i = 0; i < 3; i++) + { + for(j = 0; j < 2; j++) + { + pixels[i][j] = vec4(0); + } + } + + uint z_index = gl_GlobalInvocationID.z; + + weights.current_offset += z_index * weights_stride_w; + + for(int d = 0; d < int(weights_depth); ++d) + { + // load 3 weights once + uvec2 packed_w[3]; + + LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2); + LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2); + LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2); + + vec3 w[3]; + w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x); + w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x); + w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x); + + uvec4 packed_s[2]; + vec4 s[3]; + vec4 r[2]; + uint offset; + // first line + offset = src.current_offset >> uint(4); + s = load_and_unpack(offset); + + r = CONVOLVE1x3(s, w[0]); + pixels[0][0] += r[0]; + pixels[0][1] += r[1]; + + // second line + offset = (src.current_offset + src_stride_y) >> uint(4); + s = load_and_unpack(offset); + + r = CONVOLVE1x3(s, w[1]); + 
pixels[0][0] += r[0]; + pixels[0][1] += r[1]; + r = CONVOLVE1x3(s, w[0]); + pixels[1][0] += r[0]; + pixels[1][1] += r[1]; + + // third line + offset = (src.current_offset + (src_stride_y << 1)) >> uint(4); + s = load_and_unpack(offset); + + r = CONVOLVE1x3(s, w[2]); + pixels[0][0] += r[0]; + pixels[0][1] += r[1]; + r = CONVOLVE1x3(s, w[1]); + pixels[1][0] += r[0]; + pixels[1][1] += r[1]; + r = CONVOLVE1x3(s, w[0]); + pixels[2][0] += r[0]; + pixels[2][1] += r[1]; + + // forth line + offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(4); + s = load_and_unpack(offset); + + r = CONVOLVE1x3(s, w[2]); + pixels[1][0] += r[0]; + pixels[1][1] += r[1]; + r = CONVOLVE1x3(s, w[1]); + pixels[2][0] += r[0]; + pixels[2][1] += r[1]; + + // fifth line + offset = (src.current_offset + (src_stride_y << 2)) >> uint(4); + s = load_and_unpack(offset); + + r = CONVOLVE1x3(s, w[2]); + pixels[2][0] += r[0]; + pixels[2][1] += r[1]; + + src.current_offset += src_stride_z; + weights.current_offset += weights_stride_z; + } + +#ifdef BIAS + uint packed_b; + float b; + LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2); + + if(z_index % uint(2) == uint(0)) + { + b = unpackHalf2x16(packed_b).x; + } + else + { + b = unpackHalf2x16(packed_b).y; + } + + for(i = 0; i < 3; i++) + { + for(j = 0; j < 2; j++) + { + pixels[i][j] += vec4(b); + } + } +#endif /* BIAS */ + + packed_d[0] = uvec2(packHalf2x16(pixels[0][0].xy), packHalf2x16(pixels[0][0].zw)); + packed_d[1] = uvec2(packHalf2x16(pixels[0][1].xy), packHalf2x16(pixels[0][1].zw)); + vd = uvec4(packed_d[0], packed_d[1]); + STORE1(dst, dst.current_offset >> uint(4), vd); + + packed_d[0] = uvec2(packHalf2x16(pixels[1][0].xy), packHalf2x16(pixels[1][0].zw)); + packed_d[1] = uvec2(packHalf2x16(pixels[1][1].xy), packHalf2x16(pixels[1][1].zw)); + vd = uvec4(packed_d[0], packed_d[1]); + STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(4), vd); + + packed_d[0] = uvec2(packHalf2x16(pixels[2][0].xy), packHalf2x16(pixels[2][0].zw)); + packed_d[1] = uvec2(packHalf2x16(pixels[2][1].xy), packHalf2x16(pixels[2][1].zw)); + vd = uvec4(packed_d[0], packed_d[1]); + STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(4), vd); +} +#elif defined(PROCESS_X_4ELEMENTS_FP16) +precision mediump float; + +BUFFER_DECLARATION(src, 1, uvec2, readonly); +BUFFER_DECLARATION(dst, 2, uvec2, writeonly); +BUFFER_DECLARATION(weights, 3, uint, readonly); +#ifdef BIAS +BUFFER_DECLARATION(biases, 4, uint, readonly); +#endif /* BIAS */ + +#if STRIDE_X == 2 +#define CONVOLVE1x3(s, w) convolve1x3_stride2(s, w) +#define LOAD_AND_UNPACK(offset) load_and_unpack_stride2(offset) +#elif STRIDE_X == 1 /* STRIDE_X == 1 */ +#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w) +#define LOAD_AND_UNPACK(offset) load_and_unpack_stride1(offset) +#else /* STRIDE_X not equals 1 or 2 */ +#error STRIDE_X larger than 2 is not supported +#endif /* STRIDE_X == 2 */ + +vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w) +{ + vec4 middle; + vec4 right; + vec4 r; + + middle = vec4(tmp[0].yzw, tmp[1].x); + right = vec4(tmp[0].zw, tmp[1].xy); + + r = tmp[0] * w[0] + middle * w[1] + right * w[2]; + + return r; +} + +vec4 convolve1x3_stride2(vec4 tmp[3], vec3 w) +{ + vec4 left; + vec4 middle; + vec4 right; + vec4 r; + + left = vec4(tmp[0].xz, tmp[1].xz); + middle = vec4(tmp[0].yw, tmp[1].yw); + right = vec4(tmp[0].z, tmp[1].xz, tmp[2].x); + + r = left * w[0] + middle * w[1] + right * w[2]; + + return r; +} + +vec4[2] load_and_unpack_stride1(uint offset) +{ + uvec2 packed_s[2]; + vec4 s[2]; + + 
LOAD1(packed_s[0], src, offset); + LOAD1(packed_s[1], src, offset + uint(1)); + + s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y)); + s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y)); + + return s; +} + +vec4[3] load_and_unpack_stride2(uint offset) +{ + uvec2 packed_s[3]; + vec4 s[3]; + + LOAD1(packed_s[0], src, offset); + LOAD1(packed_s[1], src, offset + uint(1)); + LOAD1(packed_s[2], src, offset + uint(2)); + + s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y)); + s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y)); + s[2] = vec4(unpackHalf2x16(packed_s[2].x), unpackHalf2x16(packed_s[2].y)); + + return s; +} + +/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 4 elements at once + * + * @note This OpenGL ES shader works with stride_x = 1 and 2 + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" + * @note If biases are used then "define HAS_BIAS" has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + * @param[in] weights_depth The third dimensions of the weights tensors + */ +void main() +{ + Image src = CONVERT_TO_IMAGE_STRUCT_FP16(src); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + +#ifdef BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases); +#endif /* BIAS */ + + uvec2 packed_d; + + vec4 pixels = vec4(0); + + uint z_index = gl_GlobalInvocationID.z; + + weights.current_offset += z_index * weights_stride_w; + + for(int d = 0; d < int(weights_depth); ++d) + { + // load 3 weights once + uvec2 packed_w[3]; + + LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2); + LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2); + LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2); + + vec3 w[3]; + w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x); + w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x); + w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x); + +#if STRIDE_X == 2 + vec4 s[3]; +#elif STRIDE_X == 1 /* STRIDE_X == 1 */ + vec4 s[2]; +#else /* STRIDE_X not equals 1 or 2 */ +#error STRIDE_X larger than 2 is not supported +#endif /* STRIDE_X == 2 */ + vec4 r; + uint offset; + // first line + offset = src.current_offset >> uint(3); + s = LOAD_AND_UNPACK(offset); + + pixels += CONVOLVE1x3(s, w[0]); + + // second line + offset = (src.current_offset + src_stride_y) >> uint(3); + s = LOAD_AND_UNPACK(offset); + + pixels += CONVOLVE1x3(s, w[1]); + + // third line + offset = (src.current_offset + (src_stride_y << 1)) >> uint(3); + s = LOAD_AND_UNPACK(offset); + + pixels += CONVOLVE1x3(s, w[2]); + + src.current_offset += src_stride_z; + weights.current_offset += weights_stride_z; + } + +#ifdef BIAS + uint packed_b; + float b; + LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2); + + if(z_index % uint(2) == uint(0)) + { + b = unpackHalf2x16(packed_b).x; + } + else + { + b = unpackHalf2x16(packed_b).y; + } + + pixels += vec4(b); +#endif /* BIAS */ + + packed_d = uvec2(packHalf2x16(pixels.xy), packHalf2x16(pixels.zw)); + STORE1(dst, dst.current_offset >> uint(3), packed_d); +} +#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16) +precision mediump float; + +BUFFER_DECLARATION(src, 1, uvec2, readonly); +BUFFER_DECLARATION(dst, 2, uvec2, writeonly); +BUFFER_DECLARATION(weights, 3, uint, readonly); +#ifdef BIAS +BUFFER_DECLARATION(biases, 4, uint, readonly); +#endif /* BIAS */ + +#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w) + +vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w) +{ + vec4 middle; + vec4 right; + vec4 r; + + middle = vec4(tmp[0].yzw, tmp[1].x); + right = vec4(tmp[0].zw, tmp[1].xy); + + r = tmp[0] * w[0] + middle * w[1] + right * w[2]; + + return r; +} + +vec4[2] load_and_unpack(uint offset) +{ + uvec2 packed_s[2]; + vec4 s[2]; + + LOAD1(packed_s[0], src, offset); + LOAD1(packed_s[1], src, offset + uint(1)); + + s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y)); + s[1] = vec4(unpackHalf2x16(packed_s[1].x), 
unpackHalf2x16(packed_s[1].y)); + + return s; +} + +/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 4x3 elements at once + * + * @note This OpenGL ES shader works with stride_x = 1 and 2 + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" + * @note If biases are used then "define HAS_BIAS" has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + * @param[in] weights_depth The third dimensions of the weights tensors + */ +void main() +{ + Image src = CONVERT_TO_IMAGE_STRUCT_FP16(src); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + +#ifdef BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases); +#endif /* BIAS */ + + uvec2 packed_d; + + vec4 pixels[3]; + int i; + + for(i = 0; i < 3; i++) + { + pixels[i] = vec4(0); + } + + uint z_index = gl_GlobalInvocationID.z; + + weights.current_offset += z_index * weights_stride_w; + + for(int d = 0; d < int(weights_depth); ++d) + { + // load 3 weights once + uvec2 packed_w[3]; + + LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2); + LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2); + LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2); + + vec3 w[3]; + w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x); + w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x); + w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x); + + vec4 s[2]; + vec4 r; + uint offset; + // first line + offset = src.current_offset >> uint(3); + s = load_and_unpack(offset); + + pixels[0] += CONVOLVE1x3(s, w[0]); + + // second line + offset = (src.current_offset + src_stride_y) >> uint(3); + s = load_and_unpack(offset); + + pixels[0] += CONVOLVE1x3(s, w[1]); + pixels[1] += CONVOLVE1x3(s, w[0]); + + // third line + offset = (src.current_offset + (src_stride_y << 1)) >> uint(3); + s = load_and_unpack(offset); + + pixels[0] += CONVOLVE1x3(s, w[2]); + pixels[1] += CONVOLVE1x3(s, w[1]); + pixels[2] += CONVOLVE1x3(s, w[0]); + + // forth line + offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(3); + s = load_and_unpack(offset); + + pixels[1] += CONVOLVE1x3(s, w[2]); + pixels[2] += CONVOLVE1x3(s, w[1]); + + // fifth line + offset = (src.current_offset + (src_stride_y << 2)) >> uint(3); + s = load_and_unpack(offset); + + pixels[2] += CONVOLVE1x3(s, w[2]); + + src.current_offset += src_stride_z; + weights.current_offset += weights_stride_z; + } + +#ifdef BIAS + uint packed_b; + float b; + LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2); + + if(z_index % uint(2) == uint(0)) + { + b = unpackHalf2x16(packed_b).x; + } + else + { + b = unpackHalf2x16(packed_b).y; + } + + for(i = 0; i < 3; i++) + { + pixels[i] += vec4(b); + } +#endif /* BIAS */ + + packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw)); + STORE1(dst, dst.current_offset >> uint(3), packed_d); + + packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw)); + STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d); + + packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw)); + STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(3), packed_d); +} +#elif defined(PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16) +precision mediump float; + +BUFFER_DECLARATION(src, 1, uvec2, readonly); +BUFFER_DECLARATION(dst, 2, uvec2, writeonly); +BUFFER_DECLARATION(weights, 3, 
uint, readonly); +#ifdef BIAS +BUFFER_DECLARATION(biases, 4, uint, readonly); +#endif /* BIAS */ + +#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w) + +vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w) +{ + vec4 middle; + vec4 right; + vec4 r; + + middle = vec4(tmp[0].yzw, tmp[1].x); + right = vec4(tmp[0].zw, tmp[1].xy); + + r = tmp[0] * w[0] + middle * w[1] + right * w[2]; + + return r; +} + +vec4[2] load_and_unpack(uint offset) +{ + uvec2 packed_s[2]; + vec4 s[2]; + + LOAD1(packed_s[0], src, offset); + LOAD1(packed_s[1], src, offset + uint(1)); + + s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y)); + s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y)); + + return s; +} + +/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 4x4 elements at once + * + * @note This OpenGL ES shader works with stride_x = 1 and 2 + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" + * @note If biases are used then "define HAS_BIAS" has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + * @param[in] weights_depth The third dimensions of the weights tensors + */ +void main() +{ + Image src = CONVERT_TO_IMAGE_STRUCT_FP16(src); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + +#ifdef BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases); +#endif /* BIAS */ + + uvec2 packed_d; + + vec4 pixels[4]; + int i; + + for(i = 0; i < 4; i++) + { + pixels[i] = vec4(0); + } + + uint z_index = gl_GlobalInvocationID.z; + + weights.current_offset += z_index * weights_stride_w; + + for(int d = 0; d < int(weights_depth); ++d) + { + // load 3 weights once + uvec2 packed_w[3]; + + LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2); + LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2); + LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2); + + vec3 w[3]; + w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x); + w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x); + w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x); + + vec4 s[2]; + vec4 r; + uint offset; + // first line + offset = src.current_offset >> uint(3); + s = load_and_unpack(offset); + + pixels[0] += CONVOLVE1x3(s, w[0]); + + // second line + offset = (src.current_offset + src_stride_y) >> uint(3); + s = load_and_unpack(offset); + + pixels[0] += CONVOLVE1x3(s, w[1]); + pixels[1] += CONVOLVE1x3(s, w[0]); + + // third line + offset = (src.current_offset + (src_stride_y << 1)) >> uint(3); + s = load_and_unpack(offset); + + pixels[0] += CONVOLVE1x3(s, w[2]); + pixels[1] += CONVOLVE1x3(s, w[1]); + pixels[2] += CONVOLVE1x3(s, w[0]); + + // forth line + offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(3); + s = load_and_unpack(offset); + + pixels[1] += CONVOLVE1x3(s, w[2]); + pixels[2] += CONVOLVE1x3(s, w[1]); + pixels[3] += CONVOLVE1x3(s, w[0]); + + // fifth line + offset = (src.current_offset + (src_stride_y << 2)) >> uint(3); + s = load_and_unpack(offset); + + pixels[2] += CONVOLVE1x3(s, w[2]); + pixels[3] += CONVOLVE1x3(s, w[1]); + + // sixth line + offset = (src.current_offset + uint(5) * (src_stride_y)) >> uint(3); + s = load_and_unpack(offset); + + pixels[3] += CONVOLVE1x3(s, w[2]); + + src.current_offset += src_stride_z; + weights.current_offset += weights_stride_z; + } + +#ifdef BIAS + uint packed_b; + float b; + LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2); + + if(z_index % uint(2) == uint(0)) + { + b = unpackHalf2x16(packed_b).x; + } + else + { + b = unpackHalf2x16(packed_b).y; + } + + for(i = 0; i < 4; i++) + { + pixels[i] += vec4(b); + } +#endif /* BIAS */ + + packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw)); + STORE1(dst, dst.current_offset >> uint(3), packed_d); + + packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw)); + STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d); + + packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw)); + STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> 
uint(3), packed_d); + + packed_d = uvec2(packHalf2x16(pixels[3].xy), packHalf2x16(pixels[3].zw)); + STORE1(dst, (dst.current_offset + uint(3) * (dst_stride_y)) >> uint(3), packed_d); +} +#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16) +precision mediump float; + +BUFFER_DECLARATION(src, 1, uvec2, readonly); +BUFFER_DECLARATION(dst, 2, uvec2, writeonly); +BUFFER_DECLARATION(weights, 3, uint, readonly); +#ifdef BIAS +BUFFER_DECLARATION(biases, 4, uint, readonly); +#endif /* BIAS */ + +#define CONVOLVE1x3(s, w) convolve1x3_stride1(s, w) + +vec4 convolve1x3_stride1(vec4 tmp[2], vec3 w) +{ + vec4 middle; + vec4 right; + vec4 r; + + middle = vec4(tmp[0].yzw, tmp[1].x); + right = vec4(tmp[0].zw, tmp[1].xy); + + r = tmp[0] * w[0] + middle * w[1] + right * w[2]; + + return r; +} + +vec4[2] load_and_unpack(uint offset) +{ + uvec2 packed_s[2]; + vec4 s[2]; + + LOAD1(packed_s[0], src, offset); + LOAD1(packed_s[1], src, offset + uint(1)); + + s[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y)); + s[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y)); + + return s; +} + +/** An optimized direct convolution 3x3 OpenGL ES compute shader for process 4x3x2 elements at once + * + * @note This OpenGL ES shader works with stride_x = 1 and 2 + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" + * @note If biases are used then "define HAS_BIAS" has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr + * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes) + * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes) + * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes) + * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in]  biases_ptr                            Pointer to the biases tensor. Same as @p src_ptr + * @param[in]  biases_stride_x                       Stride of the biases tensor in X dimension (in bytes) + * @param[in]  biases_step_x                         biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in]  biases_offset_first_element_in_bytes  The offset of the first element in the biases tensor + * @param[in]  weights_stride_w                      Stride of the weights tensor in the 4th dimension + * @param[in]  weights_depth                         The third dimension of the weights tensor + */ +void main() +{ + Image src = CONVERT_TO_IMAGE_STRUCT_FP16(src); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(weights); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + +#ifdef BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(biases); +#endif /* BIAS */ + + uvec2 packed_d; + + vec4 pixels[3]; + int i; + + uint z_base_index = gl_GlobalInvocationID.z << 1; + + // store original src current offset + uint s_offset = src.current_offset; + + weights.current_offset += z_base_index * weights_stride_w; + + for(int z = 0; z < 2; ++z) + { + uint z_index = z_base_index + uint(z); + + src.current_offset = s_offset; + //weights.current_offset = z_index * weights_stride_w; + + for(i = 0; i < 3; i++) + { + pixels[i] = vec4(0); + } + + for(int d = 0; d < int(weights_depth); ++d) + { + // load 3 weights once + uvec2 packed_w[3]; + + LOAD2(packed_w[0], weights, tensor3D_offset_fp16(weights, 0, 0, 0) >> 2); + LOAD2(packed_w[1], weights, tensor3D_offset_fp16(weights, 0, 1, 0) >> 2); + LOAD2(packed_w[2], weights, tensor3D_offset_fp16(weights, 0, 2, 0) >> 2); + + vec3 w[3]; + w[0] = vec3(unpackHalf2x16(packed_w[0].x), unpackHalf2x16(packed_w[0].y).x); + w[1] = vec3(unpackHalf2x16(packed_w[1].x), unpackHalf2x16(packed_w[1].y).x); + w[2] = vec3(unpackHalf2x16(packed_w[2].x), unpackHalf2x16(packed_w[2].y).x); + + vec4 s[2]; + vec4 r; + uint offset; + // first line + offset = src.current_offset >> uint(3); + s = load_and_unpack(offset); + + pixels[0] += CONVOLVE1x3(s, w[0]); + + // second line + offset = (src.current_offset + src_stride_y) >> uint(3); + s = load_and_unpack(offset); + + pixels[0] += CONVOLVE1x3(s, w[1]); + pixels[1] += CONVOLVE1x3(s, w[0]); + + // third line + offset = (src.current_offset + (src_stride_y << 1)) >> uint(3); + s = load_and_unpack(offset); + + pixels[0] += CONVOLVE1x3(s, w[2]); + pixels[1] += CONVOLVE1x3(s, w[1]); + pixels[2] += CONVOLVE1x3(s, w[0]); + + // fourth line + offset = (src.current_offset + uint(3) * (src_stride_y)) >> uint(3); + s = load_and_unpack(offset); + + pixels[1] += CONVOLVE1x3(s, w[2]); + pixels[2] += CONVOLVE1x3(s, w[1]); + + // fifth line + offset = (src.current_offset + (src_stride_y << 2)) >> uint(3); + s = load_and_unpack(offset); + + pixels[2] += CONVOLVE1x3(s, w[2]); + + 
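// move to the next input channel: step both src and weights one slice along Z +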
src.current_offset += src_stride_z; + weights.current_offset += weights_stride_z; + } + +#ifdef BIAS + uint packed_b; + float b; + LOAD1(packed_b, biases, vector_offset_fp16(biases, int(z_index)) >> 2); + + if(z_index % uint(2) == uint(0)) + { + b = unpackHalf2x16(packed_b).x; + } + else + { + b = unpackHalf2x16(packed_b).y; + } + + for(i = 0; i < 3; i++) + { + pixels[i] += vec4(b); + } +#endif /* BIAS */ + + packed_d = uvec2(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw)); + STORE1(dst, dst.current_offset >> uint(3), packed_d); + + packed_d = uvec2(packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw)); + STORE1(dst, (dst.current_offset + dst_stride_y) >> uint(3), packed_d); + + packed_d = uvec2(packHalf2x16(pixels[2].xy), packHalf2x16(pixels[2].zw)); + STORE1(dst, (dst.current_offset + (dst_stride_y << 1)) >> uint(3), packed_d); + + dst.current_offset += dst_stride_z; + } +} +#endif /* PROCESS_1_ELEMENT */ diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs new file mode 100644 index 0000000000..4fdbf0d19e --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; + +#include "helpers.h" + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(dst); + TENSOR3D_PARAM_DECLARATION(weights); +#ifdef BIAS + VECTOR_PARAM_DECLARATION(biases); +#endif /* BIAS */ + uint weights_stride_w; + uint weights_depth; +}; + +#ifdef DATA_TYPE_FP32 + +precision highp float; + +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, writeonly); +BUFFER_DECLARATION(weights, 3, float, readonly); +#ifdef BIAS +BUFFER_DECLARATION(biases, 4, float, readonly); +#endif /* BIAS */ + +#define LOAD20(r, name, offset) \ + r[0] = LOAD4(name, offset); \ + r[1] = LOAD4(name, offset + uint(1)); \ + r[2] = LOAD4(name, offset + uint(2)); \ + r[3] = LOAD4(name, offset + uint(3)); \ + r[4] = LOAD4(name, offset + uint(4)) + +/** This kernel performs a direct convolution to convolve the low three dimensions. 
+ * + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" + * @note If biases are used then "define HAS_BIAS" has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + * @param[in] weights_depth The third dimensions of the weights tensors + */ +void main() +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + +#ifdef BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); +#endif /* BIAS */ + + float pixels = CONVERT(0, float); + uint z_index = gl_GlobalInvocationID.z; + weights.current_offset += z_index * weights_stride_w >> 2; + float temp[5]; + float temp_weight[5]; + + for(int d = 0; d < int(weights_depth); ++d) + { + LOAD20(temp, src, offset(src, 0, 0)); + LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 0, 0)); + pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4]; + + LOAD20(temp, src, offset(src, 0, 1)); + LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 1, 0)); + pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4]; + + LOAD20(temp, src, offset(src, 0, 2)); + LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 2, 0)); + pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4]; + + LOAD20(temp, src, offset(src, 0, 3)); + LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 3, 0)); + pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4]; + + LOAD20(temp, src, offset(src, 0, 4)); + LOAD20(temp_weight, weights, tensor3D_offset(weights, 0, 4, 0)); + pixels += temp[0] * temp_weight[0] + temp[1] * temp_weight[1] + temp[2] * temp_weight[2] + temp[3] * temp_weight[3] + temp[4] * temp_weight[4]; + + src.current_offset += (src_stride_z >> 2); + weights.current_offset += (weights_stride_z >> 2); + } + +#ifdef BIAS + pixels += LOAD4(biases, vector_offset(biases, int(z_index))); +#endif /* BIAS */ + + STORE4(dst, CURRENT_OFFSET(dst), pixels); +} + +#elif defined(DATA_TYPE_FP16) + +precision mediump float; + +BUFFER_DECLARATION(src, 1, uvec2, readonly); +BUFFER_DECLARATION(dst, 2, uvec2, writeonly); +BUFFER_DECLARATION(weights, 3, uint, readonly); +#ifdef BIAS +BUFFER_DECLARATION(biases, 4, uint, readonly); +#endif /* BIAS */ + +#if STRIDE_X == 1 +#define LOAD_SRC(src, row) load_src_stride1(src, row) +#define CONVOLVE1x5(src, weight) convolve1x5_stride1(src, weight) +#elif STRIDE_X == 2 /* STRIDE_X == 1 */ +#define LOAD_SRC(src, row) load_src_stride2(src, row) +#define CONVOLVE1x5(src, weight) convolve1x5_stride2(src, weight) +#else /* STRDIDE_X == 1 */ +#error STRIDE_X larger than 2 is not supported +#endif /* STRIDE_X == 1 */ + +vec4[2] load_src_stride1(Image src, int row) +{ + uvec2 packed[2]; + vec4 ret[2]; + + GC_LOAD2_2D_OFFSET(packed, src, 0, row); + + ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y)); + ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y)); + + return ret; +} + +vec4[3] load_src_stride2(Image src, int row) +{ + 
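// stride 2: load 3 packed uvec2 (12 fp16 values) so one invocation can still produce 4 outputs +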
uvec2 packed[3]; + vec4 ret[3]; + + GC_LOAD3_2D_OFFSET(packed, src, 0, row); + + ret[0] = vec4(unpackHalf2x16(packed[0].x), unpackHalf2x16(packed[0].y)); + ret[1] = vec4(unpackHalf2x16(packed[1].x), unpackHalf2x16(packed[1].y)); + ret[2] = vec4(unpackHalf2x16(packed[2].x), unpackHalf2x16(packed[2].y)); + + return ret; +} + +vec2[3] load_weight(Tensor3D weights, int row) +{ + uvec3 packed_w; + vec2 ret[3]; + + GC_LOAD3_3D_OFFSET(packed_w, weights, 0, row, 0); + + ret[0] = vec2(unpackHalf2x16(packed_w[0])); + ret[1] = vec2(unpackHalf2x16(packed_w[1])); + ret[2] = vec2(unpackHalf2x16(packed_w[2])); + + return ret; +} + +// output 4 element per thread +vec4 convolve1x5_stride1(vec4 tmp[2], vec2 w[3]) +{ + vec4 src0 = tmp[0]; + vec4 src1 = vec4(tmp[0].yzw, tmp[1].x); + vec4 src2 = vec4(tmp[0].zw, tmp[1].xy); + vec4 src3 = vec4(tmp[0].w, tmp[1].xyz); + vec4 src4 = tmp[1]; + vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x; + + return ret; +} + +vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3]) +{ + vec4 src0 = vec4(tmp[0].xz, tmp[1].xz); + vec4 src1 = vec4(tmp[0].yw, tmp[1].yw); + vec4 src2 = vec4(tmp[0].z, tmp[1].xz, tmp[2].x); + vec4 src3 = vec4(tmp[0].w, tmp[1].yw, tmp[2].y); + vec4 src4 = vec4(tmp[1].x, tmp[1].z, tmp[2].xz); + vec4 ret = src0 * w[0].x + src1 * w[0].y + src2 * w[1].x + src3 * w[1].y + src4 * w[2].x; + + return ret; +} + +/** This kernel performs a direct convolution to convolve the low three dimensions. + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" + * @note If biases are used then "define HAS_BIAS" has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] weights_ptr Pointer to the weights tensor. 
Supported data types: same as @p src_ptr + * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + * @param[in] weights_depth The third dimensions of the weights tensors + */ +void main() +{ + Image src = GC_CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); + Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst); + +#ifdef BIAS + Vector biases = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); +#endif /* BIAS */ + + vec4 res = vec4(0); + vec2 w[3]; + vec4 s[STRIDE_X + 1]; + uvec2 packed_d; + uint z_index = gl_GlobalInvocationID.z; + + weights.current_offset += z_index * weights_stride_w; + + for(int d = 0; d < int(weights_depth); ++d) + { + for(int row = 0; row < 5; row++) + { + w = load_weight(weights, row); + s = LOAD_SRC(src, row); + res += CONVOLVE1x5(s, w); + } + + src.current_offset += src_stride_z; + weights.current_offset += weights_stride_z; + } + +#ifdef BIAS + uint packed_b; + float b; + + GC_LOAD1_1D_OFFSET(packed_b, biases, z_index); + b = (z_index % uint(2) == uint(0)) ? unpackHalf2x16(packed_b).x : unpackHalf2x16(packed_b).y; + res += vec4(b); +#endif /* BIAS */ + + packed_d = uvec2(packHalf2x16(res.xy), packHalf2x16(res.zw)); + GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0); +} + +#else /* DATA_TYPE_FP16 */ +#error Data type not supported +#endif /* DATA_TYPE_FP16 */ diff --git a/src/core/GLES_COMPUTE/cs_shaders/dropout.cs b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs new file mode 100644 index 0000000000..54e08b1306 --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/dropout.cs @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; + +#include "helpers.h" + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(mask); + TENSOR3D_PARAM_DECLARATION(dst); +}; + +uint hash(uint x) +{ + x += (x << 10u); + x ^= (x >> 6u); + x += (x << 3u); + x ^= (x >> 11u); + x += (x << 15u); + return x; +} + +uint hash(uvec3 v) +{ + return hash(v.x ^ hash(v.y) ^ hash(v.z)); +} + +float float_construct(uint m) +{ + const uint ieee_mantissa = 0x007FFFFFu; + const uint ieee_one = 0x3F800000u; + + m &= ieee_mantissa; + m |= ieee_one; + + float f = uintBitsToFloat(m); + return f - 1.0; +} + +float rand(vec3 v, float seed) +{ + return float_construct(hash(floatBitsToUint(v + seed))); +} + +#ifdef DATA_TYPE_FP32 + +precision highp float; + +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(mask, 2, float, ); +BUFFER_DECLARATION(dst, 3, float, writeonly); + +/** Dropout is used to improve over-fit on neural networks. + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32" + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] mask_ptr Pointer to the mask tensor. Supported data types: same as @p src_ptr + * @param[in] mask_stride_x Stride of the mask tensor in X dimension (in bytes) + * @param[in] mask_step_x mask_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mask_stride_y Stride of the mask tensor in Y dimension (in bytes) + * @param[in] mask_step_y mask_stride_y * number of elements along y processed per workitem(in bytes) + * @param[in] mask_stride_z Stride of the mask tensor in Z dimension (in bytes) + * @param[in] mask_step_z mask_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] mask_offset_first_element_in_bytes The offset of the first element in the mask tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes) + * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes) + * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes) + * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor + */ +void main(void) +{ + Tensor3D src = GC_CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D mask = GC_CONVERT_TO_TENSOR3D_STRUCT(mask); + Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst); + + float random = 0.f; + float inputv = 0.f; + float maskv = 0.f; + float outputv = 0.f; + +#ifdef FORWARD + random = rand(vec3(gl_GlobalInvocationID.xyz), SEED); + maskv = (random > RATIO) ? 1.f : 0.f; + GC_STORE1_3D_OFFSET(maskv, mask, 0, 0, 0); +#else /* FORWARD */ + GC_LOAD1_3D_OFFSET(maskv, mask, 0, 0, 0); +#endif /* FORWARD */ + + GC_LOAD1_3D_OFFSET(inputv, src, 0, 0, 0); + outputv = maskv * inputv * float(SCALE); + GC_STORE1_3D_OFFSET(outputv, dst, 0, 0, 0); +} + +#elif defined(DATA_TYPE_FP16) + +precision mediump float; + +BUFFER_DECLARATION(src, 1, uint, readonly); +BUFFER_DECLARATION(mask, 2, uint, ); +BUFFER_DECLARATION(dst, 3, uint, writeonly); + +/** Dropout is used to reduce over-fitting in neural networks. + * + * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16" + * + * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: F16 + * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes) + * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes) + * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes) + * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor + * @param[out] mask_ptr                           Pointer to the mask tensor. Supported data types: same as @p src_ptr + * @param[in]  mask_stride_x                      Stride of the mask tensor in X dimension (in bytes) + * @param[in]  mask_step_x                        mask_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in]  mask_stride_y                      Stride of the mask tensor in Y dimension (in bytes) + * @param[in]  mask_step_y                        mask_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in]  mask_stride_z                      Stride of the mask tensor in Z dimension (in bytes) + * @param[in]  mask_step_z                        mask_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in]  mask_offset_first_element_in_bytes The offset of the first element in the mask tensor + * @param[out] dst_ptr                            Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +void main(void) +{ + Tensor3D src = GC_CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D mask = GC_CONVERT_TO_TENSOR3D_STRUCT(mask); + Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst); + + float random1 = 0.f; + float random2 = 0.f; + uint inputv = uint(0); + uint outputv = uint(0); + uint maskv = uint(0); + vec2 input_vec = vec2(0, 0); + vec2 output_vec = vec2(0, 0); + vec2 mask_vec = vec2(0, 0); + +#ifdef FORWARD + random1 = rand(vec3(gl_GlobalInvocationID.xyz), SEED); + random2 = rand(vec3(float(gl_GlobalInvocationID.x) + 0.5f, gl_GlobalInvocationID.yz), SEED); + mask_vec.x = (random1 > RATIO) ? 1.f : 0.f; + mask_vec.y = (random2 > RATIO) ? 1.f : 0.f; + maskv = packHalf2x16(mask_vec); + GC_STORE1_3D_OFFSET(maskv, mask, 0, 0, 0); +#else /* FORWARD */ + GC_LOAD1_3D_OFFSET(maskv, mask, 0, 0, 0); + mask_vec = unpackHalf2x16(maskv); +#endif /* FORWARD */ + + GC_LOAD1_3D_OFFSET(inputv, src, 0, 0, 0); + + input_vec = unpackHalf2x16(inputv); + output_vec = mask_vec * input_vec * float(SCALE); + outputv = packHalf2x16(output_vec); + + GC_STORE1_3D_OFFSET(outputv, dst, 0, 0, 0); +} + +#else /* DATA_TYPE_FP32 */ + +#endif /* DATA_TYPE_FP32 */ diff --git a/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs b/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs new file mode 100644 index 0000000000..01a39866c7 --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/fill_border.cs @@ -0,0 +1,553 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; +#include "helpers.h" + +#if defined(DATA_TYPE_FP32) +#ifdef FILL_IMAGE_BORDERS_REPLICATE +BUFFER_DECLARATION(buf, 1, float, restrict); +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(buf); + uint width; + uint height; + int start_pos_x; + int start_pos_y; +}; + +/** Fill N pixel of the padding edge of a single channel image by replicating the closest valid pixel. + * + * @attention The border size for top, bottom, left, right needs to be passed at the compile time. + * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2 + * + * @param[in,out] buf_ptr Pointer to the source image. Supported data types: F32 + * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] buf_stride_z Stride between images if batching images (in bytes) + * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] width Width of the valid region of the image + * @param[in] height Height of the valid region of the image + * @param[in] start_pos_x X coordinate indicating the start point of the valid region + * @param[in] start_pos_y Y coordinate indicating the start point of the valid region + */ +void main() +{ + Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf); + + // Update pointer to point to the starting point of the valid region + buf.current_offset = uint(int(buf.current_offset) + ((start_pos_y * int(buf_stride_y) + start_pos_x * int(buf_stride_x)) >> 2)); + + int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT; + int gid0 = int(gl_GlobalInvocationID.x); + int gidH = gid0 - total_width; + int gidW = gid0 - BORDER_SIZE_LEFT; + + if(gidH >= 0) + { + // Handle left border + float left_val = LOAD4(buf, offset(buf, 0, gidH)); + for(int i = -BORDER_SIZE_LEFT; i < 0; ++i) + { + STORE4(buf, offset(buf, i, gidH), left_val); + } + // Handle right border + float right_val = LOAD4(buf, offset(buf, int(width) - 1, gidH)); + for(int i = 0; i < BORDER_SIZE_RIGHT; ++i) + { + STORE4(buf, offset(buf, int(width) + i, gidH), right_val); + } + } + else + { + // Get value for corners + int val_idx = gidW; + if(gidW < 0 || gidW > (int(width) - 1)) + { + val_idx = gidW < 0 ? 0 : int(width) - 1; + } + + // Handle top border + float top_val = LOAD4(buf, offset(buf, val_idx, 0)); + for(int i = -BORDER_SIZE_TOP; i < 0; ++i) + { + STORE4(buf, offset(buf, gidW, i), top_val); + } + // Handle bottom border + float bottom_val = LOAD4(buf, offset(buf, val_idx, int(height) - 1)); + for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i) + { + STORE4(buf, offset(buf, gidW, int(height) + i), bottom_val); + } + } +} +#endif /* FILL_IMAGE_BORDERS_REPLICATE */ + +#ifdef FILL_IMAGE_BORDERS_CONSTANT +BUFFER_DECLARATION(buf, 1, float, writeonly); +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(buf); + uint width; + uint height; + int start_pos_x; + int start_pos_y; + float constant_value; +}; + +/** Fill N pixels of the padding edge of a single channel image with a constant value. 
+ * + * @attention The border size for top, bottom, left, right needs to be passed at the compile time. + * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2 + * + * @param[out] buf_ptr Pointer to the source image. Supported data types: F32 + * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] width Width of the valid region of the image + * @param[in] height Height of the valid region of the image + * @param[in] start_pos_x X coordinate indicating the start point of the valid region + * @param[in] start_pos_y Y coordinate indicating the start point of the valid region + * @param[in] constant_value Constant value to use to fill the edges + */ +void main() +{ + Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf); + + // Update pointer to point to the starting point of the valid region + buf.current_offset = uint(int(buf.current_offset) + ((start_pos_y * int(buf_stride_y) + start_pos_x * int(buf_stride_x)) >> 2)); + + int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT; + int gid0 = int(gl_GlobalInvocationID.x); + int gidH = gid0 - total_width; + int gidW = gid0 - BORDER_SIZE_LEFT; + + if(gidH >= 0) + { + // Handle left border + for(int i = -BORDER_SIZE_LEFT; i < 0; ++i) + { + STORE1(buf, offset(buf, i, gidH), constant_value); + } + // Handle right border + for(int i = 0; i < BORDER_SIZE_RIGHT; ++i) + { + STORE1(buf, offset(buf, int(width) + i, gidH), constant_value); + } + } + else + { + // Handle top border + for(int i = -BORDER_SIZE_TOP; i < 0; ++i) + { + STORE1(buf, offset(buf, gidW, i), constant_value); + } + // Handle bottom border + for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i) + { + STORE1(buf, offset(buf, gidW, int(height) + i), constant_value); + } + } +} +#endif /* FILL_IMAGE_BORDERS_CONSTANT */ + +#elif defined(DATA_TYPE_FP16) +precision mediump float; + +#ifdef FILL_IMAGE_BORDERS_REPLICATE +BUFFER_DECLARATION(buf, 1, uint, restrict); +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(buf); + uint width; + uint height; + int start_pos_x; + int start_pos_y; +}; + +void set_replicate(uint offset, int pos, uint replicate_value) +{ + uint packed_b; + LOAD1(packed_b, buf, offset); + + vec2 b = unpackHalf2x16(packed_b); + vec2 c = unpackHalf2x16(replicate_value); + + if(pos % 2 == 0) + { + b.x = c.y; + } + else + { + b.y = c.x; + } + + packed_b = packHalf2x16(b); + + STORE1(buf, offset, packed_b); +} + +/** Fill N pixel of the padding edge of a single channel image by replicating the closest valid pixel. + * + * @attention The border size for top, bottom, left, right needs to be passed at the compile time. + * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2 + * + * @param[in,out] buf_ptr Pointer to the source image. 
Supported data types: F16 + * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] buf_stride_z Stride between images if batching images (in bytes) + * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] width Width of the valid region of the image + * @param[in] height Height of the valid region of the image + * @param[in] start_pos_x X coordinate indicating the start point of the valid region + * @param[in] start_pos_y Y coordinate indicating the start point of the valid region + */ +void main() +{ + Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP_FP16(buf); + + // Update pointer to point to the starting point of the valid region + buf.current_offset = uint(buf.current_offset + uint(start_pos_y) * buf_stride_y + uint(start_pos_x) * buf_stride_x); + + int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT; + int gid0 = int(gl_GlobalInvocationID.x); + int gidH = gid0 - total_width; + int gidW = gid0 - BORDER_SIZE_LEFT; + + if(gidH >= 0) + { + // Handle left border + uint left_val; + LOAD1(left_val, buf, offset_fp16(buf, 0, gidH) >> uint(2)); + for(int i = -BORDER_SIZE_LEFT; i < 0; ++i) + { + uint offset = offset_fp16(buf, i, gidH) >> 2; + int pos = i + BORDER_SIZE_LEFT; + if(i == -1) + { + if(pos % 2 == 0) + { + set_replicate(offset, pos, left_val); + } + } + else + { + if(pos % 2 == 0) + { + vec2 a = unpackHalf2x16(left_val); + uint b = packHalf2x16(a.xx); + STORE1(buf, offset, b); + } + } + } + // Handle right border + uint right_val; + LOAD1(right_val, buf, offset_fp16(buf, int(width) - 1, gidH) >> uint(2)); + for(int i = 0; i < BORDER_SIZE_RIGHT; ++i) + { + uint offset = offset_fp16(buf, int(width) + i, gidH) >> 2; + int pos = i + BORDER_SIZE_LEFT + int(width); + + if(i == 0) + { + if(pos % 2 == 0) + { + vec2 a = unpackHalf2x16(right_val); + uint b = packHalf2x16(a.yy); + STORE1(buf, offset, b); + } + else + { + set_replicate(offset, pos, right_val); + } + } + else + { + if(pos % 2 == 0) + { + vec2 a = unpackHalf2x16(right_val); + uint b = packHalf2x16(a.yy); + STORE1(buf, offset, b); + } + } + } + } + else + { + // Get value for corners + int val_idx = gidW; + if(gidW < 0 || (gidW > (int(width) - 1))) + { + val_idx = gidW < 0 ? 
0 : (int(width) - 1); + } + + // Handle top border + uint top_val; + LOAD1(top_val, buf, offset_fp16(buf, val_idx, 0) >> uint(2)); + for(int i = -BORDER_SIZE_TOP; i < 0; ++i) + { + uint offset = offset_fp16(buf, gidW, i) >> 2; + + if(gid0 % 2 == 0) + { + if(gidW == (int(width) - 1)) + { + vec2 a = unpackHalf2x16(top_val); + uint b = packHalf2x16(a.xx); + STORE1(buf, offset, b); + } + else + { + if(gidW < 0) + { + vec2 a = unpackHalf2x16(top_val); + uint b; + if(BORDER_SIZE_LEFT % 2 == 0) + { + b = packHalf2x16(a.xx); + } + else + { + b = packHalf2x16(a.yy); + } + STORE1(buf, offset, b); + } + else if(gidW >= int(width)) + { + vec2 a = unpackHalf2x16(top_val); + uint b; + if((BORDER_SIZE_LEFT + int(width)) % 2 == 0) + { + b = packHalf2x16(a.yy); + } + STORE1(buf, offset, b); + } + else + { + STORE1(buf, offset, top_val); + } + } + } + } + // Handle bottom border + uint bottom_val; + LOAD1(bottom_val, buf, offset_fp16(buf, val_idx, int(height) - 1) >> uint(2)); + for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i) + { + uint offset = offset_fp16(buf, gidW, int(height) + i) >> 2; + + if(gid0 % 2 == 0) + { + if(gidW == (int(width) - 1)) + { + vec2 a = unpackHalf2x16(bottom_val); + uint b = packHalf2x16(a.xx); + STORE1(buf, offset, b); + } + else + { + if(gidW < 0) + { + vec2 a = unpackHalf2x16(bottom_val); + uint b; + if(BORDER_SIZE_LEFT % 2 == 0) + { + b = packHalf2x16(a.xx); + } + else + { + b = packHalf2x16(a.yy); + } + STORE1(buf, offset, b); + } + else if(gidW >= int(width)) + { + vec2 a = unpackHalf2x16(bottom_val); + uint b; + if((BORDER_SIZE_LEFT + int(width)) % 2 == 0) + { + b = packHalf2x16(a.yy); + } + STORE1(buf, offset, b); + } + else + { + STORE1(buf, offset, bottom_val); + } + } + } + } + } +} +#endif /* FILL_IMAGE_BORDERS_REPLICATE */ + +#ifdef FILL_IMAGE_BORDERS_CONSTANT +BUFFER_DECLARATION(buf, 1, uint, restrict); + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(buf); + uint width; + uint height; + int start_pos_x; + int start_pos_y; + float constant_value; +}; + +void set_constant(uint offset, int pos) +{ + uint packed_b; + LOAD1(packed_b, buf, offset); + + vec2 b = unpackHalf2x16(packed_b); + + if(pos % 2 == 0) + { + b.x = constant_value; + } + else + { + b.y = constant_value; + } + + packed_b = packHalf2x16(b); + + STORE1(buf, offset, packed_b); +} + +/** Fill N pixels of the padding edge of a single channel image with a constant value. + * + * @attention The border size for top, bottom, left, right needs to be passed at the compile time. + * e.g. BORDER_SIZE_TOP=0 BORDER_SIZE_BOTTOM=2 BORDER_SIZE_LEFT=0 BORDER_SIZE_RIGHT=2 + * + * @param[out] buf_ptr Pointer to the source image. 
Supported data types: F16 + * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] width Width of the valid region of the image + * @param[in] height Height of the valid region of the image + * @param[in] start_pos_x X coordinate indicating the start point of the valid region + * @param[in] start_pos_y Y coordinate indicating the start point of the valid region + * @param[in] constant_value Constant value to use to fill the edges + */ +void main() +{ + Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP_FP16(buf); + + int total_width = BORDER_SIZE_LEFT + int(width) + BORDER_SIZE_RIGHT; + int gid0 = int(gl_GlobalInvocationID.x); + int gidH = gid0 - total_width; + int gidW = gid0 - BORDER_SIZE_LEFT; + + // Update pointer to point to the starting point of the valid region + buf.current_offset = uint(int(buf.current_offset) + ((start_pos_y * int(buf_stride_y) + start_pos_x * int(buf_stride_x)))); + + vec2 b = vec2(constant_value, constant_value); + + uint packed_b = packHalf2x16(b); + + if(gidH >= 0) + { + // Handle left border + for(int i = -BORDER_SIZE_LEFT; i < 0; ++i) + { + uint offset = offset_fp16(buf, i, gidH) >> 2; + int pos = i + BORDER_SIZE_LEFT; + + if(i == -1) + { + if(pos % 2 == 0) + { + set_constant(offset, pos); + } + } + else + { + if(pos % 2 == 0) + { + STORE1(buf, offset, packed_b); + } + } + } + // Handle right border + for(int i = 0; i < BORDER_SIZE_RIGHT; ++i) + { + uint offset = offset_fp16(buf, int(width) + i, gidH) >> 2; + int pos = i + BORDER_SIZE_LEFT + int(width); + + if(i == 0) + { + if(pos % 2 == 0) + { + STORE1(buf, offset, packed_b); + } + else + { + set_constant(offset, pos); + } + } + else + { + if(pos % 2 == 0) + { + STORE1(buf, offset, packed_b); + } + } + } + } + else + { + // Handle top border + for(int i = -BORDER_SIZE_TOP; i < 0; ++i) + { + uint offset = offset_fp16(buf, gidW, i) >> 2; + + if(gid0 % 2 == 0) + { + STORE1(buf, offset, packed_b); + } + } + // Handle bottom border + for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i) + { + uint offset = offset_fp16(buf, gidW, int(height) + i) >> 2; + + if(gid0 % 2 == 0) + { + STORE1(buf, offset, packed_b); + } + } + } +} +#endif /* FILL_IMAGE_BORDERS_CONSTANT */ +#endif /* DATA_TYPE_FP32 */ diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs new file mode 100755 index 0000000000..3313b88718 --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs @@ -0,0 +1,623 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; +#include "helpers.h" + +#if defined(DATA_TYPE_FP32) +#define LOAD8(r, name, offset) \ + r.x = LOAD4(name, offset); \ + r.y = LOAD4(name, offset + uint(1)) + +#define LOAD16(r, name, offset) \ + r.x = LOAD4(name, offset); \ + r.y = LOAD4(name, offset + uint(1)); \ + r.z = LOAD4(name, offset + uint(2)); \ + r.w = LOAD4(name, offset + uint(3)) + +#define STORE16(name, offset, r) \ + STORE4(name, offset, r.x); \ + STORE4(name, offset + uint(1), r.y); \ + STORE4(name, offset + uint(2), r.z); \ + STORE4(name, offset + uint(3), r.w) + +#ifdef GEMM_TRANSPOSE1xW +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, writeonly); + +layout(std140) uniform shader_params +{ + IMAGE_PARAM_DECLARATION(src); + IMAGE_PARAM_DECLARATION(dst); +}; + +/** This OpenGL ES kernel computes the "vector" 1x4 transposition of input matrix + * + * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32 + * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + */ +void main(void) +{ + /* Compute address for Matrix B - source */ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + /* Compute address for Matrix B transposed - destination. X and Y are swapped */ + uint dst_addr_in_bytes = (gl_GlobalInvocationID.y * uint(16) + gl_GlobalInvocationID.x * dst.stride_y + dst.offset_first_element_in_bytes) >> 2; + vec4 b0; + LOAD16(b0, src, offset(src, 0, 0)); + STORE16(dst, dst_addr_in_bytes, b0); +} +#endif /* GEMM_TRANSPOSE1xW */ + +#ifdef GEMM_INTERLEAVE4x4 +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, writeonly); + +layout(std140) uniform shader_params +{ + IMAGE_PARAM_DECLARATION(src); + IMAGE_PARAM_DECLARATION(dst); +}; + +/** This OpenGLES kernel reshapes the input matrix interleaving the values + * + * @param[in] src_ptr Pointer to the source matrix. 
Supported data types: F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    /* Compute source and destination addresses */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    int i;
+    int j;
+
+    for(i = 0; i < 4; ++i)
+    {
+        for(j = 0; j < 4; ++j)
+        {
+            float res     = LOAD4(src, offset(src, i, j));
+            uint  offset0 = CURRENT_OFFSET(dst) + uint(i * 4 + j);
+            STORE4(dst, offset0, res);
+        }
+    }
+}
+#endif /* GEMM_INTERLEAVE4x4 */
+
+#ifdef GEMM_ACCUMULATE_BIASES
+BUFFER_DECLARATION(accum, 1, float, restrict);
+BUFFER_DECLARATION(biases, 2, float, readonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(accum);
+    VECTOR_PARAM_DECLARATION(biases);
+};
+
+/** This kernel accumulates each row with the biases vector
+ *
+ * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F32
+ * @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes)
+ * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] accum_stride_y Stride of the accumulate tensor in Y dimension (in bytes)
+ * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor
+ * @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr
+ * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+void main(void)
+{
+    Image  accum  = CONVERT_TO_IMAGE_STRUCT(accum);
+    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+    for(int i = 0; i < 16; ++i)
+    {
+        float accum_value  = LOAD4(accum, CURRENT_OFFSET(accum) + uint(i));
+        float biases_value = LOAD4(biases, CURRENT_OFFSET(biases) + uint(i));
+        accum_value        = biases_value + accum_value;
+
+        // Store result in the accumulate buffer
+        STORE4(accum, CURRENT_OFFSET(accum) + uint(i), accum_value);
+    }
+}
+#endif /* GEMM_ACCUMULATE_BIASES */
+
+#ifdef GEMM_MM_INTERLEAVED_TRANSPOSED /* not validated */
+BUFFER_DECLARATION(src0, 1, float, readonly);
+BUFFER_DECLARATION(src1, 2, float, readonly);
+BUFFER_DECLARATION(dst, 3, float, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src0);
+    IMAGE_PARAM_DECLARATION(src1);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
+ *
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix.
Supported data types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + */ +void main() +{ + Image src0 = CONVERT_TO_IMAGE_STRUCT(src0); + Image src1 = CONVERT_TO_IMAGE_STRUCT(src1); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + /* Compute address for matrix A and B */ + src0.current_offset = (src0.offset_first_element_in_bytes + (uint(gl_GlobalInvocationID.y) * uint(src0.stride_y))) >> uint(2); + src1.current_offset = (src1.offset_first_element_in_bytes + (uint(gl_GlobalInvocationID.x) * uint(src1.stride_y))) >> uint(2); + + /* Compute end row address for matrix B */ + int end_row_mtx_b = int(src1.current_offset) + int(COLS_B); + + /* Reset accumulators */ + vec4 c00 = vec4(0.0f); + vec4 c10 = vec4(0.0f); + vec4 c20 = vec4(0.0f); + vec4 c30 = vec4(0.0f); + + // FIXME: loop unrolling really needed for GLES? 
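+    // The loop below is unrolled by a factor of two: each iteration loads two packs of four
+    // values from the interleaved matrix A and two packs of four values from the transposed
+    // matrix B, accumulating a 4x4 block of the result in c00..c30. The second loop consumes
+    // any remaining data one pack of four values at a time.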
+ for(; int(src1.current_offset) <= (end_row_mtx_b - 8); src0.current_offset += uint(8), src1.current_offset += uint(8)) + { + /* Load values from matrix A (interleaved) and matrix B (transposed) */ + vec4 a0; + vec4 b0; + LOAD16(a0, src0, src0.current_offset); + LOAD16(b0, src1, src1.current_offset); + + c00 += vec4(a0.x) * b0; + c10 += vec4(a0.y) * b0; + c20 += vec4(a0.z) * b0; + c30 += vec4(a0.w) * b0; + + /* Load values from matrix A (interleaved) and matrix B (transposed) */ + LOAD16(a0, src0, src0.current_offset + uint(4)); + LOAD16(b0, src1, src1.current_offset + uint(4)); + + c00 += vec4(a0.x) * b0; + c10 += vec4(a0.y) * b0; + c20 += vec4(a0.z) * b0; + c30 += vec4(a0.w) * b0; + } + + for(; int(src1.current_offset) < end_row_mtx_b; src0.current_offset += uint(4), src1.current_offset += uint(4)) + { + /* Load values from matrix A (interleaved) and matrix B (transposed) */ + vec4 a0; + vec4 b0; + LOAD16(a0, src0, src0.current_offset); + LOAD16(b0, src1, src1.current_offset); + + c00 += vec4(a0.x) * b0; + c10 += vec4(a0.y) * b0; + c20 += vec4(a0.z) * b0; + c30 += vec4(a0.w) * b0; + } + + /* Multiply by the weight of matrix product */ + c00 = c00 * vec4(ALPHA); + c10 = c10 * vec4(ALPHA); + c20 = c20 * vec4(ALPHA); + c30 = c30 * vec4(ALPHA); + + /* Store 4x4 block */ + STORE16(dst, offset(dst, 0, 0), c00); + STORE16(dst, offset(dst, 0, 1), c10); + STORE16(dst, offset(dst, 0, 2), c20); + STORE16(dst, offset(dst, 0, 3), c30); +} +#endif /* GEMM_MM_INTERLEAVED_TRANSPOSED */ + +#ifdef GEMM_MM_FLOATING_POINT +BUFFER_DECLARATION(src0, 1, float, readonly); +BUFFER_DECLARATION(src1, 2, float, readonly); +BUFFER_DECLARATION(dst, 3, float, writeonly); + +layout(std140) uniform shader_params +{ + IMAGE_PARAM_DECLARATION(src0); + IMAGE_PARAM_DECLARATION(src1); + IMAGE_PARAM_DECLARATION(dst); +}; + +/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) + * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication + * + * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. 
Supported data types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + */ +void main() +{ + Image src0 = CONVERT_TO_IMAGE_STRUCT(src0); + Image src1 = CONVERT_TO_IMAGE_STRUCT(src1); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X); + /* Compute the address for the vector A and matrix B */ + src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y)) >> uint(2); + src1.current_offset = (src1_offset_first_element_in_bytes + uint(idx * 4)) >> uint(2); + + /* Compute end row address for matrix A */ + int end_row_vec_a = int(src0.current_offset) + ((COLS_A * 4) >> 2); + + /* Reset accumulators */ + vec4 acc0 = vec4(0.0f); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + vec4 acc1 = vec4(0.0f); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + vec4 acc2 = vec4(0.0f); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + vec4 acc3 = vec4(0.0f); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + for(; int(src0.current_offset) <= (end_row_vec_a - 2); src0.current_offset += uint(2), src1.current_offset += uint((2 * int(src1_stride_y)) >> 2)) + { + vec2 a0; + LOAD8(a0, src0, src0.current_offset); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + vec2 a1; + LOAD8(a1, src0, src0.current_offset + (src0_stride_y >> uint(2))); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + vec2 a2; + LOAD8(a2, src0, src0.current_offset + ((uint(2) * src0_stride_y) >> uint(2))); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + vec2 a3; + LOAD8(a3, src0, src0.current_offset + ((uint(3) * src0_stride_y) >> uint(2))); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + vec4 b0; + vec4 b1; + LOAD16(b0, src1, src1.current_offset); + LOAD16(b1, src1, src1.current_offset + (src1_stride_y >> uint(2))); + + acc0 += b0 * vec4(a0.x); + acc0 += b1 * vec4(a0.y); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += b0 * vec4(a1.x); + acc1 += b1 * vec4(a1.y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += b0 * vec4(a2.x); + acc2 += b1 * vec4(a2.y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += b0 * vec4(a3.x); + acc3 += b1 * vec4(a3.y); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + for(; 
int(src0.current_offset) < end_row_vec_a; src0.current_offset += uint(1), src1.current_offset += uint(int(src1_stride_y) >> 2)) + { + // Load values from matrix A + float a0; + a0 = LOAD4(src0, src0.current_offset); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + float a1; + a1 = LOAD4(src0, src0.current_offset + ((uint(1) * src0_stride_y) >> uint(2))); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + float a2; + a2 = LOAD4(src0, src0.current_offset + ((uint(2) * src0_stride_y) >> uint(2))); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + float a3; + a3 = LOAD4(src0, src0.current_offset + ((uint(3) * src0_stride_y) >> uint(2))); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + + vec4 b0; + LOAD16(b0, src1, src1.current_offset); + + acc0 += b0 * vec4(a0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 += b0 * vec4(a1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 += b0 * vec4(a2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 += b0 * vec4(a3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + } + + /* Multiply by the weight of vector-matrix product */ + acc0 = acc0 * vec4(ALPHA); + STORE16(dst, offset(dst, 0, 0), acc0); +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 + acc1 = acc1 * vec4(ALPHA); + STORE16(dst, offset(dst, 0, 1), acc1); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 + acc2 = acc2 * vec4(ALPHA); + STORE16(dst, offset(dst, 0, 2), acc2); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2 +#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 + acc3 = acc3 * vec4(ALPHA); + STORE16(dst, offset(dst, 0, 3), acc3); +#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3 +} +#endif /* GEMM_MM_FLOATING_POINT */ + +#ifdef GEMM_MATRIXADDITION +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, restrict); + +layout(std140) uniform shader_params +{ + IMAGE_PARAM_DECLARATION(src); + IMAGE_PARAM_DECLARATION(dst); +}; + +/** This OpenGL ES kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta: + * + * @attention The beta's value need to be passed at compile time using BETA + * + * @param[in] src_ptr Pointer to the source matrix. 
Supported data types: F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+void main(void)
+{
+    /* Compute source and destination addresses */
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    /* Load values from A x B */
+    vec4 alpha_ab;
+    vec4 c;
+    vec4 out1;
+
+    LOAD16(alpha_ab, dst, dst.current_offset);
+    LOAD16(c, src, src.current_offset);
+
+    /* Computes alpha * axb + beta * c */
+    out1 = alpha_ab + vec4(BETA * c);
+
+    /* Store final result in axb matrix */
+    STORE16(dst, dst.current_offset, out1);
+}
+#endif /* GEMM_MATRIXADDITION */
+#elif defined(DATA_TYPE_FP16)
+precision mediump float;
+#ifdef GEMM_MM_FLOATING_POINT
+BUFFER_DECLARATION(src0, 1, uint, readonly);
+BUFFER_DECLARATION(src1, 2, uvec2, readonly);
+BUFFER_DECLARATION(dst, 3, uvec2, writeonly);
+
+layout(std140) uniform shader_params
+{
+    IMAGE_PARAM_DECLARATION(src0);
+    IMAGE_PARAM_DECLARATION(src1);
+    IMAGE_PARAM_DECLARATION(dst);
+};
+
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
+ *
+ * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix.
Supported data types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + */ +void main() +{ + Image src0 = GC_CONVERT_TO_IMAGE_STRUCT(src0); + Image src1 = GC_CONVERT_TO_IMAGE_STRUCT(src1); + Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst); + + int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X); + /* Compute the address for the vector A and matrix B */ + src0.current_offset = (src0_offset_first_element_in_bytes + uint(gl_GlobalInvocationID.y) * src0_stride_y * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y)); + src1.current_offset = src1_offset_first_element_in_bytes + uint(idx) * src1_stride_x; + + /* Compute end row address for matrix A */ + uint end_row_vec_a = src0.current_offset + uint(COLS_A << 1); + + /* Reset accumulators */ + vec4 acc0 = vec4(0.0f); + + for(; src0.current_offset < (end_row_vec_a - uint(2)); src0.current_offset += uint(2 * 2), src1.current_offset += uint(2) * src1_stride_y) + { + uint packed_a0; + vec2 a0; + + GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 0); + a0 = vec2(unpackHalf2x16(packed_a0)); + + uvec2 packed_b0; + uvec2 packed_b1; + vec4 b0; + vec4 b1; + + GC_LOAD1_2D_OFFSET(packed_b0, src1, 0, 0); + GC_LOAD1_2D_OFFSET(packed_b1, src1, 0, 1); + + b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y)); + b1 = vec4(unpackHalf2x16(packed_b1.x), unpackHalf2x16(packed_b1.y)); + + acc0 += b0 * vec4(a0.x); + acc0 += b1 * vec4(a0.y); + } + + for(; src0.current_offset < end_row_vec_a; src0.current_offset += uint(2 * 2), src1.current_offset += src1_stride_y) + { + uint packed_a0; + vec2 a0; + + GC_LOAD1_2D_OFFSET(packed_a0, src0, 0, 0); + a0 = vec2(unpackHalf2x16(packed_a0)); + + uvec2 packed_b0; + vec4 b0; + + GC_LOAD1_2D_OFFSET(packed_b0, src1, 0, 0); + + b0 = vec4(unpackHalf2x16(packed_b0.x), unpackHalf2x16(packed_b0.y)); + + acc0 += b0 * (a0.x); + } + + /* Multiply by the weight of vector-matrix product */ + acc0 = acc0 * vec4(ALPHA); + + uvec2 packed_d; + packed_d = uvec2(packHalf2x16(acc0.xy), packHalf2x16(acc0.zw)); + GC_STORE1_2D_OFFSET(packed_d, dst, 0, 0); +} +#endif /* GEMM_MM_FLOATING_POINT */ + +#ifdef GEMM_ACCUMULATE_BIASES +BUFFER_DECLARATION(accum, 1, uvec2, restrict); +BUFFER_DECLARATION(biases, 2, uvec2, readonly); + +layout(std140) uniform shader_params +{ + IMAGE_PARAM_DECLARATION(accum); + VECTOR_PARAM_DECLARATION(biases); +}; + +/** This kernel accumulates each row with the biases vector + * + * @param[in, out] accum_ptr Pointer to the accumulate tensor. 
Supported data type: F16 + * @param[in] accum_stride_x Stride of the accmulate tensor in X dimension (in bytes) + * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes) + * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor + * @param[in] biases_ptr Pointer to the biases vector. Same as @p accum_ptr + * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +void main(void) +{ + Image accum = GC_CONVERT_TO_IMAGE_STRUCT(accum); + Vector biases = GC_CONVERT_TO_VECTOR_STRUCT(biases); + + vec4 u[2]; + uvec2 packed_s[2]; + GC_LOAD1_2D_OFFSET(packed_s[0], accum, 0, 0); + GC_LOAD1_1D_OFFSET(packed_s[1], biases, 0); + u[0] = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y)); + u[1] = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y)); + + vec4 tmp; + tmp = u[0] + u[1]; + packed_s[0] = uvec2(packHalf2x16(tmp.xy), packHalf2x16(tmp.zw)); + GC_STORE1_2D_OFFSET(packed_s[0], accum, 0, 0); +} +#endif /* GEMM_ACCUMULATE_BIASES */ +#else /* DATA_TYPE_F32 */ +#error Data type not supported +#endif /* DATA_TYPE_F32 */ diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers.h b/src/core/GLES_COMPUTE/cs_shaders/helpers.h new file mode 100644 index 0000000000..86dedf5a9c --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/helpers.h @@ -0,0 +1,582 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef ARM_COMPUTE_HELPER_H +#define ARM_COMPUTE_HELPER_H + +#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) + +#define VEC_DATA_TYPE_STR(type, size) type##size +#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size) + +#define CONVERT(x, type) type(x) + +#define PACK(value, stype, dtype) \ + pack_##stype##_##dtype(value) + +#define UNPACK(value, stype, dtype) \ + unpack_##stype##_##dtype(value) + +#define BUFFER_DECLARATION(name, location, type, access) \ + layout(std430, binding = location) access buffer name##Buffer \ + { \ + type name##_ptr[]; \ + } + +#define VECTOR_PARAM_DECLARATION(name) \ + uint name##_stride_x; \ + uint name##_step_x; \ + uint name##_offset_first_element_in_bytes; \ + uint name##_buffer_data_type_size + +#define IMAGE_PARAM_DECLARATION(name) \ + uint name##_stride_x; \ + uint name##_step_x; \ + uint name##_stride_y; \ + uint name##_step_y; \ + uint name##_offset_first_element_in_bytes; \ + uint name##_buffer_data_type_size + +#define TENSOR3D_PARAM_DECLARATION(name) \ + uint name##_stride_x; \ + uint name##_step_x; \ + uint name##_stride_y; \ + uint name##_step_y; \ + uint name##_stride_z; \ + uint name##_step_z; \ + uint name##_offset_first_element_in_bytes; \ + uint name##_buffer_data_type_size + +/** Structure to hold Vector information */ +struct Vector +{ + uint current_offset; /**< Current offset of vector */ + uint offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + uint stride_x; /**< Stride of the image in X dimension (in bytes) */ +}; + +/** Structure to hold Image information */ +struct Image +{ + uint current_offset; /**< Current offset of image */ + uint offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + uint stride_x; /**< Stride of the image in X dimension (in bytes) */ + uint stride_y; /**< Stride of the image in Y dimension (in bytes) */ +}; + +/** Structure to hold 3D tensor information */ +struct Tensor3D +{ + uint current_offset; /**< Current offset of tensor */ + uint offset_first_element_in_bytes; /**< The offset of the first element in the source image */ + uint stride_x; /**< Stride of the image in X dimension (in bytes) */ + uint stride_y; /**< Stride of the image in Y dimension (in bytes) */ + uint stride_z; /**< Stride of the image in Z dimension (in bytes) */ +}; + +///////////////////////////////////////////////////////////// +// TODO: old to be removed + +#define CONVERT_TO_VECTOR_STRUCT(name) \ + update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) + +#define CONVERT_TO_VECTOR_STRUCT_FP16(name) \ + update_vector_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) + +#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ + update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0)) + +#define CONVERT_TO_VECTOR_STRUCT_NO_STEP_FP16(name) \ + update_vector_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0)) + +#define CONVERT_TO_IMAGE_STRUCT(name) \ + update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) + +#define CONVERT_TO_IMAGE_STRUCT_FP16(name) \ + update_image_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) + +#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ + 
update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0)) + +#define CONVERT_TO_IMAGE_STRUCT_NO_STEP_FP16(name) \ + update_image_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0)) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ + update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP_FP16(name) \ + update_image_from_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) + +#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_FP16(name) \ + update_image_from_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) + +#define CONVERT_TO_TENSOR3D_STRUCT(name) \ + update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ + name##_stride_z, name##_step_z) + +#define CONVERT_TO_TENSOR3D_STRUCT_FP16(name) \ + update_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ + name##_stride_z, name##_step_z) + +#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ + update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0)) + +#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP_FP16(name) \ + update_tensor3D_workitem_offset_fp16(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0)) + +// FIXME: Redesign the macros if different data types are supported. +#define LOAD4(name, offset) \ + name##_ptr[offset] + +#define STORE4(name, offset, value) \ + name##_ptr[offset] = value + +// Load 1 element, which size is determined by ssbo type. +#define LOAD1(r, name, offset) \ + r = name##_ptr[offset] + +#define STORE1(name, offset, value) \ + name##_ptr[offset] = value + +#define LOAD2(r, name, offset) \ + LOAD1(r[0], name, offset); \ + LOAD1(r[1], name, (offset) + uint(1)) + +#define STORE2(name, offset, value) \ + name##_ptr[offset] = value[0]; \ + name##_ptr[(offset) + uint(1)] = value[1] + +#define LOAD3(r, name, offset) \ + LOAD1(r[0], name, offset); \ + LOAD1(r[1], name, (offset) + uint(1)); \ + LOAD1(r[2], name, (offset) + uint(2)) + +#define CURRENT_OFFSET(name) \ + name.current_offset + +/** Wrap vector information into an Vector structure, and make the offset to be this workitem's position. 
+ * + * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector + * @param[in] stride_x Stride of the vector in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) + * + * @return An vector object + */ +Vector update_vector_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x) +{ + Vector vector; + vector.offset_first_element_in_bytes = offset_first_element_in_bytes; + vector.stride_x = stride_x; + vector.current_offset = (vector.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x) >> 2; + + return vector; +} + +/** Wrap vector information into an Vector structure, and make the offset to be this workitem's position. + * + * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector + * @param[in] stride_x Stride of the vector in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) + * + * @return An vector object + */ +Vector update_vector_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x) +{ + Vector vector; + vector.offset_first_element_in_bytes = offset_first_element_in_bytes; + vector.stride_x = stride_x; + vector.current_offset = vector.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x; + + return vector; +} + +/** Wrap image information into an Image structure, and make the offset to be this workitem's position. + * + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) + * + * @return An image object + */ +Image update_image_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) +{ + Image img; + img.offset_first_element_in_bytes = offset_first_element_in_bytes; + img.stride_x = stride_x; + img.stride_y = stride_y; + img.current_offset = (img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y) >> 2; + + return img; +} + +/** Wrap image information into an Image structure, and make the offset to be this workitem's position. + * + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) + * + * @return An image object + */ +Image update_image_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) +{ + Image img; + img.offset_first_element_in_bytes = offset_first_element_in_bytes; + img.stride_x = stride_x; + img.stride_y = stride_y; + img.current_offset = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y; + + return img; +} + +/** Wrap 3D tensor information into an image structure, and make the offset to be this workitem's position. 
+ * + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] stride_z Stride of the image in Z dimension (in bytes) + * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) + * + * @return A 2D Image object + */ +Image update_image_from_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +{ + Image img; + img.offset_first_element_in_bytes = offset_first_element_in_bytes; + img.stride_x = stride_x; + img.stride_y = stride_y; + img.current_offset = (img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z) >> 2; + + return img; +} + +/** Wrap 3D tensor information into an image structure, and make the offset to be this workitem's position. + * + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] stride_z Stride of the image in Z dimension (in bytes) + * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) + * + * @return A 2D Image object + */ +Image update_image_from_tensor3D_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +{ + Image img; + img.offset_first_element_in_bytes = offset_first_element_in_bytes; + img.stride_x = stride_x; + img.stride_y = stride_y; + img.current_offset = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z; + + return img; +} + +/** Wrap 3D tensor information into an tensor structure, and make the offset to be this workitem's position. 
+ * + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] stride_z Stride of the image in Z dimension (in bytes) + * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) + * + * @return A 3D tensor object + */ +Tensor3D update_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +{ + Tensor3D tensor; + tensor.offset_first_element_in_bytes = offset_first_element_in_bytes; + tensor.stride_x = stride_x; + tensor.stride_y = stride_y; + tensor.stride_z = stride_z; + tensor.current_offset = (tensor.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z) >> 2; + + return tensor; +} + +/** Wrap 3D tensor information into an tensor structure, and make the offset to be this workitem's position. + * + * @param[in] offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] stride_x Stride of the image in X dimension (in bytes) + * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] stride_y Stride of the image in Y dimension (in bytes) + * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] stride_z Stride of the image in Z dimension (in bytes) + * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes) + * + * @return A 3D tensor object + */ +Tensor3D update_tensor3D_workitem_offset_fp16(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +{ + Tensor3D tensor; + tensor.offset_first_element_in_bytes = offset_first_element_in_bytes; + tensor.stride_x = stride_x; + tensor.stride_y = stride_y; + tensor.stride_z = stride_z; + tensor.current_offset = tensor.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z; + + return tensor; +} + +/** Get the pointer position of a Vector + * + * @param[in] vec Pointer to the starting position of the buffer + * @param[in] x Relative X position + */ +uint vector_offset(Vector vec, int x) +{ + return CONVERT(CONVERT(vec.current_offset << 2, int) + x * CONVERT(vec.stride_x, int), uint) >> 2; +} + +/** Get the pointer position of a Vector + * + * @param[in] vec Pointer to the starting position of the buffer + * @param[in] x Relative X position + */ +uint vector_offset_fp16(Vector vec, int x) +{ + return CONVERT(CONVERT(vec.current_offset, int) + x * CONVERT(vec.stride_x, int), uint); +} + +/** Get the pointer position of a Image + * + * @param[in] img Pointer to the starting position of the buffer + * @param[in] x Relative X position + * @param[in] y Relative Y position + */ +uint offset(Image img, int x, int y) +{ + return CONVERT(CONVERT(img.current_offset << 2, int) + x * CONVERT(img.stride_x, int) + y * CONVERT(img.stride_y, int), uint) >> 2; +} + +/** Get the pointer position of a Image + * + * @param[in] img Pointer to the starting position of the buffer + * @param[in] 
x Relative X position + * @param[in] y Relative Y position + */ +uint offset_fp16(Image img, int x, int y) +{ + return CONVERT(CONVERT(img.current_offset, int) + x * CONVERT(img.stride_x, int) + y * CONVERT(img.stride_y, int), uint); +} + +/** Get the pointer position of a Tensor3D + * + * @param[in] tensor Pointer to the starting postion of the buffer + * @param[in] x Relative X position + * @param[in] y Relative Y position + * @param[in] z Relative Z position + */ +uint tensor3D_offset(Tensor3D tensor, int x, int y, int z) +{ + return CONVERT(CONVERT(tensor.current_offset << 2, int) + x * CONVERT(tensor.stride_x, int) + y * CONVERT(tensor.stride_y, int) + z * CONVERT(tensor.stride_z, int), uint) >> 2; +} + +/** Get the pointer position of a Tensor3D + * + * @param[in] tensor Pointer to the starting postion of the buffer + * @param[in] x Relative X position + * @param[in] y Relative Y position + * @param[in] z Relative Z position + */ +uint tensor3D_offset_fp16(Tensor3D tensor, int x, int y, int z) +{ + return CONVERT(CONVERT(tensor.current_offset, int) + x * CONVERT(tensor.stride_x, int) + y * CONVERT(tensor.stride_y, int) + z * CONVERT(tensor.stride_z, int), uint); +} + +///////////////////////////////////////////////////////////// +// new one + +#define GC_CONVERT_TO_VECTOR_STRUCT(name) \ + gc_update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x) + +#define GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \ + gc_update_vector_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0)) + +#define GC_CONVERT_TO_IMAGE_STRUCT(name) \ + gc_update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y) + +#define GC_CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \ + gc_update_image_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0)) + +#define GC_CONVERT_TO_TENSOR3D_STRUCT(name) \ + gc_update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \ + name##_stride_z, name##_step_z) + +#define GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \ + gc_update_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, uint(0)) + +#define GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \ + gc_update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z) + +#define GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \ + gc_update_image_from_tensor3D_workitem_offset(name##_offset_first_element_in_bytes, name##_stride_x, uint(0), name##_stride_y, uint(0), name##_stride_z, name##_step_z) + +Vector gc_update_vector_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x) +{ + Vector vector; + vector.offset_first_element_in_bytes = offset_first_element_in_bytes; + vector.stride_x = stride_x; + vector.current_offset = vector.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x; + + return vector; +} + +Image gc_update_image_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y) +{ + Image img; + img.offset_first_element_in_bytes = offset_first_element_in_bytes; + img.stride_x = stride_x; + img.stride_y = stride_y; + img.current_offset = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + 
gl_GlobalInvocationID.y * step_y; + + return img; +} + +Tensor3D gc_update_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +{ + Tensor3D tensor; + tensor.offset_first_element_in_bytes = offset_first_element_in_bytes; + tensor.stride_x = stride_x; + tensor.stride_y = stride_y; + tensor.stride_z = stride_z; + tensor.current_offset = tensor.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z; + + return tensor; +} + +Image gc_update_image_from_tensor3D_workitem_offset(uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z) +{ + Image img; + img.offset_first_element_in_bytes = offset_first_element_in_bytes; + img.stride_x = stride_x; + img.stride_y = stride_y; + img.current_offset = img.offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z; + + return img; +} + +#define GC_CURRENT_OFFSET(name) \ + name.current_offset + +uint gc_vector_offset(Vector vec, int x) +{ + return CONVERT(CONVERT(vec.current_offset, int) + x * CONVERT(vec.stride_x, int), uint); +} + +uint gc_image_offset(Image img, int x, int y) +{ + return CONVERT(CONVERT(img.current_offset, int) + x * CONVERT(img.stride_x, int) + y * CONVERT(img.stride_y, int), uint); +} + +uint gc_tensor3D_offset(Tensor3D tensor, int x, int y, int z) +{ + return CONVERT(CONVERT(tensor.current_offset, int) + x * CONVERT(tensor.stride_x, int) + y * CONVERT(tensor.stride_y, int) + z * CONVERT(tensor.stride_z, int), uint); +} + +// load/store number of element depends on buffer type +#define GC_LOAD1(r, name, offset) \ + r = name##_ptr[offset] + +#define GC_LOAD2(r, name, offset) \ + GC_LOAD1(r[0], name, offset); \ + GC_LOAD1(r[1], name, (offset) + uint(1)) + +#define GC_LOAD3(r, name, offset) \ + GC_LOAD1(r[0], name, offset); \ + GC_LOAD1(r[1], name, (offset) + uint(1)); \ + GC_LOAD1(r[2], name, (offset) + uint(2)) + +#define GC_STORE1(value, name, offset) \ + name##_ptr[offset] = value + +#define GC_STORE2(value, name, offset) \ + GC_STORE1(value[0], name, offset); \ + GC_STORE1(value[1], name, (offset) + uint(1)) + +#define GC_STORE3(value, name, offset) \ + GC_STORE1(value[0], name, offset); \ + GC_STORE1(value[1], name, (offset) + uint(1)); \ + GC_STORE1(value[2], name, (offset) + uint(2)) + +// has to manually expand them since not supported by compiler +#define GC_LOAD1_1D_OFFSET(r, name, x) \ + GC_LOAD1(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size) + +#define GC_LOAD1_2D_OFFSET(r, name, x, y) \ + GC_LOAD1(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size) + +#define GC_LOAD1_3D_OFFSET(r, name, x, y, z) \ + GC_LOAD1(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size) + +#define GC_STORE1_1D_OFFSET(value, name, x) \ + GC_STORE1(value, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size) + +#define GC_STORE1_2D_OFFSET(value, name, x, y) \ + GC_STORE1(value, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size) + +#define GC_STORE1_3D_OFFSET(value, name, x, y, z) \ + GC_STORE1(value, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size) + +#define GC_LOAD2_1D_OFFSET(r, name, x) \ + GC_LOAD2(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size) + 
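+// For illustration, an access such as GC_LOAD1_2D_OFFSET(val, src, 0, 1) expands to
+//     val = src_ptr[gc_image_offset(src, int(0), int(1)) >> src_buffer_data_type_size];
+// i.e. a byte offset is computed from the work item's current position plus the relative
+// (x, y) coordinates, then shifted by the per-buffer data type size from shader_params to
+// obtain an index into the SSBO array.
+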
+#define GC_LOAD2_2D_OFFSET(r, name, x, y) \ + GC_LOAD2(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size) + +#define GC_LOAD2_3D_OFFSET(r, name, x, y, z) \ + GC_LOAD2(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size) + +#define GC_STORE2_1D_OFFSET(value, name, x) \ + GC_STORE2(value, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size) + +#define GC_STORE2_2D_OFFSET(value, name, x, y) \ + GC_STORE2(value, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size) + +#define GC_STORE2_3D_OFFSET(value, name, x, y, z) \ + GC_STORE2(value, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size) + +#define GC_LOAD3_1D_OFFSET(r, name, x) \ + GC_LOAD3(r, name, gc_vector_offset(name, int(x)) >> name##_buffer_data_type_size) + +#define GC_LOAD3_2D_OFFSET(r, name, x, y) \ + GC_LOAD3(r, name, gc_image_offset(name, int(x), int(y)) >> name##_buffer_data_type_size) + +#define GC_LOAD3_3D_OFFSET(r, name, x, y, z) \ + GC_LOAD3(r, name, gc_tensor3D_offset(name, int(x), int(y), int(z)) >> name##_buffer_data_type_size) + +///////////////////////////////////////////////////////////// + +#endif // _HELPER_H diff --git a/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs new file mode 100755 index 0000000000..5699340c14 --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; + +#include "helpers.h" + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(src1); + TENSOR3D_PARAM_DECLARATION(src2); + TENSOR3D_PARAM_DECLARATION(dst); +}; + +BUFFER_DECLARATION(src1, 1, float, readonly); +BUFFER_DECLARATION(src2, 2, float, readonly); +BUFFER_DECLARATION(dst, 3, float, writeonly); + +#ifdef CROSS_MAP +/** Apply cross map normalization. 
+ * + * @note Alpha parameter / norm_size should be given as a preprocessor argument using "#define COEFF x" + * @note BETA parameter in the normalization equation should be given as a preprocessor argument using "#define BETA x" + * @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x" + * @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x" + * + * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F32 + * @param[in] src1_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src1_step_z src1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] src2_ptr Pointer to the second source tensor. Supported data types: Same as @p src1_ptr + * @param[in] src2_stride_x Stride of the second source tensor in X dimension (in bytes) + * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src2_stride_y Stride of the second source tensor in Y dimension (in bytes) + * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src2_stride_z Stride of the second source tensor in Z dimension (in bytes) + * @param[in] src2_step_z src2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes The offset of the second element in the second source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: Same as @p src1_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +void main(void) +{ + Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1); + Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + float acc = 0.0; + + int num_of_slices = int(gl_NumWorkGroups.z * gl_WorkGroupSize.z); + int current_slice = int(gl_GlobalInvocationID.z); + + int left_slice = max(current_slice - int(RADIUS), int(0)); + int right_slice = min(current_slice + int(RADIUS), int(num_of_slices - 1)); + + for(int i = left_slice; i <= right_slice; i++) + { + acc += src2_ptr[tensor3D_offset(src2, 0, 0, i - current_slice)]; + } + + float normalized = pow(float(KAPPA) + float(COEFF) * acc, float(BETA)); + + float normalized_pixel = (src1_ptr[src1.current_offset]) / normalized; + + dst_ptr[dst.current_offset] = normalized_pixel; +} + +#elif defined(IN_MAP_1D) +/** Apply in map normalization. + * + * @note Alpha parameter / norm_size should be given as a preprocessor argument using "#define COEFF x" + * @note BETA parameter in the normalization equation should be given as a preprocessor argument using "#define BETA x" + * @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x" + * @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x" + * + * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F32 + * @param[in] src1_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src1_step_z src1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] src2_ptr Pointer to the second source tensor. 
Supported data types: Same as @p src1_ptr + * @param[in] src2_stride_x Stride of the second source tensor in X dimension (in bytes) + * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src2_stride_y Stride of the second source tensor in Y dimension (in bytes) + * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src2_stride_z Stride of the second source tensor in Z dimension (in bytes) + * @param[in] src2_step_z src2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes The offset of the second element in the second source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: Same as @p src1_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +void main(void) +{ + Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1); + Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + float acc = 0.0; + + int num_of_items_x = int(gl_NumWorkGroups.x * gl_WorkGroupSize.x); + int current_pos = int(gl_GlobalInvocationID.x); + + int left_pos = max(current_pos - int(RADIUS), int(0)); + int right_pos = min(current_pos + int(RADIUS), int(num_of_items_x + -1)); + + for(int i = left_pos; i <= right_pos; i++) + { + acc += src2_ptr[tensor3D_offset(src2, i - current_pos, 0, 0)]; + } + + float normalized = pow(float(KAPPA) + float(COEFF) * acc, float(BETA)); + + float normalized_pixel = (src1_ptr[src1.current_offset]) / normalized; + + dst_ptr[dst.current_offset] = normalized_pixel; +} +#endif /*CROSS_MAP*/ diff --git a/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs b/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs new file mode 100644 index 0000000000..031687af0c --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/pixelwise_mul_float.cs @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
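For reference, the two normalization kernels above evaluate the same expression per element and differ only in the direction of the accumulation (across feature maps for CROSS_MAP, along the x axis for IN_MAP_1D). With COEFF standing for alpha / norm_size, as the @note puts it, the computation is, in LaTeX form:

    normalized(p) = \left( KAPPA + COEFF \cdot \sum_{i=-RADIUS}^{+RADIUS} src2(p + i) \right)^{BETA}
    dst(p) = src1(p) / normalized(p)

where the summation index is clamped to the valid slice (or element) range, matching the max()/min() clamping of left_slice/right_slice and left_pos/right_pos in the loops above.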
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; +#include "helpers.h" + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(src1); + TENSOR3D_PARAM_DECLARATION(src2); + TENSOR3D_PARAM_DECLARATION(dst); +}; + +BUFFER_DECLARATION(src1, 1, float, readonly); +BUFFER_DECLARATION(src2, 2, float, readonly); +BUFFER_DECLARATION(dst, 3, float, writeonly); + +/** Performs a pixelwise multiplication with float scale of either integer or float inputs. + * + * @param[in] src1_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_stride_z Stride of the source image in Y dimension (in bytes) + * @param[in] src1_step_z src1_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src2_ptr Pointer to the source image. Supported data types: Same as @p src1_ptr + * @param[in] src2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src2_stride_z Stride of the source image in Y dimension (in bytes) + * @param[in] src2_step_z src2_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: Same as @p src1_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] scale Float scaling factor. 
Supported data types: F32 + */ +void main() +{ + // Get pixels pointer + Tensor3D src1 = CONVERT_TO_TENSOR3D_STRUCT(src1); + Tensor3D src2 = CONVERT_TO_TENSOR3D_STRUCT(src2); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + dst_ptr[dst.current_offset] = (src1_ptr[src1.current_offset] * src2_ptr[src2.current_offset] * float(SCALE)); +} diff --git a/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs new file mode 100644 index 0000000000..1e0fee4688 --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/pooling_layer.cs @@ -0,0 +1,1444 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; +#include "helpers.h" + +#if defined(DATA_TYPE_FP32) + +float calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int); +float calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int); + +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, writeonly); + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(dst); +}; + +#define LOAD8(r, name, offset) \ + r.x = LOAD4(name, offset); \ + r.y = LOAD4(name, offset + uint(1)) + +#define LOAD16(r, name, offset) \ + r.x = LOAD4(name, offset); \ + r.y = LOAD4(name, offset + uint(1)); \ + r.z = LOAD4(name, offset + uint(2)); \ + r.w = LOAD4(name, offset + uint(3)) + +#define STORE16(name, offset, r) \ + STORE4(name, offset, r.x); \ + STORE4(name, offset + uint(1), r.y); \ + STORE4(name, offset + uint(2), r.z); \ + STORE4(name, offset + uint(3), r.w) + +#if defined(POOL_AVG) || defined(POOL_L2) +#define POOL_OP(res, a, b) ((res) = (a) + (b)) +#define POOL_OP_float(res, a, b) (res = a + b) +#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b)) +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define POOL_OP(res, a, b) \ + (res) = (a); \ + if(isnan(a.x) || (a.x < b.x)) \ + { \ + res.x = b.x; \ + } \ + if(isnan(a.y) || (a.y < b.y)) \ + { \ + res.y = b.y; \ + } \ + if(isnan(a.z) || (a.z < b.z)) \ + { \ + res.z = b.z; \ + } \ + if(isnan(a.w) || (a.w < b.w)) \ + { \ + res.w = b.w; \ + } +#define POOL_OP_float(res, a, b) \ + (res) = (a); \ + if(isnan(a) || (a < b)) \ + { \ + res = b; \ + } +#define POOL_OP_vec2(res, a, b) \ + (res) = (a); \ + if(isnan(a.x) 
|| (a.x < b.x)) \ + { \ + res.x = b.x; \ + } \ + if(isnan(a.y) || (a.y < b.y)) \ + { \ + res.y = b.y; \ + } +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) +#define POW2_OP(x, vec_size) ((x) * (x)) +#else /* defined(POOL_L2) */ +#define POW2_OP(x, vec_size) (x) +#endif /* defined(POOL_L2) */ + +#define DIV_OP(x, y) (x * (1.f / y)) +#define SQRT_OP(x) sqrt((x)) + +#if defined(POOL_SIZE) +// Set the initial value for the pooling operation accordingly with the data type +#if defined(POOL_AVG) || defined(POOL_L2) +#define INITIAL_VALUE 0.0f +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define INITIAL_VALUE -3.402823466385289e+38 +#endif // POOL_AVG +#endif //POOL_SIZE + +#define POOLING3x3_STRIDE1(res, input, output) \ + vec4 data00; \ + vec2 data01; \ + vec4 data10; \ + vec2 data11; \ + vec4 data20; \ + vec2 data21; \ + LOAD16(data00, input, tensor3D_offset(input, 0, 0, 0)); \ + LOAD8(data01, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \ + LOAD16(data10, input, tensor3D_offset(input, 0, 1, 0)); \ + LOAD8(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \ + LOAD16(data20, input, tensor3D_offset(input, 0, 2, 0)); \ + LOAD8(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \ + data00 = POW2_OP(data00, 4); \ + data01 = POW2_OP(data01, 2); \ + data10 = POW2_OP(data10, 4); \ + data11 = POW2_OP(data11, 2); \ + data20 = POW2_OP(data20, 4); \ + data21 = POW2_OP(data21, 2); \ + \ + vec4 values000; \ + vec4 values001; \ + vec4 values010; \ + vec4 values100; \ + vec4 values101; \ + vec4 values11; \ + vec4 values200; \ + vec4 values201; \ + vec4 values21; \ + values000.xyzw = data00.xyzy; \ + values001.xyzw = data00.zwzw; \ + values010.x = data01.x; \ + values010.y = data00.w; \ + values010.zw = data01.xy; \ + values100.xyzw = data10.xyzy; \ + values101.xyzw = data10.zwzw; \ + values11.x = data11.x; \ + values11.y = data10.w; \ + values11.zw = data11.xy; \ + values200.xyzw = data20.xyzy; \ + values201.xyzw = data20.zwzw; \ + values21.x = data21.x; \ + values21.y = data20.w; \ + values21.zw = data21.xy; \ + POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ + POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ + POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) + +#define POOLING3x3_STRIDE2(res, input, output) \ + vec4 data000; \ + vec4 data001; \ + float data010; \ + vec4 data100; \ + vec4 data101; \ + float data11; \ + vec4 data200; \ + vec4 data201; \ + float data21; \ + LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0)); \ + LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \ + data010 = LOAD4(input, tensor3D_offset(input, 0, 0, 0) + uint(8)); \ + LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0)); \ + LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \ + data11 = LOAD4(input, tensor3D_offset(input, 0, 1, 0) + uint(8)); \ + LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0)); \ + LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \ + data21 = LOAD4(input, tensor3D_offset(input, 0, 2, 0) + uint(8)); \ + data000 = POW2_OP(data000, 4); \ + data001 = POW2_OP(data001, 4); \ + data010 = 
POW2_OP(data010, 1); \ + data100 = POW2_OP(data100, 4); \ + data101 = POW2_OP(data101, 4); \ + data11 = POW2_OP(data11, 1); \ + data200 = POW2_OP(data200, 4); \ + data201 = POW2_OP(data201, 4); \ + data21 = POW2_OP(data21, 1); \ + \ + vec4 values000; \ + vec4 values001; \ + vec4 values010; \ + vec4 values100; \ + vec4 values101; \ + vec4 values11; \ + vec4 values200; \ + vec4 values201; \ + vec4 values21; \ + values000.xyzw = data000.xyzz; \ + values001.xyzw = vec4(data000.w, data001.xxy); \ + values010.xyzw = vec4(data001.zzw, data010); \ + values100.xyzw = data100.xyzz; \ + values101.xyzw = vec4(data100.w, data101.xxy); \ + values11.xyzw = vec4(data101.zzw, data11); \ + values200.xyzw = data200.xyzz; \ + values201.xyzw = vec4(data200.w, data201.xxy); \ + values21.xyzw = vec4(data201.zzw, data21); \ + POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ + POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ + POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) + +#define POOLING3x3_STRIDE3(res, input, output) \ + vec4 data000; \ + vec4 data001; \ + vec4 data010; \ + vec4 data100; \ + vec4 data101; \ + vec4 data11; \ + vec4 data200; \ + vec4 data201; \ + vec4 data21; \ + LOAD16(data000, input, tensor3D_offset(input, 0, 0, 0)); \ + LOAD16(data001, input, tensor3D_offset(input, 0, 0, 0) + uint(4)); \ + LOAD16(data010, input, tensor3D_offset(input, 0, 0, 0) + uint(8)); \ + LOAD16(data100, input, tensor3D_offset(input, 0, 1, 0)); \ + LOAD16(data101, input, tensor3D_offset(input, 0, 1, 0) + uint(4)); \ + LOAD16(data11, input, tensor3D_offset(input, 0, 1, 0) + uint(8)); \ + LOAD16(data200, input, tensor3D_offset(input, 0, 2, 0)); \ + LOAD16(data201, input, tensor3D_offset(input, 0, 2, 0) + uint(4)); \ + LOAD16(data21, input, tensor3D_offset(input, 0, 2, 0) + uint(8)); \ + data000 = POW2_OP(data000, 4); \ + data001 = POW2_OP(data001, 4); \ + data010 = POW2_OP(data010, 4); \ + data100 = POW2_OP(data100, 4); \ + data101 = POW2_OP(data101, 4); \ + data11 = POW2_OP(data11, 4); \ + data200 = POW2_OP(data200, 4); \ + data201 = POW2_OP(data201, 4); \ + data21 = POW2_OP(data21, 4); \ + \ + POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \ + POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \ + POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \ + POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \ + POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \ + POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \ + POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y data010.xw)) + +float calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x; + int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y; + int end_x = int(min(start_x + pool_size, upper_bound_w)); + int end_y = int(min(start_y + pool_size, upper_bound_h)); + + float data_max; + data_max = LOAD4(src, tensor3D_offset(src, 0, 0, 0)); + + for(int i = 0; (start_x + i) < end_x; 
++i) + { + for(int j = 0; (start_y + j) < end_y; ++j) + { + float data = LOAD4(src, tensor3D_offset(src, i, j, 0)); + POOL_OP_float(data_max, data_max, data); + } + } + + return data_max; +} + +float calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x = int(gl_GlobalInvocationID.x) * stride_x - pad_x; + int start_y = int(gl_GlobalInvocationID.y) * stride_y - pad_y; + int end_x = int(min(start_x + pool_size, upper_bound_w)); + int end_y = int(min(start_y + pool_size, upper_bound_h)); + + float data_total = 0.0f; + for(int i = 0; (start_x + i) < end_x; i++) + { + for(int j = 0; (start_y + j) < end_y; ++j) + { + float data = LOAD4(src, tensor3D_offset(src, i, j, 0)); + if(isnan(data)) + { + data = 0.0f; + } +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data = POW2_OP(data, 1); +#endif /* defined(POOL_L2) */ + data_total = data_total + data; + } + } + + return data_total / float((end_y - start_y) * (end_x - start_x)); +} + +#ifdef POOLING_LAYER_2 +/** Performs a pooling function of pool size equal to 2. + * + * @note Supported data types are F32; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
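To summarise the two helpers above: the pooling window for output coordinate (x_o, y_o) starts at (x_o * STRIDE_X - PAD_X, y_o * STRIDE_Y - PAD_Y) and is clamped to the padded bounds. calculate_max seeds the running maximum with the element at offset (0, 0, 0) and relies on the NaN-aware POOL_OP_float so a NaN running value is replaced rather than propagated, while calculate_avg zeroes NaN samples, squares each sample when POOL_L2 is defined, and divides by the clamped window area:

    avg = \frac{1}{(end_x - start_x)(end_y - start_y)} \sum_{y=start_y}^{end_y-1} \sum_{x=start_x}^{end_x-1} in(x, y)

with end_x = min(start_x + pool_size, MAX_WIDTH) and end_y = min(start_y + pool_size, MAX_HEIGHT); for L2 pooling the caller then applies SQRT_OP to the result.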
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + //Load and calculate data + float res; +#if defined(POOL_AVG) || defined(POOL_L2) + res = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#else /*POOL_AVG*/ + res = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + // Store result + STORE4(dst, CURRENT_OFFSET(dst), res); +} + +#elif defined(POOLING_LAYER_3) +/** Performs a pooling function of pool size equal to 3. + * + * @note Supported data types are F32; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
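All of the pooling variants in this file are selected and parameterised purely through preprocessor definitions; only the tensor strides and offsets arrive at run time through the shader_params uniform block. A hypothetical preamble that the host side might prepend before compiling this shader (the numeric values are illustrative only, not taken from this patch) would look like:

    // Illustrative compile-time configuration for 3x3 average pooling on FP32 data
    #define DATA_TYPE_FP32
    #define POOLING_LAYER_3
    #define POOL_AVG
    #define MAX_WIDTH 114   // maximum accessible x index bound (width + pad)
    #define MAX_HEIGHT 114
    #define STRIDE_X 2
    #define STRIDE_Y 2
    #define PAD_X 1
    #define PAD_Y 1
    #define LOCAL_SIZE_X 1
    #define LOCAL_SIZE_Y 1
    #define LOCAL_SIZE_Z 1

Omitting POOL_AVG (and POOL_L2) falls through to the max-pooling path, as the @note above states.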
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + //Load and calculate data + float res; +#if defined(POOL_AVG) || defined(POOL_L2) + res = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#else /*POOL_AVG*/ + res = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + // Store result + STORE4(dst, CURRENT_OFFSET(dst), res); +} + +#elif defined(POOLING_LAYER_3_OPTIMIZED) +/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3 + * + * @note Supported data types are F32; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + vec4 res; + // Perform pooling 3x3 for 4 output elements +#if STRIDE_X == 1 + POOLING3x3_STRIDE1(res, src, dst); +#elif STRIDE_X == 2 + POOLING3x3_STRIDE2(res, src, dst); +#elif STRIDE_X == 3 + POOLING3x3_STRIDE3(res, src, dst); +#endif /*STRIDE_X == 1*/ + + // Divide by pool region in case of average pooling +#if defined(POOL_AVG) || defined(POOL_L2) + ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X)); + int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y; + ivec4 end_x = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH))); + int end_y = min((start_y + 3), MAX_HEIGHT); + res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x))); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + STORE16(dst, CURRENT_OFFSET(dst), res); +} + +#elif defined(POOLING_LAYER_7) +/** Performs a pooling function of pool size equal to 7. + * + * @note Supported data types are F32; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
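Because the optimised 3x3 path above writes four horizontally adjacent outputs per invocation, the average/L2 normalisation cannot use a single divisor: each lane k = 0..3 gets its own clamped window width, which is what the ivec4 arithmetic in the POOL_AVG block computes:

    start_x[k] = (4 * gl_GlobalInvocationID.x + k) * STRIDE_X - PAD_X
    end_x[k]   = min(start_x[k] + 3, MAX_WIDTH)
    res[k]    *= 1 / ((end_y - start_y) * (end_x[k] - start_x[k]))

The y extent is shared by all four lanes, since they sit on the same output row.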
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + //Load and calculate data + float res; +#if defined(POOL_AVG) || defined(POOL_L2) + res = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#else /*POOL_AVG*/ + res = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + // Store result + STORE4(dst, CURRENT_OFFSET(dst), res); +} + +#elif defined(POOLING_LAYER_N) +/** Performs a pooling function of pool size equal to N + * + * @note Supported data types are F32; + * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
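The generic POOL_SIZE kernel that follows accumulates eight input elements per inner-loop iteration into two vec4 running values (vdata0/vdata1), keeps a scalar accumulator for the leftover columns, and only reduces at the end:

    POOL_OP(reduce4, vdata0.xyzw, vdata1.xyzw);    // vec4 -> vec4
    POOL_OP_vec2(reduce2, reduce4.xy, reduce4.zw); // vec4 -> vec2
    POOL_OP_float(res, reduce2.x, reduce2.y);      // vec2 -> float
    POOL_OP_float(res, res, sdata);                // fold in the scalar leftover

Under POOL_AVG / POOL_L2 the POOL_OP family is plain addition and res is divided by the window area afterwards; otherwise it is the NaN-aware maximum defined earlier.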
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + vec4 vdata0; + vdata0 = vec4(INITIAL_VALUE); + vec4 vdata1; + vdata1 = vec4(INITIAL_VALUE); + float sdata; + sdata = float(INITIAL_VALUE); + + for(int y = 0; y < int(POOL_SIZE); y++) + { + int x = 0; + for(; x <= (int(POOL_SIZE) - 8); x += 8) + { + vec4 data2; + vec4 data3; + LOAD16(data2, src, tensor3D_offset(src, x, y, 0)); + LOAD16(data3, src, tensor3D_offset(src, x, y, 0) + uint(4)); + +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data2 *= data2; + data3 *= data3; +#endif /* defined(POOL_L2) */ + + POOL_OP(vdata0, vdata0, data2); + POOL_OP(vdata1, vdata1, data3); + } + + // Leftover + for(; x < int(POOL_SIZE); ++x) + { + float data4 = LOAD4(src, tensor3D_offset(src, x, y, 0)); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data4 *= data4; +#endif /* defined(POOL_L2) */ + POOL_OP_float(sdata, sdata, data4); + } + } + + //Reduce result + vec4 reduce4; + POOL_OP(reduce4, vdata0.xyzw, vdata1.xyzw); + vec2 reduce2; + POOL_OP_vec2(reduce2, reduce4.xy, reduce4.zw); + float res; + POOL_OP_float(res, reduce2.x, reduce2.y); + POOL_OP_float(res, res, sdata); + +#if defined(POOL_AVG) || defined(POOL_L2) + { + // Divide by pool region in case of average pooling + int start_x = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X; + int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y; + int end_x = int(min(STRIDE_X + POOL_SIZE, MAX_WIDTH)); + int end_y = int(min(STRIDE_Y + POOL_SIZE, MAX_HEIGHT)); + float res1 = float((end_y - start_y) * (end_x - start_x)); + res = DIV_OP(res, res1); + } +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + // Store result + STORE4(dst, CURRENT_OFFSET(dst), res); +} +#endif /* POOLING_LAYER_2 */ + +#elif defined(DATA_TYPE_FP16) + +precision mediump float; + +vec2 load_and_unpack(Tensor3D, uint); +vec2 calculate_max(const int, Tensor3D, const int, const int, const int, const int, const int, const int); +vec2 calculate_avg(const int, Tensor3D, const int, const int, const int, const int, const int, const int); + +BUFFER_DECLARATION(src, 1, uint, readonly); +BUFFER_DECLARATION(dst, 2, uint, writeonly); + +layout(std140) uniform shader_params +{ + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(dst); +}; + +#define LOAD2_fp16(r, name, offset) \ + r.xy = load_and_unpack(name, offset) + +#define LOAD4_fp16(r, name, offset) \ + r.xy = load_and_unpack(name, offset); \ + r.zw = load_and_unpack(name, offset + uint(1)) + +#define STORE4_fp16(name, offset, r) \ + uint datastore1; \ + uint datastore2; \ + datastore1 = uint(packHalf2x16(r.xy)); \ + datastore2 = 
uint(packHalf2x16(r.zw)); \ + STORE1(name, offset << uint(1), datastore1); \ + STORE1(name, (offset << uint(1)) + uint(1), datastore2) + +#if defined(POOL_AVG) || defined(POOL_L2) +#define POOL_OP(res, a, b) ((res) = (a) + (b)) +#define POOL_OP_float(res, a, b) (res = a + b) +#define POOL_OP_vec2(res, a, b) ((res) = (a) + (b)) +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define POOL_OP(res, a, b) \ + (res) = (a); \ + if(isnan(a.x) || (a.x < b.x)) \ + { \ + res.x = b.x; \ + } \ + if(isnan(a.y) || (a.y < b.y)) \ + { \ + res.y = b.y; \ + } \ + if(isnan(a.z) || (a.z < b.z)) \ + { \ + res.z = b.z; \ + } \ + if(isnan(a.w) || (a.w < b.w)) \ + { \ + res.w = b.w; \ + } +#define POOL_OP_float(res, a, b) \ + (res) = (a); \ + if(isnan(a) || (a < b)) \ + { \ + res = b; \ + } +#define POOL_OP_vec2(res, a, b) \ + (res) = (a); \ + if(isnan(a.x) || (a.x < b.x)) \ + { \ + res.x = b.x; \ + } \ + if(isnan(a.y) || (a.y < b.y)) \ + { \ + res.y = b.y; \ + } +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) +#define POW2_OP(x, vec_size) ((x) * (x)) +#else /* defined(POOL_L2) */ +#define POW2_OP(x, vec_size) (x) +#endif /* defined(POOL_L2) */ + +#define DIV_OP(x, y) (x * (1.f / y)) +#define SQRT_OP(x) sqrt((x)) + +#if defined(POOL_SIZE) +// Set the initial value for the pooling operation accordingly with the data type +#if defined(POOL_AVG) || defined(POOL_L2) +#define INITIAL_VALUE 0.0f +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define INITIAL_VALUE -65504.0f +#endif //POOL_AVG +#endif //POOL_SIZE + +#define POOLING3x3_STRIDE1_fp16(res, input, output) \ + vec4 data00; \ + vec2 data01; \ + vec4 data10; \ + vec2 data11; \ + vec4 data20; \ + vec2 data21; \ + LOAD4_fp16(data00, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \ + LOAD2_fp16(data01, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \ + LOAD4_fp16(data10, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \ + LOAD2_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \ + LOAD4_fp16(data20, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \ + LOAD2_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \ + data00 = POW2_OP(data00, 4); \ + data01 = POW2_OP(data01, 2); \ + data10 = POW2_OP(data10, 4); \ + data11 = POW2_OP(data11, 2); \ + data20 = POW2_OP(data20, 4); \ + data21 = POW2_OP(data21, 2); \ + \ + vec4 values000; \ + vec4 values001; \ + vec4 values010; \ + vec4 values100; \ + vec4 values101; \ + vec4 values11; \ + vec4 values200; \ + vec4 values201; \ + vec4 values21; \ + values000.xyzw = data00.xyzy; \ + values001.xyzw = data00.zwzw; \ + values010.x = data01.x; \ + values010.y = data00.w; \ + values010.zw = data01.xy; \ + values100.xyzw = data10.xyzy; \ + values101.xyzw = data10.zwzw; \ + values11.x = data11.x; \ + values11.y = data10.w; \ + values11.zw = data11.xy; \ + values200.xyzw = data20.xyzy; \ + values201.xyzw = data20.zwzw; \ + values21.x = data21.x; \ + values21.y = data20.w; \ + values21.zw = data21.xy; \ + POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ + POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ + POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ + 
POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) + +#define POOLING3x3_STRIDE2_fp16(res, input, output) \ + vec4 data000; \ + vec4 data001; \ + float data010; \ + vec4 data100; \ + vec4 data101; \ + float data11; \ + vec4 data200; \ + vec4 data201; \ + float data21; \ + vec2 datamiddle0; \ + vec2 datamiddle1; \ + vec2 datamiddle2; \ + LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \ + LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \ + datamiddle0 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4)); \ + data010 = datamiddle0.x; \ + LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \ + LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \ + datamiddle1 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4)); \ + data11 = datamiddle1.x; \ + LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2))); \ + LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \ + datamiddle2 = load_and_unpack(input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4)); \ + data21 = datamiddle2.x; \ + data000 = POW2_OP(data000, 4); \ + data001 = POW2_OP(data001, 4); \ + data010 = POW2_OP(data010, 1); \ + data100 = POW2_OP(data100, 4); \ + data101 = POW2_OP(data101, 4); \ + data11 = POW2_OP(data11, 1); \ + data200 = POW2_OP(data200, 4); \ + data201 = POW2_OP(data201, 4); \ + data21 = POW2_OP(data21, 1); \ + \ + vec4 values000; \ + vec4 values001; \ + vec4 values010; \ + vec4 values100; \ + vec4 values101; \ + vec4 values11; \ + vec4 values200; \ + vec4 values201; \ + vec4 values21; \ + values000.xyzw = data000.xyzz; \ + values001.xyzw = vec4(data000.w, data001.xxy); \ + values010.xyzw = vec4(data001.zzw, data010); \ + values100.xyzw = data100.xyzz; \ + values101.xyzw = vec4(data100.w, data101.xxy); \ + values11.xyzw = vec4(data101.zzw, data11); \ + values200.xyzw = data200.xyzz; \ + values201.xyzw = vec4(data200.w, data201.xxy); \ + values21.xyzw = vec4(data201.zzw, data21); \ + POOL_OP(values000.xyzw, values000.xyzw, values100.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values101.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values11.xyzw); \ + POOL_OP(values000.xyzw, values000.xyzw, values200.xyzw); \ + POOL_OP(values001.xyzw, values001.xyzw, values201.xyzw); \ + POOL_OP(values010.xyzw, values010.xyzw, values21.xyzw); \ + POOL_OP(res.xyzw, vec4(values000.xw, values001.z, values010.y), vec4(values000.y, values001.xw, values010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(values000.z, values001.y, values010.xw)) + +#define POOLING3x3_STRIDE3_fp16(res, input, output) \ + vec4 data000; \ + vec4 data001; \ + vec4 data010; \ + vec4 data100; \ + vec4 data101; \ + vec4 data11; \ + vec4 data200; \ + vec4 data201; \ + vec4 data21; \ + LOAD4_fp16(data000, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2))); \ + LOAD4_fp16(data001, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(2)); \ + LOAD4_fp16(data010, input, (tensor3D_offset_fp16(input, 0, 0, 0) >> uint(2)) + uint(4)); \ + LOAD4_fp16(data100, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2))); \ + LOAD4_fp16(data101, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(2)); \ + LOAD4_fp16(data11, input, (tensor3D_offset_fp16(input, 0, 1, 0) >> uint(2)) + uint(4)); \ + LOAD4_fp16(data200, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> 
uint(2))); \ + LOAD4_fp16(data201, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(2)); \ + LOAD4_fp16(data21, input, (tensor3D_offset_fp16(input, 0, 2, 0) >> uint(2)) + uint(4)); \ + data000 = POW2_OP(data000, 4); \ + data001 = POW2_OP(data001, 4); \ + data010 = POW2_OP(data010, 4); \ + data100 = POW2_OP(data100, 4); \ + data101 = POW2_OP(data101, 4); \ + data11 = POW2_OP(data11, 4); \ + data200 = POW2_OP(data200, 4); \ + data201 = POW2_OP(data201, 4); \ + data21 = POW2_OP(data21, 4); \ + \ + POOL_OP(data000.xyzw, data000.xyzw, data100.xyzw); \ + POOL_OP(data001.xyzw, data001.xyzw, data101.xyzw); \ + POOL_OP(data010.xyzw, data010.xyzw, data11.xyzw); \ + POOL_OP(data000.xyzw, data000.xyzw, data200.xyzw); \ + POOL_OP(data001.xyzw, data001.xyzw, data201.xyzw); \ + POOL_OP(data010.xyzw, data010.xyzw, data21.xyzw); \ + POOL_OP(res.xyzw, vec4(data000.xw, data001.z, data010.y), vec4(data000.y, data001.xw, data010.z)); \ + POOL_OP(res.xyzw, res.xyzw, vec4(data000.z, data001.y data010.xw)) + +vec2 load_and_unpack(Tensor3D src, uint offset) +{ + uint packed_s; + vec2 s; + LOAD1(packed_s, src, offset); + + s = vec2(unpackHalf2x16(packed_s)); + return s; +} + +vec2 calculate_max(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x; + int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y; + int end_x1 = int(min(start_x1 + pool_size, upper_bound_w)); + int end_y1 = int(min(start_y1 + pool_size, upper_bound_h)); + + int start_x2 = start_x1 + stride_x; + int start_y2 = start_y1; + int end_x2 = int(min(start_x2 + pool_size, upper_bound_w)); + int end_y2 = int(min(start_y2 + pool_size, upper_bound_h)); + + //Initialize maximum + vec2 data_max = vec2(0); + + //Load and Set initial maximum1 + vec2 data_init1 = load_and_unpack(src, tensor3D_offset_fp16(src, 0, 0, 0) >> uint(2)); + data_max.x = data_init1.x; + + //Load and Set initial maximum2 + if(end_x1 < upper_bound_w) + { + if((stride_x % 2) == 0) + { + vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x, 0, 0) >> uint(2)); + data_max.y = data_init2.x; + } + else + { + vec2 data_init2 = load_and_unpack(src, tensor3D_offset_fp16(src, stride_x - 1, 0, 0) >> uint(2)); + data_max.y = data_init2.y; + } + } + + for(int i = 0; (start_y1 + i) < end_y1; i++) + for(int j = 0; (start_x1 + j) < end_x1; j = j + 2) + { + //Calculate maximum1 + if((start_x1 + j + 1) < end_x1) + { + vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2)); + float data_mr1; + POOL_OP_float(data_mr1, data1.x, data1.y); + POOL_OP_float(data_max.x, data_max.x, data_mr1); + } + else + { + vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2)); + POOL_OP_float(data_max.x, data_max.x, data1.x); + } + + //Calculate maximum2 + if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w) + { + if((stride_x % 2) == 0) + { + vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x), i, 0) >> uint(2))); + + if((start_x2 + j + 1) < end_x2) + { + float data_mr2; + POOL_OP_float(data_mr2, data2.x, data2.y); + POOL_OP_float(data_max.y, data_max.y, data_mr2); + } + else + { + POOL_OP_float(data_max.y, data_max.y, data2.x); + } + } + else + { + vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2))); + vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 
0) >> uint(2))); + if((start_x2 + j + 1) < end_x2) + { + float data_mr2; + POOL_OP_float(data_mr2, data3.x, data2.y); + POOL_OP_float(data_max.y, data_max.y, data_mr2); + } + else + { + POOL_OP_float(data_max.y, data_max.y, data2.y); + } + } + } + } + return data_max; +} + +vec2 calculate_avg(const int pool_size, Tensor3D src, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x1 = int(gl_GlobalInvocationID.x) * stride_x - pad_x; + int start_y1 = int(gl_GlobalInvocationID.y) * stride_y - pad_y; + int end_x1 = int(min(start_x1 + pool_size, upper_bound_w)); + int end_y1 = int(min(start_y1 + pool_size, upper_bound_h)); + + int start_x2 = start_x1 + stride_x; + int start_y2 = start_y1; + int end_x2 = int(min(start_x2 + pool_size, upper_bound_w)); + int end_y2 = int(min(start_y2 + pool_size, upper_bound_h)); + + //Initialize sum + float data_total1 = float(0); + float data_total2 = float(0); + for(int i = 0; (start_y1 + i) < end_y1; i++) + for(int j = 0; (start_x1 + j) < end_x1; j = j + 2) + { + vec2 data1 = load_and_unpack(src, tensor3D_offset_fp16(src, j, i, 0) >> uint(2)); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data1 = POW2_OP(data1, 2); +#endif /* defined(POOL_L2) */ + //Calculate sum1 + if((start_x1 + j + 1) < end_x1) + { + data_total1 = data_total1 + data1.x + data1.y; + } + else + { + data_total1 = data_total1 + data1.x; + } + + //Calculate sum2 + if((start_x2 + j) < end_x2 && end_x1 < upper_bound_w) + { + if((stride_x % 2) == 0) + { + vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2))); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data2 = POW2_OP(data2, 2); +#endif /* defined(POOL_L2) */ + if((start_x2 + j + 1) < end_x2) + { + data_total2 = data_total2 + data2.x + data2.y; + } + else + { + data_total2 = data_total2 + data2.x; + } + } + else + { + vec2 data2 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x - 1), i, 0) >> uint(2))); + vec2 data3 = load_and_unpack(src, (tensor3D_offset_fp16(src, (j + stride_x + 1), i, 0) >> uint(2))); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data2 = POW2_OP(data2, 2); + data3 = POW2_OP(data3, 2); +#endif /* defined(POOL_L2) */ + if((start_x2 + j + 1) < end_x2) + { + data_total2 = data_total2 + data3.x + data2.y; + } + else + { + data_total2 = data_total2 + data2.y; + } + } + } + } + //Calculate average + vec2 data_avg; + data_avg.x = data_total1 / float((end_y1 - start_y1) * (end_x1 - start_x1)); + data_avg.y = data_total2 / float((end_y2 - start_y2) * (end_x2 - start_x2)); + + return data_avg; +} + +#ifdef POOLING_LAYER_2 +/** Performs a pooling function of pool size equal to 2. + * + * @note Supported data types are F16; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. 
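A note on the FP16 addressing used throughout this branch of the shader: the buffers are declared as uint, so each 32-bit word carries two half-precision elements. That is why the byte offsets produced by tensor3D_offset_fp16 are shifted right by two before indexing, why load_and_unpack returns a vec2, and why STORE4_fp16 packs a vec4 into two consecutive words. A minimal standalone sketch of the round trip, not using the helpers.h macros (word_offset is a placeholder name):

    uint packed_in = src_ptr[word_offset];       // one word = two fp16 values
    vec2 lanes     = unpackHalf2x16(packed_in);  // .x = low 16 bits, .y = high 16 bits
    lanes          = lanes * lanes;              // e.g. the POOL_L2 squaring step
    dst_ptr[word_offset] = packHalf2x16(lanes);  // repack before writing the uint buffer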
Supported data types: F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + + //Load and calculate data + vec2 data; + uint res; +#if defined(POOL_AVG) || defined(POOL_L2) + data = calculate_avg(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#else /*POOL_AVG*/ + data = calculate_max(2, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + data = SQRT_OP(data); +#endif /* defined(POOL_L2) */ + + res = uint(packHalf2x16(data)); + + // Store result + STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res); +} + +#elif defined(POOLING_LAYER_3) +/** Performs a pooling function of pool size equal to 3. + * + * @note Supported data types are F16; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + + //Load and calculate data + vec2 data; + uint res; +#if defined(POOL_AVG) || defined(POOL_L2) + data = calculate_avg(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#else /*POOL_AVG*/ + data = calculate_max(3, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + data = SQRT_OP(data); +#endif /* defined(POOL_L2) */ + + res = uint(packHalf2x16(data)); + + // Store result + STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res); +} + +#elif defined(POOLING_LAYER_3_OPTIMIZED) +/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3 + * + * @note Supported data types are F16; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + + vec4 res; + // Perform pooling 3x3 for 4 output elements +#if STRIDE_X == 1 + POOLING3x3_STRIDE1_fp16(res, src, dst); +#elif STRIDE_X == 2 + POOLING3x3_STRIDE2_fp16(res, src, dst); +#elif STRIDE_X == 3 + POOLING3x3_STRIDE3_fp16(res, src, dst); +#endif /*STRIDE_X == 1*/ + + // Divide by pool region in case of average pooling +#if defined(POOL_AVG) || defined(POOL_L2) + ivec4 start_x = ((ivec4(int(gl_GlobalInvocationID.x) * 4) + ivec4(0, 1, 2, 3)) * (ivec4(STRIDE_X))) - (ivec4(PAD_X)); + int start_y = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y; + ivec4 end_x = min((start_x + (ivec4(3))), (ivec4(MAX_WIDTH))); + int end_y = min((start_y + 3), MAX_HEIGHT); + res *= (vec4((1.f)) / vec4((ivec4(end_y - start_y)) * (end_x - start_x))); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + STORE4_fp16(dst, CURRENT_OFFSET(dst) >> uint(3), res); +} + +#elif defined(POOLING_LAYER_7) +/** Performs a pooling function of pool size equal to 7. + * + * @note Supported data types are F16; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + + //Load and calculate data + vec2 data; + uint res; +#if defined(POOL_AVG) || defined(POOL_L2) + data = calculate_avg(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#else /*POOL_AVG*/ + data = calculate_max(7, src, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif /*POOL_AVG*/ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + data = SQRT_OP(data); +#endif /* defined(POOL_L2) */ + + res = uint(packHalf2x16(data)); + + // Store result + STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res); +} + +#elif defined(POOLING_LAYER_N) +/** Performs a pooling function of pool size equal to N + * + * @note Supported data types are F16; + * @note Pool size must be passed using POOL_SIZE e.g. POOL_SIZE=13; + * @note In case of average pooling the following information must be passed at compile time: + * POOL_AVG must be provided otherwise max pooling will be performed. + * MAX_WIDTH and MAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * STRIDE_X and STRIDE_Y which are the steps of the window along the x and y directions + * PAD_X and PAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +void main(void) +{ + // Get pixels pointer + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_FP16(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_FP16(dst); + + vec4 vdata00; + vdata00 = vec4(INITIAL_VALUE); + vec4 vdata01; + vdata01 = vec4(INITIAL_VALUE); + vec4 vdata10; + vdata10 = vec4(INITIAL_VALUE); + vec4 vdata11; + vdata11 = vec4(INITIAL_VALUE); + vec2 sdata; + sdata = vec2(INITIAL_VALUE); + + for(int y = 0; y < int(POOL_SIZE); y++) + { + int x = 0; + for(; x <= (int(POOL_SIZE) - 8); x += 8) + { + vec4 data2; + vec4 data3; + LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2))); + LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2)) + uint(2)); + +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data2 *= data2; + data3 *= data3; +#endif /* defined(POOL_L2) */ + + POOL_OP(vdata00, vdata00, data2); + POOL_OP(vdata10, vdata10, data3); + } + + // Leftover + for(; x < int(POOL_SIZE); x = x + 2) + { + vec2 data4middle; + data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x, y, 0) >> uint(2))); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data4middle *= data4middle; +#endif /* defined(POOL_L2) */ + if((x + 1) >= int(POOL_SIZE)) + { + POOL_OP_float(sdata.x, sdata.x, data4middle.x); + } + else + { + float data4; + POOL_OP_float(data4, data4middle.x, data4middle.y); + POOL_OP_float(sdata.x, sdata.x, data4); + } + } + } + + for(int y = STRIDE_X; y < int(POOL_SIZE + STRIDE_X); y++) + { + int x1 = STRIDE_X; + for(; x1 <= (int(POOL_SIZE + STRIDE_X) - 8); x1 += 8) + { + vec4 data2; + vec4 data3; + LOAD4_fp16(data2, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2))); + LOAD4_fp16(data3, src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2)) + uint(2)); + +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data2 *= data2; + data3 *= data3; +#endif /* defined(POOL_L2) */ + + POOL_OP(vdata01, vdata01, data2); + POOL_OP(vdata11, vdata11, data3); + } + + // Leftover + for(; x1 < int(POOL_SIZE + STRIDE_X); x1 = x1 + 2) + { + vec2 data4middle; + data4middle = load_and_unpack(src, (tensor3D_offset_fp16(src, x1, y, 0) >> uint(2))); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data4middle *= data4middle; +#endif /* defined(POOL_L2) */ + if((x1 + 1) >= int(POOL_SIZE + STRIDE_X)) + { + POOL_OP_float(sdata.y, sdata.y, data4middle.x); + } + else + { + float data4; + POOL_OP_float(data4, data4middle.x, data4middle.y); + POOL_OP_float(sdata.y, sdata.y, data4); + } + } + } + + //Reduce result + vec4 reduce40; + POOL_OP(reduce40, vdata00.xyzw, vdata10.xyzw); + vec2 reduce20; + POOL_OP_vec2(reduce20, reduce40.xy, reduce40.zw); + vec4 reduce41; + POOL_OP(reduce41, vdata01.xyzw, vdata11.xyzw); + vec2 reduce21; + POOL_OP_vec2(reduce21, reduce41.xy, reduce41.zw); + vec2 data; + POOL_OP_float(data.x, reduce20.x, 
reduce20.y); + POOL_OP_float(data.x, data.x, sdata.x); + POOL_OP_float(data.y, reduce21.x, reduce21.y); + POOL_OP_float(data.y, data.y, sdata.y); + +#if defined(POOL_AVG) || defined(POOL_L2) + { + // Divide by pool region in case of average pooling + int start_x1 = int(gl_GlobalInvocationID.x) * STRIDE_X - PAD_X; + int start_y1 = int(gl_GlobalInvocationID.y) * STRIDE_Y - PAD_Y; + int end_x1 = int(min(start_x1 + POOL_SIZE, MAX_WIDTH)); + int end_y1 = int(min(start_y1 + POOL_SIZE, MAX_HEIGHT)); + int start_x2 = start_x1 + STRIDE_X; + int start_y2 = start_y1; + int end_x2 = int(min(start_x2 + POOL_SIZE, MAX_WIDTH)); + int end_y2 = int(min(start_y2 + POOL_SIZE, MAX_HEIGHT)); + vec2 res1; + res1.x = float((end_y1 - start_y1) * (end_x1 - start_x1)); + res1.y = float((end_y2 - start_y2) * (end_x2 - start_x2)); + data.x = DIV_OP(data.x, res1.x); + data.y = DIV_OP(data.y, res1.y); + } +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + data = SQRT_OP(data); +#endif /* defined(POOL_L2) */ + uint res; + res = uint(packHalf2x16(data)); + + // Store result + STORE1(dst, CURRENT_OFFSET(dst) >> uint(2), res); +} +#endif /*POOLING_LAYER_2*/ +#endif /*DATA_TYPE_FP32 */ diff --git a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs new file mode 100644 index 0000000000..0bbabeaafc --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
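For reference, the divisor used by the average and L2 pooling variants above comes from clamping the pooling window against MAX_WIDTH/MAX_HEIGHT (which include the padding). The following is a host-side C++ sketch of that divisor logic only; it is not part of the patch, and the shaders themselves work on packed FP16 pairs rather than scalars.

    #include <algorithm>
    #include <cstdio>

    // Divisor for one output element of an average/L2 pool, mirroring the
    // start/end clamping done in the shaders above.
    static float pool_region_size(int out_x, int out_y, int pool_size,
                                  int stride_x, int stride_y,
                                  int pad_x, int pad_y,
                                  int max_width, int max_height)
    {
        const int start_x = out_x * stride_x - pad_x;
        const int start_y = out_y * stride_y - pad_y;
        const int end_x   = std::min(start_x + pool_size, max_width);
        const int end_y   = std::min(start_y + pool_size, max_height);
        return static_cast<float>((end_y - start_y) * (end_x - start_x));
    }

    int main()
    {
        // 3x3 pool, stride 2, no padding, on a 7-wide slice: the first window is a full 3x3, so the divisor is 9.
        std::printf("divisor = %f\n", pool_region_size(0, 0, 3, 2, 2, 0, 0, 7, 7));
        return 0;
    }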
+ */ + +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; + +#include "helpers.h" + +#define MAX_OP(x, y) max((x), (y)) +#define ADD_OP(x, y) ((x) + (y)) +#define SUB_OP(x, y) ((x) - (y)) +#define DIV_OP(x, y) ((x) / (y)) +#define EXP_OP(x) exp((x)) + +#if defined(DATA_TYPE_FP32) +const float MINVAL = -1.0 / 0.0; +vec4 type_min = CONVERT(MINVAL, vec4); + +#define LOAD16(name, offset) \ + vec4(LOAD4(name, offset), \ + LOAD4(name, offset + uint(1)), \ + LOAD4(name, offset + uint(2)), \ + LOAD4(name, offset + uint(3))) + +#define STORE16(name, offset, value) \ + STORE4(name, offset, value.x); \ + STORE4(name, offset + uint(1), value.y); \ + STORE4(name, offset + uint(2), value.z); \ + STORE4(name, offset + uint(3), value.w) + +#ifdef SOFTMAX_LAYER_MAX +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, writeonly); +#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(max, 2, float, readonly); +BUFFER_DECLARATION(dst, 3, float, writeonly); +BUFFER_DECLARATION(sum, 4, float, writeonly); +#elif defined(SOFTMAX_LAYER_NORM) +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(sum, 2, float, readonly); +BUFFER_DECLARATION(dst, 3, float, writeonly); +#endif // SOFTMAX_LAYER_MAX + +layout(std140) uniform shader_params +{ +#ifdef SOFTMAX_LAYER_MAX + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(dst); + uint width; +#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(max); + TENSOR3D_PARAM_DECLARATION(dst); + TENSOR3D_PARAM_DECLARATION(sum); + uint width; +#elif defined(SOFTMAX_LAYER_NORM) + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(sum); + TENSOR3D_PARAM_DECLARATION(dst); +#endif // SOFTMAX_LAYER_MAX +}; + +#ifdef SOFTMAX_LAYER_MAX +/** Identifies the maximum value across the 1st dimension. + * + * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32" + * + * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor slice. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] width Input image width + */ +void main(void) +{ + Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + + // Initialize local maximum + vec4 max_val = CONVERT(type_min, vec4); + + // Calculate max of row + uint width2 = width >> 2; + for(int i = 0; i < int(width2); i++) + { + vec4 data = LOAD16(src, offset(src, i << 2, 0)); + max_val = MAX_OP(data, max_val); + } + +#ifdef NON_MULTIPLE_OF_4 + // Handle non multiple of 4 + for(int i = int(width2 << 2); i < int(width); i++) + { + float data = LOAD4(src, offset(src, i, 0)); + max_val.x = MAX_OP(data, max_val.x); + } +#endif /* NON_MULTIPLE_OF_4 */ + + // Perform max reduction + max_val.xy = MAX_OP(max_val.xy, max_val.zw); + max_val.x = MAX_OP(max_val.x, max_val.y); + + // Store result + STORE4(dst, CURRENT_OFFSET(dst), max_val.x); +} +#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) // SOFTMAX_LAYER_MAX +/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel, + * then gets the exponent of each element as sums all elements across each row. + * + * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32" + * + * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed. + * + * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] max_ptr Pointer to the max values tensor slice. 
Supported data types: same as @p src_ptr + * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes) + * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes) + * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes) + * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor + * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr + * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) + * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) + * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) + * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor + * @param[in] width Input image width + */ +void main(void) +{ + Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max); + Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); + + // Load max value of 1D logits vector (row) + vec4 max_val = CONVERT(LOAD4(max, CURRENT_OFFSET(max)), vec4); + + // Set sum vector + vec4 sum1D = CONVERT(0, vec4); + + // Shift values, exp and sum + uint width2 = width >> 2; + for(int i = 0; i < int(width2); i++) + { + vec4 data = LOAD16(src, offset(src, i << 2, 0)); + data = SUB_OP(data, max_val); + data = EXP_OP(data); + STORE16(dst, offset(dst, i << 2, 0), data); + sum1D = ADD_OP(sum1D, data); + } + +#ifdef NON_MULTIPLE_OF_4 + // Handle non multiple of 4 + for(int i = int(width2 << 2); i < int(width); i++) + { + float data; + data = LOAD4(src, offset(src, i, 0)); + data = SUB_OP(data, max_val.x); + data = EXP_OP(data); + STORE4(dst, offset(dst, i, 0), data); + sum1D.x = ADD_OP(sum1D.x, data); + } +#endif /* NON_MULTIPLE_OF_4 */ + + // Perform sum reduction + sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw); + sum1D.x = ADD_OP(sum1D.x, sum1D.y); + + // Calculate and store result + STORE4(sum, CURRENT_OFFSET(sum), sum1D.x); +} +#elif 
defined(SOFTMAX_LAYER_NORM) // SOFTMAX_LAYER_MAX +/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel. + * + * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32" + * + * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr + * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) + * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) + * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) + * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor + * @param[out] dst_ptr Pointer to the destination tensor slice. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +void main(void) +{ + Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum); + + // Load max value of 1D logits vector (row) + vec4 sum_val = CONVERT(LOAD4(sum, offset(sum, 0, int(gl_GlobalInvocationID.y))), vec4); + vec4 data = LOAD16(src, CURRENT_OFFSET(src)); + STORE16(dst, CURRENT_OFFSET(dst), DIV_OP(data, sum_val)); +} +#endif // SOFTMAX_LAYER_MAX + +#elif defined(DATA_TYPE_FP16) +precision mediump float; + +const float MINVAL1 = -1.0 / 0.0; +vec4 type_min1 = CONVERT(MINVAL1, vec4); + +#define GC_LOAD4_IMAGE(r, name, x, y) \ + load_and_unpack(r.xy, name, x, y); \ + load_and_unpack(r.zw, name, (x + 2), y) + +#define GC_STORE4_IMAGE(r, name, x, y) \ + GC_STORE1_2D_OFFSET(uint(packHalf2x16(r.xy)), name, x, y); \ + GC_STORE1_2D_OFFSET(uint(packHalf2x16(r.zw)), name, (x + 2), y) + +#ifdef SOFTMAX_LAYER_MAX +BUFFER_DECLARATION(src, 1, uint, readonly); +BUFFER_DECLARATION(dst, 2, uint, writeonly); +#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) +BUFFER_DECLARATION(src, 1, uint, readonly); +BUFFER_DECLARATION(max, 2, uint, readonly); +BUFFER_DECLARATION(dst, 3, uint, writeonly); +BUFFER_DECLARATION(sum, 4, uint, writeonly); +#elif defined(SOFTMAX_LAYER_NORM) +BUFFER_DECLARATION(src, 1, uint, readonly); +BUFFER_DECLARATION(sum, 2, uint, readonly); +BUFFER_DECLARATION(dst, 3, uint, writeonly); +#endif // SOFTMAX_LAYER_MAX + +layout(std140) uniform shader_params +{ +#ifdef SOFTMAX_LAYER_MAX + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(dst); + uint width; +#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(max); + TENSOR3D_PARAM_DECLARATION(dst); + TENSOR3D_PARAM_DECLARATION(sum); + uint width; +#elif defined(SOFTMAX_LAYER_NORM) + TENSOR3D_PARAM_DECLARATION(src); + TENSOR3D_PARAM_DECLARATION(sum); + TENSOR3D_PARAM_DECLARATION(dst); +#endif // SOFTMAX_LAYER_MAX +}; + +#define load_and_unpack(rs, names, xs, ys) \ + do \ + { \ + uint packed_s; \ + GC_LOAD1_2D_OFFSET(packed_s, names, xs, ys); \ + rs = vec2(unpackHalf2x16(packed_s)); \ + } while(false) + +#ifdef SOFTMAX_LAYER_MAX +/** Identifies the maximum value across the 1st dimension. + * + * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16" + * + * @param[in] src_ptr Pointer to the source tensor slice. 
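Taken together, the three FP32 branches above (SOFTMAX_LAYER_MAX, SOFTMAX_LAYER_SHIFT_EXP_SUM, SOFTMAX_LAYER_NORM) compute a numerically stable softmax per row: the max is subtracted before exponentiation so exp() cannot overflow. A scalar C++ reference of the same three passes, for illustration only (not part of the patch):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static std::vector<float> softmax_row(const std::vector<float> &x)
    {
        const float m = *std::max_element(x.begin(), x.end()); // pass 1: row maximum
        std::vector<float> y(x.size());
        float sum = 0.f;
        for(size_t i = 0; i < x.size(); ++i)                    // pass 2: shift, exp, accumulate sum
        {
            y[i] = std::exp(x[i] - m);
            sum += y[i];
        }
        for(float &v : y)                                        // pass 3: normalise by the row sum
        {
            v /= sum;
        }
        return y;
    }

    int main()
    {
        for(float v : softmax_row({ 1.f, 2.f, 3.f }))
        {
            std::printf("%f ", v); // ~0.090 0.245 0.665
        }
        return 0;
    }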
Supported data types: F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] width Input image width + */ +void main(void) +{ + Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); + Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + + // Initialize local maximum + vec4 max_val1 = CONVERT(type_min1, vec4); + + // Calculate max of row + uint width2 = width >> 2; + for(int i = 0; i < int(width2); i++) + { + vec4 data1; + GC_LOAD4_IMAGE(data1, src, (i << 2), 0); + max_val1 = MAX_OP(data1, max_val1); + } + +#ifdef NON_MULTIPLE_OF_4 + // Handle non multiple of 4 + for(int i = int(width2 << 2); i < int(width); i = i + 2) + { + vec2 data; + load_and_unpack(data, src, i, 0); + max_val1.x = MAX_OP(data.x, max_val1.x); + if((i + 1) < int(width)) + { + max_val1.x = MAX_OP(data.y, max_val1.x); + } + } +#endif /* NON_MULTIPLE_OF_4 */ + + // Perform max reduction + max_val1.xy = MAX_OP(max_val1.xy, max_val1.zw); + max_val1.x = MAX_OP(max_val1.x, max_val1.y); + vec2 res1 = vec2(max_val1.x, 0.f); + uint res; + res = uint(packHalf2x16(res1)); + + // Store result + GC_STORE1_2D_OFFSET(res, dst, 0, 0); +} +#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) // SOFTMAX_LAYER_MAX +/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel, + * then gets the exponent of each element as sums all elements across each row. + * + * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16" + * + * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed. + * + * @param[in] src_ptr Pointer to the source tensor slice. 
Supported data types: F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr + * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes) + * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes) + * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes) + * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor + * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] sum_ptr Pointer to the sum values tensor slice. 
Supported data types: same as @p src_ptr + * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) + * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) + * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) + * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor + * @param[in] width Input image width + */ +void main(void) +{ + Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); + Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + Image max = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max); + Image sum = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); + + // Load max value of 1D logits vector (row) + vec2 datamaxinit; + load_and_unpack(datamaxinit, max, 0, 0); + vec4 max_val = CONVERT(datamaxinit.x, vec4); + + // Set sum vector + vec4 sum1D1 = CONVERT(0.f, vec4); + + // Shift values, exp and sum + uint width2 = width >> 2; + for(int i = 0; i < int(width2); i++) + { + vec4 data; + GC_LOAD4_IMAGE(data, src, (i << 2), 0); + data = SUB_OP(data, max_val); + data = EXP_OP(data); + GC_STORE4_IMAGE(data, dst, (i << 2), 0); + sum1D1 = ADD_OP(sum1D1, data); + } + +#ifdef NON_MULTIPLE_OF_4 + // Handle non multiple of 4 + for(int i = int(width2 << 2); i < int(width); i = i + 2) + { + vec2 datamiddle; + float data1; + load_and_unpack(datamiddle, src, i, 0); + data1 = SUB_OP(datamiddle.x, max_val.x); + data1 = EXP_OP(data1); + vec2 datares1; + if((i + 1) < int(width)) + { + float data2; + data2 = SUB_OP(datamiddle.y, max_val.x); + data2 = EXP_OP(data2); + datares1 = vec2(data1, data2); + data1 = ADD_OP(data2, data1); + } + else + { + datares1 = vec2(data1, 0.f); + } + uint datares; + datares = uint(packHalf2x16(datares1)); + GC_STORE1_2D_OFFSET(datares, dst, i, 0); + sum1D1.x = ADD_OP(sum1D1.x, data1); + } +#endif /* NON_MULTIPLE_OF_4 */ + + // Perform min/max reduction + sum1D1.xy = ADD_OP(sum1D1.xy, sum1D1.zw); + sum1D1.x = ADD_OP(sum1D1.x, sum1D1.y); + vec2 res1 = vec2(sum1D1.x, 0.f); + uint res; + res = uint(packHalf2x16(res1)); + // Calculate and store result + GC_STORE1_2D_OFFSET(res, sum, 0, 0); +} +#elif defined(SOFTMAX_LAYER_NORM) // SOFTMAX_LAYER_MAX +/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel. + * + * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16" + * + * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_ptr Pointer to the sum values tensor slice. 
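The FP16 paths above move two half-precision values per 32-bit word through packHalf2x16/unpackHalf2x16 (wrapped by load_and_unpack and the GC_LOAD4/GC_STORE4 macros). The sketch below shows only the bit layout those builtins use, with component .x in the low 16 bits and .y in the high 16 bits; the float-to-IEEE-half conversion the GLSL builtins also perform is deliberately omitted, so this is not part of the patch.

    #include <cstdint>
    #include <cstdio>

    static uint32_t pack2x16(uint16_t lo, uint16_t hi)
    {
        return static_cast<uint32_t>(lo) | (static_cast<uint32_t>(hi) << 16);
    }

    static void unpack2x16(uint32_t packed, uint16_t &lo, uint16_t &hi)
    {
        lo = static_cast<uint16_t>(packed & 0xFFFFu);
        hi = static_cast<uint16_t>(packed >> 16);
    }

    int main()
    {
        // 0x3C00 and 0x4000 are the half-precision encodings of 1.0 and 2.0.
        const uint32_t packed = pack2x16(0x3C00, 0x4000);
        uint16_t a = 0, b = 0;
        unpack2x16(packed, a, b);
        std::printf("packed=0x%08X lo=0x%04X hi=0x%04X\n", packed, a, b);
        return 0;
    }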
Supported data types: same as @p src_ptr + * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) + * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) + * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) + * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor + * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +void main(void) +{ + Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src); + Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst); + Image sum = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum); + + // Load max value of 1D logits vector (row) + vec2 sum1; + load_and_unpack(sum1, sum, 0, int(gl_GlobalInvocationID.y)); + vec4 sum_val1 = CONVERT(sum1.x, vec4); + + vec4 data1; + GC_LOAD4_IMAGE(data1, src, 0, 0); + vec4 res = DIV_OP(data1, sum_val1); + GC_STORE4_IMAGE(res, dst, 0, 0); +} +#endif // SOFTMAX_LAYER_MAX +#endif // DATA_TYPE_FP32 \ No newline at end of file diff --git a/src/core/GLES_COMPUTE/cs_shaders/transpose.cs b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs new file mode 100755 index 0000000000..6d020fe70d --- /dev/null +++ b/src/core/GLES_COMPUTE/cs_shaders/transpose.cs @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in; +#include "helpers.h" + +#ifdef DATA_TYPE_FP32 +precision highp float; + +BUFFER_DECLARATION(src, 1, float, readonly); +BUFFER_DECLARATION(dst, 2, float, writeonly); + +layout(std140) uniform shader_params +{ + IMAGE_PARAM_DECLARATION(src); + IMAGE_PARAM_DECLARATION(dst); +}; + +#define LOAD16(r, name, offset) \ + r.x = LOAD4(name, offset); \ + r.y = LOAD4(name, offset + uint(1)); \ + r.z = LOAD4(name, offset + uint(2)); \ + r.w = LOAD4(name, offset + uint(3)) + +#define STORE16(name, offset, r) \ + STORE4(name, offset, r.x); \ + STORE4(name, offset + uint(1), r.y); \ + STORE4(name, offset + uint(2), r.z); \ + STORE4(name, offset + uint(3), r.w) + +/** This OpenGL ES kernel computes the matrix transposition of input matrix + * + * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32 + * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + */ +void main(void) +{ + // Compute source address + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Load the NxN block at (x, y) + vec4 u0; + vec4 u1; + vec4 u2; + vec4 u3; + LOAD16(u0, src, offset(src, 0, 0)); + LOAD16(u1, src, offset(src, 0, 1)); + LOAD16(u2, src, offset(src, 0, 2)); + LOAD16(u3, src, offset(src, 0, 3)); + + // Transpose the block + vec4 tmp; + tmp.xyz = u0.yzw; + u0.y = u1.x; + u0.z = u2.x; + u0.w = u3.x; + u1.x = tmp.x; + u2.x = tmp.y; + u3.x = tmp.z; + tmp.xy = u1.zw; + u1.z = u2.y; + u1.w = u3.y; + u2.y = tmp.x; + u3.y = tmp.y; + tmp.x = u2.w; + u2.w = u3.z; + u3.z = tmp.x; + + // Store the block at (y, x) + uint dst_offset_in_bytes = uint(16) * uint(gl_GlobalInvocationID.y) + uint(4) * uint(gl_GlobalInvocationID.x) * (dst.stride_y) + (dst.offset_first_element_in_bytes); + + STORE16(dst, uint((dst_offset_in_bytes + uint(0) * dst.stride_y) >> 2), u0); + STORE16(dst, uint((dst_offset_in_bytes + uint(1) * dst.stride_y) >> 2), u1); + STORE16(dst, uint((dst_offset_in_bytes + uint(2) * dst.stride_y) >> 2), u2); + STORE16(dst, uint((dst_offset_in_bytes + uint(3) * dst.stride_y) >> 2), u3); +} + +#elif defined(DATA_TYPE_FP16) +precision mediump float; + +BUFFER_DECLARATION(src, 1, uvec2, readonly); +BUFFER_DECLARATION(dst, 2, uvec2, writeonly); + +layout(std140) uniform shader_params +{ + IMAGE_PARAM_DECLARATION(src); + IMAGE_PARAM_DECLARATION(dst); +}; + +/** This OpenGL ES kernel computes the matrix transposition of input matrix + * + * @param[in] src_ptr Pointer to the source matrix. 
Supported data types: F16 + * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + */ +void main(void) +{ + // Compute source address + Image src = GC_CONVERT_TO_IMAGE_STRUCT(src); + Image dst = GC_CONVERT_TO_IMAGE_STRUCT(dst); + + // Load the NxN block at (x, y) + vec4 u0; + vec4 u1; + vec4 u2; + vec4 u3; + uvec2 packed_s[4]; + GC_LOAD1_2D_OFFSET(packed_s[0], src, 0, 0); + GC_LOAD1_2D_OFFSET(packed_s[1], src, 0, 1); + GC_LOAD1_2D_OFFSET(packed_s[2], src, 0, 2); + GC_LOAD1_2D_OFFSET(packed_s[3], src, 0, 3); + u0 = vec4(unpackHalf2x16(packed_s[0].x), unpackHalf2x16(packed_s[0].y)); + u1 = vec4(unpackHalf2x16(packed_s[1].x), unpackHalf2x16(packed_s[1].y)); + u2 = vec4(unpackHalf2x16(packed_s[2].x), unpackHalf2x16(packed_s[2].y)); + u3 = vec4(unpackHalf2x16(packed_s[3].x), unpackHalf2x16(packed_s[3].y)); + + // Transpose the block + vec4 tmp; + tmp.xyz = u0.yzw; + u0.y = u1.x; + u0.z = u2.x; + u0.w = u3.x; + u1.x = tmp.x; + u2.x = tmp.y; + u3.x = tmp.z; + tmp.xy = u1.zw; + u1.z = u2.y; + u1.w = u3.y; + u2.y = tmp.x; + u3.y = tmp.y; + tmp.x = u2.w; + u2.w = u3.z; + u3.z = tmp.x; + + // Store the block at (y, x) + uint dst_offset_in_bytes = uint(8) * uint(gl_GlobalInvocationID.y) + uint(gl_GlobalInvocationID.x) * (dst_step_y) + (dst.offset_first_element_in_bytes); + + packed_s[0] = uvec2(packHalf2x16(u0.xy), packHalf2x16(u0.zw)); + packed_s[1] = uvec2(packHalf2x16(u1.xy), packHalf2x16(u1.zw)); + packed_s[2] = uvec2(packHalf2x16(u2.xy), packHalf2x16(u2.zw)); + packed_s[3] = uvec2(packHalf2x16(u3.xy), packHalf2x16(u3.zw)); + GC_STORE1(packed_s[0], dst, uint((dst_offset_in_bytes + uint(0) * dst_stride_y) >> 3)); + GC_STORE1(packed_s[1], dst, uint((dst_offset_in_bytes + uint(1) * dst_stride_y) >> 3)); + GC_STORE1(packed_s[2], dst, uint((dst_offset_in_bytes + uint(2) * dst_stride_y) >> 3)); + GC_STORE1(packed_s[3], dst, uint((dst_offset_in_bytes + uint(3) * dst_stride_y) >> 3)); +} +#endif /*ARM_COMPUTE_ENABLE_FP16*/ diff --git a/src/core/GLES_COMPUTE/egl_entries.in b/src/core/GLES_COMPUTE/egl_entries.in new file mode 100644 index 0000000000..64ccda63c9 --- /dev/null +++ b/src/core/GLES_COMPUTE/egl_entries.in @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2017 ARM Limited. 
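Both transpose variants above read one 4x4 block of elements at block coordinates (x, y) and write it back, transposed, at (y, x); the FP32 path loads four floats per row while the FP16 path loads a uvec2 holding four packed halves. A plain C++ sketch of that tiling, for illustration only and assuming both dimensions are multiples of 4:

    #include <cstdio>

    // src is a width x height row-major matrix; dst receives its transpose
    // (height columns per row), written one 4x4 tile at a time.
    static void transpose_4x4_blocks(const float *src, float *dst, int width, int height)
    {
        for(int by = 0; by < height; by += 4)
        {
            for(int bx = 0; bx < width; bx += 4)
            {
                for(int y = 0; y < 4; ++y)
                {
                    for(int x = 0; x < 4; ++x)
                    {
                        // element (row by+y, col bx+x) of src lands at (row bx+x, col by+y) of dst
                        dst[(bx + x) * height + (by + y)] = src[(by + y) * width + (bx + x)];
                    }
                }
            }
        }
    }

    int main()
    {
        float src[32], dst[32]; // an 8x4 source produces a 4x8 transpose
        for(int i = 0; i < 32; ++i) src[i] = float(i);
        transpose_4x4_blocks(src, dst, 8, 4);
        std::printf("%f\n", dst[1 * 4 + 0]); // 1.0, i.e. src element (row 0, col 1)
        return 0;
    }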
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +EGL_ENTRY(eglGetProcAddress) +EGL_ENTRY(eglBindAPI) +EGL_ENTRY(eglChooseConfig) +EGL_ENTRY(eglCreateContext) +EGL_ENTRY(eglDestroyContext) +EGL_ENTRY(eglGetDisplay) +EGL_ENTRY(eglInitialize) +EGL_ENTRY(eglMakeCurrent) +EGL_ENTRY(eglTerminate) +EGL_ENTRY(eglGetError) +EGL_ENTRY(eglQueryString) diff --git a/src/core/GLES_COMPUTE/gl_entries.in b/src/core/GLES_COMPUTE/gl_entries.in new file mode 100644 index 0000000000..15ce8ee819 --- /dev/null +++ b/src/core/GLES_COMPUTE/gl_entries.in @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +GL_ENTRY(glAttachShader) +GL_ENTRY(glCompileShader) +GL_ENTRY(glCreateProgram) +GL_ENTRY(glCreateShader) +GL_ENTRY(glDeleteProgram) +GL_ENTRY(glDeleteShader) +GL_ENTRY(glDetachShader) +GL_ENTRY(glGetProgramInfoLog) +GL_ENTRY(glGetProgramiv) +GL_ENTRY(glGetShaderInfoLog) +GL_ENTRY(glGetShaderiv) +GL_ENTRY(glLinkProgram) +GL_ENTRY(glShaderSource) +GL_ENTRY(glUseProgram) +GL_ENTRY(glBindBuffer) +GL_ENTRY(glBindBufferBase) +GL_ENTRY(glBufferData) +GL_ENTRY(glDeleteBuffers) +GL_ENTRY(glDispatchCompute) +GL_ENTRY(glFlush) +GL_ENTRY(glGenBuffers) +GL_ENTRY(glGetProgramResourceIndex) +GL_ENTRY(glGetUniformLocation) +GL_ENTRY(glMapBufferRange) +GL_ENTRY(glMemoryBarrier) +GL_ENTRY(glUniform1ui) +GL_ENTRY(glUnmapBuffer) +GL_ENTRY(glGetError) +GL_ENTRY(glGetActiveUniformBlockiv) +GL_ENTRY(glUniformBlockBinding) +GL_ENTRY(glGetUniformBlockIndex) +GL_ENTRY(glGenTextures) +GL_ENTRY(glDeleteTextures) +GL_ENTRY(glBindTexture) +GL_ENTRY(glTexImage2D) +GL_ENTRY(glGenFramebuffers) +GL_ENTRY(glDeleteFramebuffers) +GL_ENTRY(glBindFramebuffer) +GL_ENTRY(glFramebufferTexture2D) diff --git a/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp new file mode 100644 index 0000000000..d76ae8ff1c --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
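The two entry lists above (egl_entries.in and gl_entries.in) are X-macro inputs: each line expands through whatever EGL_ENTRY/GL_ENTRY means at the point where the library includes the file, so the same list can generate declarations, loader calls, and so on. The snippet below is a hypothetical illustration of that pattern only; the actual macro definitions and the runtime resolution live in the library's OpenGLES loader code, not here.

    #include <cstdio>

    // Illustrative consumption of an entries list: collect symbol names so they
    // can later be resolved at runtime (e.g. through eglGetProcAddress).
    #define GL_ENTRY(name) #name,
    static const char *const gl_symbol_names[] =
    {
        // In the library this would be: #include "gl_entries.in"
        GL_ENTRY(glAttachShader)
        GL_ENTRY(glDispatchCompute)
        GL_ENTRY(glMemoryBarrier)
    };
    #undef GL_ENTRY

    int main()
    {
        for(const char *name : gl_symbol_names)
        {
            std::printf("would resolve %s at runtime\n", name);
        }
        return 0;
    }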
+ */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "support/ToolchainSupport.h" + +#include +#include + +using namespace arm_compute; + +GCAbsoluteDifferenceKernel::GCAbsoluteDifferenceKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void GCAbsoluteDifferenceKernel::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output); + + _input1 = input1; + _input2 = input2; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 4; + + // Set kernel build options + std::set build_opts; + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel("absdiff", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowRectangle input1_access(input1->info(), 0, 0, 4, 1); + AccessWindowRectangle input2_access(input2->info(), 0, 0, 4, 1); + AccessWindowRectangle output_access(output->info(), 0, 0, 4, 1); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + _kernel.clear_params(); + + // set shader params binding point + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCAbsoluteDifferenceKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window); + + _kernel.use(); + + Window slice = window.first_slice_window_2D(); + do + { + unsigned int idx = 0; + unsigned int binding = 1; // SSBO binding starts from 1. + add_2D_tensor_argument(idx, _input1, binding++, slice); + add_2D_tensor_argument(idx, _input2, binding++, slice); + add_2D_tensor_argument(idx, _output, binding++, slice); + + _kernel.update_shader_params(); + + enqueue(*this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp new file mode 100644 index 0000000000..42433cf076 --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2017 ARM Limited. 
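In GCAbsoluteDifferenceKernel the shader_params uniform block is bound at point 0, which is why the SSBO bindings passed in run() start at 1. The operation itself, which the "absdiff" shader performs four U8 elements per invocation, reduces to the following scalar reference (illustrative only, not part of the patch); note that for U8 inputs the absolute difference always fits back into U8, so no saturation is needed.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static std::vector<uint8_t> absdiff_u8(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b)
    {
        std::vector<uint8_t> out(a.size());
        for(size_t i = 0; i < a.size(); ++i)
        {
            // out[i] = |a[i] - b[i]| without relying on signed intermediates
            out[i] = static_cast<uint8_t>(a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]);
        }
        return out;
    }

    int main()
    {
        const auto out = absdiff_u8({ 10, 200, 7 }, { 15, 100, 7 });
        std::printf("%d %d %d\n", out[0], out[1], out[2]); // 5 100 0
        return 0;
    }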
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h" + +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "support/ToolchainSupport.h" + +#include +#include + +using namespace arm_compute; + +GCActivationLayerKernel::GCActivationLayerKernel() + : _input(nullptr), _output(nullptr) +{ +} + +void GCActivationLayerKernel::configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + + // Make sure _kernel is initialized before calling the parent's configure + _input = input; + _output = input; + + if(output != nullptr) + { + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + + _output = output; + } + + unsigned int num_elems_processed_per_iteration = 4 / input->info()->element_size(); + + // Set build options + std::set<std::string> build_opts; + std::string dt_name = (input->info()->data_type() == DataType::F32) ?
"DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + build_opts.emplace(("#define " + string_from_activation_func(act_info.activation()))); + build_opts.emplace(("#define " + dt_name)); + build_opts.emplace(("#define A_VAL " + float_to_string_with_full_precision(act_info.a()))); + build_opts.emplace(("#define B_VAL " + float_to_string_with_full_precision(act_info.b()))); + build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1))); + build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1))); + build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1))); + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel("activation_layer", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + if(output != nullptr) + { + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + } + else + { + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); + } + + _kernel.clear_params(); + + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCActivationLayerKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window); + + _kernel.use(); + + Window slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + unsigned int binding = 1; + add_3D_tensor_argument(idx, _input, binding++, slice); + add_3D_tensor_argument(idx, _output, binding++, slice); + _kernel.update_shader_params(); + enqueue(*this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp new file mode 100644 index 0000000000..9c24d2ef42 --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +GCBatchNormalizationLayerKernel::GCBatchNormalizationLayerKernel() + : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0.0f) +{ +} + +void GCBatchNormalizationLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma, + float epsilon) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, mean, var, beta, gamma); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, mean, var, beta, gamma); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0)); + + _input = input; + _output = output; + _mean = mean; + _var = var; + _beta = beta; + _gamma = gamma; + _epsilon = epsilon; + + const unsigned int num_elems_processed_per_iteration = 4 / input->info()->element_size(); + + // Set build options + std::set build_opts; + std::string dt_name = (input->info()->data_type() == DataType::F32) ? 
"DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + build_opts.emplace(("#define " + dt_name)); + build_opts.emplace(("#define ESPILON " + float_to_string_with_full_precision(_epsilon))); + build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1))); + build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1))); + build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1))); + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + AccessWindowStatic mean_access(mean->info(), 0, 0, mean->info()->dimension(0) + 1, mean->info()->dimension(1)); + AccessWindowStatic var_access(var->info(), 0, 0, var->info()->dimension(0) + 1, var->info()->dimension(1)); + AccessWindowStatic beta_access(beta->info(), 0, 0, beta->info()->dimension(0) + 1, beta->info()->dimension(1)); + AccessWindowStatic gamma_access(gamma->info(), 0, 0, gamma->info()->dimension(0) + 1, gamma->info()->dimension(1)); + + update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access, gamma_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + _kernel.clear_params(); + + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCBatchNormalizationLayerKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + _kernel.use(); + + Window slice = window.first_slice_window_3D(); + + Window vector_slice = window.first_slice_window_1D(); + vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0)); + + unsigned int idx = 2 * num_arguments_per_3D_tensor(); + add_1D_tensor_argument(idx, _mean, 3, vector_slice); + add_1D_tensor_argument(idx, _var, 4, vector_slice); + add_1D_tensor_argument(idx, _beta, 5, vector_slice); + add_1D_tensor_argument(idx, _gamma, 6, vector_slice); + + do + { + idx = 0; + add_3D_tensor_argument(idx, _input, 1, slice); + add_3D_tensor_argument(idx, _output, 2, slice); + + _kernel.update_shader_params(); + enqueue(*this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp new file mode 100644 index 0000000000..10716232c9 --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +GCCol2ImKernel::GCCol2ImKernel() + : _input(nullptr), _output(nullptr), _convolved_dims() +{ +} + +void GCCol2ImKernel::configure(const IGCTensor *input, IGCTensor *output, + std::pair convolved_dims) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _kernel.clear_params(); + + _input = input; + _output = output; + _convolved_dims = convolved_dims; + + // Create kernel + std::set build_opts; + constexpr unsigned int num_elems_processed_per_iteration = 8; + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + build_opts.insert("#define COL2IM"); + _kernel = static_cast(GCKernelLibrary::get().create_kernel("col2im", build_opts)); + + // Set static kernel arguments + unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor(); + _kernel.set_params(idx++, _convolved_dims.first); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps()); + + // The GCCol2ImKernel doesn't need padding so update_window_and_padding() can be skipped + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + // set shader params binding point + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCCol2ImKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window); + + Window slice_in = window.first_slice_window_2D(); + Window slice_out = window.first_slice_window_3D(); + + _kernel.use(); + + do + { + // Set inputs + unsigned int idx = 0; + unsigned int binding = 1; + add_2D_tensor_argument(idx, _input, binding++, slice_in); + add_3D_tensor_argument(idx, _output, binding++, slice_out); + _kernel.update_shader_params(); + enqueue(*this, slice_in); + } + while(window.slide_window_slice_2D(slice_in) && window.slide_window_slice_3D(slice_out)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp new file mode 100644 index 0000000000..7f9f438a46 --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +GCDepthConcatenateKernel::GCDepthConcatenateKernel() + : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0) +{ +} + +BorderSize GCDepthConcatenateKernel::border_size() const +{ + return BorderSize(_top_bottom, _left_right); +} + +void GCDepthConcatenateKernel::configure(const IGCTensor *input, unsigned int depth_offset, IGCTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1)); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output); + + // The gaps between the two lowest dimensions of input and output need to be divisible by 2 + // Otherwise it is not clear how the padding should be added onto the input tensor + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2); + + _input = input; + _output = output; + + // Add build options + std::set build_opts; + std::string dt_name = (input->info()->data_type() == DataType::F32) ? 
"DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + build_opts.emplace(("#define " + dt_name)); + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + + // Configure kernel window + _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2; + _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2; + + const int offset_to_first_elements_in_bytes = depth_offset * output->info()->strides_in_bytes()[2]; + + build_opts.emplace("#define OFFSETS_X " + support::cpp11::to_string(_left_right)); + build_opts.emplace("#define OFFSETS_Y " + support::cpp11::to_string(_top_bottom)); + build_opts.emplace("#define OFFSETS_Z " + support::cpp11::to_string(offset_to_first_elements_in_bytes)); + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel("concatenate_depth", build_opts)); + + unsigned int num_elems_processed_per_iteration = 1; + unsigned int num_elems_read_per_iteration = 1; + if(input->info()->data_type() == DataType::F32) + { + num_elems_processed_per_iteration = 1; + num_elems_read_per_iteration = 1; + } + else if(input->info()->data_type() == DataType::F16) + { + num_elems_processed_per_iteration = 4; + num_elems_read_per_iteration = 4; + } + const unsigned int num_rows_read_per_iteration = 1; + + // The window needs to be based on input as we copy all the depths of input + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1)); + + AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + _kernel.clear_params(); + _kernel.set_shader_params_binding_point(0); + IGCKernel::configure(win); +} + +void GCDepthConcatenateKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window); + + _kernel.use(); + + Window slice = window.first_slice_window_3D(); + + do + { + if(_input->info()->data_type() == DataType::F32) + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, 1, slice); + add_3D_tensor_argument(idx, _output, 2, slice); + } + else if(_input->info()->data_type() == DataType::F16) + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice); + add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice); + } + + _kernel.update_shader_params(); + + enqueue(*this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp new file mode 100644 index 0000000000..1fa2a71fff --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +template +GCDirectConvolutionLayerKernel::GCDirectConvolutionLayerKernel() + : _input(nullptr), _bias(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_x(0), _conv_pad_y(0), _lws(gles::NDRange(1U, 1U, 1U)) +{ +} + +template +BorderSize GCDirectConvolutionLayerKernel::border_size() const +{ + return _border_size; +} + +template +void GCDirectConvolutionLayerKernel::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); + ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2)); + ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1)); + ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON_MSG((kernel_size == 3 && std::get<0>(conv_info.stride()) > 2), "Strides larger than 2 not supported in 3x3 direct convolution!"); + ARM_COMPUTE_ERROR_ON(kernel_size != weights->info()->dimension(0)); + + if(bias != nullptr) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias); + // FIXME: Bug in framework, workaround it in tests currently. 
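// (The disabled check below would, presumably, enforce one bias value per output
// feature map, i.e. bias dimension 0 matching weights dimension 3, the number of
// convolution kernels.)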
+ //ARM_COMPUTE_ERROR_ON(bias->info()->dimension(0) != weights->info()->dimension(3)); + ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1); + } + + _conv_stride_x = std::get<0>(conv_info.stride()); + _conv_stride_y = std::get<1>(conv_info.stride()); + _conv_pad_x = std::get<0>(conv_info.pad()); + _conv_pad_y = std::get<1>(conv_info.pad()); + + _input = input; + _weights = weights; + _output = output; + _bias = bias; + _border_size = BorderSize(_conv_pad_y, _conv_pad_x); + + std::set options; + + options.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0])); + options.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1])); + options.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2])); + options.emplace("#define STRIDE_X " + support::cpp11::to_string(_conv_stride_x)); + + std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + options.emplace(("#define " + dt_name)); + + unsigned int num_elems_read_per_iteration_x = kernel_size * _conv_stride_x; + unsigned int num_elems_read_per_iteration_y = 1; + unsigned int num_elems_written_per_iteration_x = 1; + unsigned int num_elems_written_per_iteration_y = 1; + unsigned int num_elems_written_per_iteration_z = 1; + + if(kernel_size == 3) + { + if((_conv_stride_x == 1) && (_conv_stride_y == 1)) + { + switch(input->info()->data_type()) + { + // TODO(APPBROWSER-299): Choose the most optimal path and remove others. +#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16 + + case DataType::F16: +#if defined(PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16) + options.emplace("#define PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16"); + num_elems_read_per_iteration_x = 16; + num_elems_read_per_iteration_y = 5; + num_elems_written_per_iteration_x = 8; + num_elems_written_per_iteration_y = 3; +#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16) + options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16"); + num_elems_read_per_iteration_x = 8; + num_elems_read_per_iteration_y = 5; + num_elems_written_per_iteration_x = 4; + num_elems_written_per_iteration_y = 3; +#elif defined(PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16) + options.emplace("#define PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16"); + num_elems_read_per_iteration_x = 8; + num_elems_read_per_iteration_y = 6; + num_elems_written_per_iteration_x = 4; + num_elems_written_per_iteration_y = 4; +#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16) + options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16"); + num_elems_read_per_iteration_x = 8; + num_elems_read_per_iteration_y = 5; + num_elems_written_per_iteration_x = 4; + num_elems_written_per_iteration_y = 3; + num_elems_written_per_iteration_z = 2; +#endif /* PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16 */ + break; + + case DataType::F32: + options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS"); + num_elems_read_per_iteration_x = 8; + num_elems_read_per_iteration_y = 5; + num_elems_written_per_iteration_x = 4; + num_elems_written_per_iteration_y = 3; + break; + + default: + ARM_COMPUTE_ERROR("Current data type is not supported"); + break; + } + } + // FIXME: Just keep one in release + else + { + switch(input->info()->data_type()) + { + case DataType::F16: + options.emplace("#define PROCESS_X_4ELEMENTS_FP16"); + num_elems_read_per_iteration_x = 8; + num_elems_written_per_iteration_x = 4; + break; + + case DataType::F32: + // TODO(APPBROWSER-299): Choose the most optimal path and remove others. 
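// The preprocessor block below picks exactly one host-side path: the chosen
// PROCESS_*_ELEMENT macro is forwarded to the shader as a build option and the
// matching num_elems_read/written_per_iteration_x values are set. Those counts feed
// the access-window and padding calculation further down, so the selected shader
// variant and the host-side window have to stay in sync.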
+#define PROCESS_4_ELEMENT + +#if defined(PROCESS_1_ELEMENT) + options.emplace("#define PROCESS_1_ELEMENT"); + num_elems_read_per_iteration_x = 3; + num_elems_written_per_iteration_x = 1; +#elif defined(PROCESS_4_ELEMENT) + options.emplace("#define PROCESS_4_ELEMENT"); + num_elems_read_per_iteration_x = 8; + num_elems_written_per_iteration_x = 4; +#elif defined(PROCESS_8_ELEMENT) + options.emplace("#define PROCESS_8_ELEMENT"); + num_elems_read_per_iteration_x = 12; + num_elems_written_per_iteration_x = 8; +#else /* PROCESS_1_ELEMENT */ +#error Have to declare how many elements to process in one thread. +#endif /* PROCESS_1_ELEMENT */ + break; + + default: + ARM_COMPUTE_ERROR("Current data type is not supported"); + break; + } + } + } + else if(kernel_size == 1) + { + switch(input->info()->data_type()) + { + case DataType::F16: + num_elems_read_per_iteration_x = 8; + num_elems_written_per_iteration_x = 8; + break; + + case DataType::F32: + num_elems_read_per_iteration_x = 1; + num_elems_written_per_iteration_x = 1; + break; + + default: + break; + } + } + else if(kernel_size == 5) + { + switch(input->info()->data_type()) + { + case DataType::F16: + num_elems_read_per_iteration_x = 8; + num_elems_written_per_iteration_x = 4; + + default: + break; + } + } + else + { + } + + if(_bias != nullptr) + { + options.emplace("#define BIAS"); + } + + std::stringstream kernel_name; + kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size; + + _kernel = static_cast(GCKernelLibrary::get().create_kernel(kernel_name.str(), options)); + + _kernel.clear_params(); + + unsigned int idx = (_bias == nullptr) ? 3 * num_arguments_per_3D_tensor() : (num_arguments_per_1D_tensor() + 3 * num_arguments_per_3D_tensor()); + + // Calculate output right and bottom border + const int output_width = output->info()->dimension(0); + const int output_height = output->info()->dimension(1); + const int output_padding_right = ceil_to_multiple(output_width, num_elems_written_per_iteration_x * _lws[0]) - output_width; + const int output_padding_bottom = ceil_to_multiple(output_height, num_elems_written_per_iteration_y * _lws[1]) - output_height; + + // Calculate input right and bottom border + const int input_width = input->info()->dimension(0); + const int input_height = input->info()->dimension(1); + const int upper_bound_w = ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + (kernel_size - 1)), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_x - input_width; + const int upper_bound_h = ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + (kernel_size - 1)), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_y - input_height; + const int padding_right = std::max(upper_bound_w, _conv_pad_x); + const int padding_bottom = std::max(upper_bound_h, _conv_pad_y); + + BorderSize border = BorderSize(0, output_padding_right, output_padding_bottom, 0); + + Window win = calculate_max_enlarged_window(*output->info(), Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, num_elems_written_per_iteration_z), border); + + AccessWindowStatic input_access(input->info(), -_conv_pad_x, -_conv_pad_y, input_width + padding_right, input_height + padding_bottom); + AccessWindowStatic weights_access = AccessWindowStatic(nullptr, 0, 0, 0, 0); + AccessWindowStatic bias_access = AccessWindowStatic(nullptr, 0, 0, 0, 1); + + switch(weights->info()->data_type()) + { + case DataType::F16: + weights_access = AccessWindowStatic(weights->info(), 0, 0, kernel_size + 1, 
kernel_size); + if(_bias != nullptr) + { + bias_access = AccessWindowStatic(_bias->info(), 0, 0, _bias->info()->dimension(0) + 1, 1); + } + break; + + case DataType::F32: + weights_access = AccessWindowStatic(weights->info(), 0, 0, kernel_size, kernel_size); + if(_bias != nullptr) + { + bias_access = AccessWindowStatic(_bias->info(), 0, 0, _bias->info()->dimension(0), 1); + } + break; + + default: + ARM_COMPUTE_ERROR("Current data type is not supported"); + break; + } + + AccessWindowStatic output_access(output->info(), 0, 0, output_width + output_padding_right, output_height + output_padding_bottom); + + if(_bias != nullptr) + { + update_window_and_padding(win, input_access, weights_access, bias_access, output_access); + } + else + { + update_window_and_padding(win, input_access, weights_access, output_access); + } + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + _kernel.set_params(idx++, _weights->info()->strides_in_bytes()[3]); // weights_stride_w + _kernel.set_params(idx++, _weights->info()->dimension(2)); // weights_depth + + // set shader params binding point + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +template +void GCDirectConvolutionLayerKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + _kernel.use(); + + // Get initial windows + Window slice = window.first_slice_window_3D(); + Window win_in = window; + + win_in.adjust(Window::DimX, -_conv_pad_x, true); + win_in.adjust(Window::DimY, -_conv_pad_y, true); + win_in.set_dimension_step(Window::DimX, window.x().step() * _conv_stride_x); + win_in.set_dimension_step(Window::DimY, window.y().step() * _conv_stride_y); + + Window slice_in = win_in.first_slice_window_3D(); + + unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); + add_3D_tensor_argument(idx1, _weights, BufferParam(3, 2), slice); + + if(_bias != nullptr) + { + Window slice_bias; + slice_bias.use_tensor_dimensions(_bias->info()->tensor_shape()); + add_1D_tensor_argument(idx1, _bias, BufferParam(4, 2), slice_bias); + } + + do + { + unsigned int idx = 0; + + switch(_input->info()->data_type()) + { + case DataType::F16: + switch(kernel_size) + { + case 1: + add_3D_tensor_argument(idx, _input, BufferParam(1, 4), slice_in); + add_3D_tensor_argument(idx, _output, BufferParam(2, 4), slice); + break; + + case 3: + add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice_in); + add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice); + break; + + case 5: + add_3D_tensor_argument(idx, _input, BufferParam(1, 3), slice_in); + add_3D_tensor_argument(idx, _output, BufferParam(2, 3), slice); + break; + + default: + ARM_COMPUTE_ERROR("Current kernel size %d is not supported", kernel_size); + break; + } + break; + + case DataType::F32: + switch(kernel_size) + { + case 1: + case 5: + add_3D_tensor_argument(idx, _input, BufferParam(1, 2), slice_in); + add_3D_tensor_argument(idx, _output, BufferParam(2, 2), slice); + break; + + case 3: + add_3D_tensor_argument(idx, _input, BufferParam(1, 4), slice_in); + add_3D_tensor_argument(idx, _output, BufferParam(2, 4), slice); + break; + + default: + ARM_COMPUTE_ERROR("Current kernel size %d is not supported", kernel_size); + break; + } + break; + + default: + ARM_COMPUTE_ERROR("Current data type is not supported"); + break; + } + + _kernel.update_shader_params(); + enqueue(*this, slice, _lws); + } + while(window.slide_window_slice_3D(slice) 
&& win_in.slide_window_slice_3D(slice_in)); +} + +template class arm_compute::GCDirectConvolutionLayerKernel<1>; +template class arm_compute::GCDirectConvolutionLayerKernel<3>; +template class arm_compute::GCDirectConvolutionLayerKernel<5>; diff --git a/src/core/GLES_COMPUTE/kernels/GCDropoutKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDropoutKernel.cpp new file mode 100644 index 0000000000..6244fbef80 --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCDropoutKernel.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCDropoutKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "support/ToolchainSupport.h" + +#include +#include +#include + +using namespace arm_compute; + +GCDropoutKernel::GCDropoutKernel() + : _input(nullptr), _mask(nullptr), _output(nullptr), _num_elems_processed_per_iteration(0) +{ +} + +void GCDropoutKernel::configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, mask, output); + + _input = input; + _mask = mask; + _output = output; + _kernel.clear_params(); + + std::set build_opts; + std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + std::string fporbp = forward ? "FORWARD" : "BACKWARD"; + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_real_distribution dist(0.f, 1.f); + + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + build_opts.emplace("#define RATIO " + support::cpp11::to_string(ratio)); + build_opts.emplace("#define SCALE " + support::cpp11::to_string(1. / (1. 
- ratio))); + build_opts.emplace("#define SEED " + support::cpp11::to_string(dist(mt))); + build_opts.emplace("#define " + dt_name); + build_opts.emplace("#define " + fporbp); + + _num_elems_processed_per_iteration = 4 / input->info()->element_size(); + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel("dropout", build_opts)); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration)); + + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + // set shader params binding point + _kernel.set_shader_params_binding_point(0); + IGCKernel::configure(win); +} + +void GCDropoutKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window); + + _kernel.use(); + + Window slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + + add_3D_tensor_argument(idx, _input, BufferParam(1, 2), slice); + add_3D_tensor_argument(idx, _mask, BufferParam(2, 2), slice); + add_3D_tensor_argument(idx, _output, BufferParam(3, 2), slice); + + _kernel.update_shader_params(); + enqueue(*this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp new file mode 100644 index 0000000000..36742ef81e --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include + +using namespace arm_compute; + +GCFillBorderKernel::GCFillBorderKernel() + : IGCKernel(), _tensor(nullptr) +{ +} + +bool GCFillBorderKernel::is_parallelisable() const +{ + return false; +} + +template +void GCFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue &constant_border_value) +{ + T value; + constant_border_value.get(value); + _kernel.set_params(idx, static_cast(value)); +} + +void GCFillBorderKernel::configure(const IGCTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +{ + ARM_COMPUTE_ERROR_ON(tensor == nullptr); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1); + + border_size.limit(tensor->info()->padding()); + + // If there is no border: early exit + if(border_size.empty() || border_mode == BorderMode::UNDEFINED) + { + return; + } + + // Select appropriate kernel + std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode)); + + // Define build options + std::set build_opts; + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + build_opts.emplace("#define BORDER_SIZE_TOP " + support::cpp11::to_string(border_size.top)); + build_opts.emplace("#define BORDER_SIZE_BOTTOM " + support::cpp11::to_string(border_size.bottom)); + build_opts.emplace("#define BORDER_SIZE_LEFT " + support::cpp11::to_string(border_size.left)); + build_opts.emplace("#define BORDER_SIZE_RIGHT " + support::cpp11::to_string(border_size.right)); + + if(border_mode == BorderMode::REPLICATE) + { + build_opts.emplace("#define FILL_IMAGE_BORDERS_REPLICATE\n"); + } + else + { + build_opts.emplace("#define FILL_IMAGE_BORDERS_CONSTANT\n"); + } + + switch(tensor->info()->data_type()) + { + case DataType::F16: + build_opts.emplace("#define DATA_TYPE_FP16"); + break; + + case DataType::F32: + build_opts.emplace("#define DATA_TYPE_FP32"); + break; + + default: + ARM_COMPUTE_ERROR("Current data type is not supported"); + break; + } + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel(kernel_name, build_opts)); + _tensor = tensor; + + _kernel.clear_params(); + + // Create static kernel arguments + const unsigned int valid_width = tensor->info()->valid_region().shape[0]; + const unsigned int valid_height = tensor->info()->valid_region().shape[1]; + const unsigned int total_valid_width = border_size.left + valid_width + border_size.right; + + // Set static kernel arguments + unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters + _kernel.set_params(idx++, valid_width); + _kernel.set_params(idx++, valid_height); + _kernel.set_params(idx++, tensor->info()->valid_region().anchor[0]); + _kernel.set_params(idx++, tensor->info()->valid_region().anchor[1]); + + 
if(BorderMode::CONSTANT == border_mode) + { + set_constant_border(idx++, constant_border_value); + } + + // Configure kernel window + Window win; + win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + win.use_tensor_dimensions(tensor->info()->tensor_shape(), Window::DimZ); + + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCFillBorderKernel::run(const Window &window) +{ + // Border mode undefined or border width == 0 + if(_kernel.get_program() == 0) + { + return; + } + + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window); + + _kernel.use(); + Window slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _tensor, 1, slice); + + _kernel.update_shader_params(); + + enqueue(*this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp new file mode 100644 index 0000000000..5e3788af99 --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +GCGEMMInterleave4x4Kernel::GCGEMMInterleave4x4Kernel() + : _input(nullptr), _output(nullptr) +{ +} + +void GCGEMMInterleave4x4Kernel::configure(const IGCTensor *input, IGCTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + + TensorShape output_shape = input->info()->tensor_shape(); + output_shape.set(0, input->info()->dimension(0) * 4); + output_shape.set(1, std::ceil(input->info()->dimension(1) / 4.0f)); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + + std::set build_opts; + std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + build_opts.emplace(("#define " + dt_name)); + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + + // Create kernel + build_opts.emplace("#define GEMM_INTERLEAVE4x4"); + _kernel = static_cast(GCKernelLibrary::get().create_kernel("gemm_interleave4x4", build_opts)); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input->info()->data_type()); + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + const unsigned int num_elems_written_per_iteration = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + _kernel.clear_params(); + + // set shader params binding point + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCGEMMInterleave4x4Kernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window); + + _kernel.use(); + + /* + * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) + * |a00 a01 a02 a03| + * |a10 a11 a12 a13| + * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | + * |a30 a31 a32 a33| + * + * After this operation, the 
output matrix will have the following shape: [ height * 4, width / 4 ] + */ + Window in_slice = window.first_slice_window_2D(); + Window out_slice = window.first_slice_window_2D(); + + // Change x and y steps for the slide of output tensor + out_slice.scale(Window::DimX, 4.f); + out_slice.scale(Window::DimY, 0.25f); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, 1, in_slice); + add_2D_tensor_argument(idx, _output, 2, out_slice); + + _kernel.update_shader_params(); + + enqueue(*this, in_slice); + } + while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp new file mode 100644 index 0000000000..434070a46c --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +GCGEMMMatrixAccumulateBiasesKernel::GCGEMMMatrixAccumulateBiasesKernel() + : _accum(nullptr), _biases(nullptr) +{ +} + +void GCGEMMMatrixAccumulateBiasesKernel::configure(IGCTensor *accum, const IGCTensor *biases) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); + ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1); + + _biases = biases; + _accum = accum; + + std::set build_opts; + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + + // Create kernel + build_opts.emplace("#define GEMM_ACCUMULATE_BIASES"); + std::string dt_name = (accum->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + build_opts.emplace(("#define " + dt_name)); + _kernel = GCKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts); + + // Configure kernel window + unsigned int num_elems_processed_per_iteration = 1; + + if(_accum->info()->data_type() == DataType::F32) + { + num_elems_processed_per_iteration = 16; + } + else if(_accum->info()->data_type() == DataType::F16) + { + num_elems_processed_per_iteration = 4; + } + + Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration), biases->info()->dimension(1)); + AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, biases_access, accum_access); + + _kernel.clear_params(); + // set shader params binding point + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCGEMMMatrixAccumulateBiasesKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window); + + _kernel.use(); + + Window accum_slice = window.first_slice_window_2D(); + + Window biases_slice(accum_slice); + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + + // Run kernel + do + { + // Set arguments + unsigned int idx = 0; + if(_accum->info()->data_type() == DataType::F32) + { + add_2D_tensor_argument(idx, _accum, 1, accum_slice); + add_1D_tensor_argument(idx, _biases, 2, biases_slice); + } + else if(_accum->info()->data_type() == DataType::F16) + { + add_2D_tensor_argument(idx, _accum, BufferParam(1, 3), accum_slice); + add_1D_tensor_argument(idx, _biases, BufferParam(2, 3), biases_slice); + } + + _kernel.update_shader_params(); + + enqueue(*this, accum_slice); + } + while(window.slide_window_slice_2D(accum_slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp 
b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp new file mode 100644 index 0000000000..fa0415249a --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +GCGEMMMatrixAdditionKernel::GCGEMMMatrixAdditionKernel() + : _input(nullptr), _output(nullptr) +{ +} + +void GCGEMMMatrixAdditionKernel::configure(const IGCTensor *input, IGCTensor *output, float beta) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1)); + + _input = input; + _output = output; + const unsigned int num_elems_processed_per_iteration = max_gc_vector_width / data_size_from_type(input->info()->data_type()); + + std::set build_opts; + std::string dt_name = (input->info()->data_type() == DataType::F32) ? 
"DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + build_opts.emplace(("#define " + dt_name)); + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + build_opts.emplace("#define BETA " + float_to_string_with_full_precision(beta)); + + // Create kernel + build_opts.emplace("#define GEMM_MATRIXADDITION"); + std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type())); + _kernel = GCKernelLibrary::get().create_kernel(("gemm_ma"), build_opts); + + // Configure kernel window + Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + _kernel.clear_params(); + // set shader params binding point + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCGEMMMatrixAdditionKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window); + + _kernel.use(); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, 1, slice); + add_2D_tensor_argument(idx, _output, 2, slice); + + _kernel.update_shader_params(); + + enqueue(*this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..ea9b3874b2 --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include + +using namespace arm_compute; + +GCGEMMMatrixMultiplyKernel::GCGEMMMatrixMultiplyKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr) +{ +} + +void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); + + if(!is_interleaved_transposed) + { + ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); + } + + _input0 = input0; + _input1 = input1; + _output = output; + + std::set build_opts; + Window win; + + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + build_opts.emplace("#define COLS_A " + support::cpp11::to_string(input0->info()->dimension(0))); + build_opts.emplace("#define COLS_B " + support::cpp11::to_string(input1->info()->dimension(0))); + build_opts.emplace("#define ALPHA " + float_to_string_with_full_precision(alpha)); + + // Check if the output tensor is a vector. 
If so,the kernel runs the vector-matrix multiplication + if(is_interleaved_transposed) + { + switch(input0->info()->data_type()) + { + case DataType::F16: + build_opts.emplace("#define DATA_TYPE_FP16"); + break; + + case DataType::F32: + build_opts.emplace("#define DATA_TYPE_FP32"); + break; + + default: + ARM_COMPUTE_ERROR("Current data type is not supported"); + break; + } + + build_opts.emplace("#define GEMM_MM_INTERLEAVED_TRANSPOSED"); + + // Create kernel + _kernel = GCKernelLibrary::get().create_kernel(("gemm_mm_interleaved_transposed"), build_opts); + + // Configure window kernel + const unsigned int num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input0->info()->data_type()); + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + + win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f); + AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + update_window_and_padding(win, input0_access, input1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + } + else + { + ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); + + // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor + unsigned int num_elems_processed_per_iteration_x; + unsigned int num_elems_processed_per_iteration_y; + + switch(input0->info()->data_type()) + { + case DataType::F16: + num_elems_processed_per_iteration_x = 4; + num_elems_processed_per_iteration_y = 1; + build_opts.emplace("#define DATA_TYPE_FP16"); + break; + + case DataType::F32: + num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input0->info()->data_type()); + num_elems_processed_per_iteration_y = std::min(static_cast(output->info()->dimension(1)), 4); + build_opts.emplace("#define DATA_TYPE_FP32"); + break; + + default: + ARM_COMPUTE_ERROR("Current data type is not supported"); + break; + } + + build_opts.emplace("#define GEMM_MM_FLOATING_POINT"); + build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elems_processed_per_iteration_x)); + build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elems_processed_per_iteration_y)); + + // Create kernel + _kernel = GCKernelLibrary::get().create_kernel("gemm_mm_floating_point", build_opts); + + win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), num_elems_processed_per_iteration_x), ceil_to_multiple(input0->info()->dimension(1), + num_elems_processed_per_iteration_y)); + AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1)); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + update_window_and_padding(win, input0_access, input1_access, output_access); + + Coordinates coord; + 
coord.set_num_dimensions(output->info()->num_dimensions()); + output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape())); + } + + _kernel.clear_params(); + _kernel.set_shader_params_binding_point(0); + IGCKernel::configure(win); +} + +void GCGEMMMatrixMultiplyKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window); + + _kernel.use(); + + Window slice = window.first_slice_window_2D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(_input1->info()->num_dimensions() < 3) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + switch(_input0->info()->data_type()) + { + case DataType::F16: + add_2D_tensor_argument(idx, _input0, BufferParam(1, 2), slice); + add_2D_tensor_argument(idx, _input1, BufferParam(2, 3), slice_b); + add_2D_tensor_argument(idx, _output, BufferParam(3, 3), slice); + break; + + case DataType::F32: + add_2D_tensor_argument(idx, _input0, BufferParam(1, 2), slice); + add_2D_tensor_argument(idx, _input1, BufferParam(2, 2), slice_b); + add_2D_tensor_argument(idx, _output, BufferParam(3, 2), slice); + break; + + default: + ARM_COMPUTE_ERROR("Current data type is not supported"); + break; + } + + _kernel.update_shader_params(); + enqueue(*this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp new file mode 100644 index 0000000000..a1270b4c3d --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h" + +#include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include + +using namespace arm_compute; + +void GCGEMMTranspose1xWKernel::configure(const IGCTensor *input, IGCTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + + TensorShape output_shape{ input->info()->tensor_shape() }; + const size_t transpose_w = 16 / input->info()->element_size(); + output_shape.set(0, input->info()->dimension(1) * transpose_w); + output_shape.set(1, static_cast(std::ceil((input->info()->dimension(0) / static_cast(transpose_w))))); + + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); + + const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + const int scale_x = num_elems_processed_per_iteration; + + _input = input; + _output = output; + + std::set build_opts; + std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + build_opts.emplace(("#define " + dt_name)); + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + /* + * Following an example of how the transposition1xW works when the input data type is F32 + * + * |a00 a01 a02 a03| + * |a10 a11 a12 a13| + * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 | + * |a30 a31 a32 a33| + * + * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) + */ + // Create kernel + build_opts.emplace("#define GEMM_TRANSPOSE1xW"); + _kernel = GCKernelLibrary::get().create_kernel("gemm_transpose1x4", build_opts); + + // Configure window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension"); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape())); + + _kernel.clear_params(); + // set shader params binding point + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCGEMMTranspose1xWKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), 
window); + + _kernel.use(); + + // Output is transposed + Window out_window(window); + out_window.set(Window::DimX, window.y()); + out_window.set(Window::DimY, window.x()); + + Window in_slice = window.first_slice_window_2D(); + Window out_slice = out_window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, _input, 1, in_slice); + add_2D_tensor_argument(idx, _output, 2, out_slice); + + _kernel.update_shader_params(); + + enqueue(*this, in_slice); + } + while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp new file mode 100644 index 0000000000..935d8420ff --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "support/ToolchainSupport.h" + +#include +#include + +using namespace arm_compute; + +GCIm2ColKernel::GCIm2ColKernel() + : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _run_func(nullptr) +{ +} + +void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, std::pair kernel_dims, const PadStrideInfo &conv_info, bool has_bias) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_UNUSED(kernel_dims); + + _input = input; + _output = output; + _kernel.clear_params(); + + std::set build_opts; + std::string dt_name = (input->info()->data_type() == DataType::F32) ? 
"DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + build_opts.insert("#define " + dt_name); + + if(has_bias) + { + build_opts.emplace("#define HAS_BIAS"); + } + + int pad_x = 0; + int pad_y = 0; + int stride_x = 0; + int stride_y = 0; + std::tie(pad_x, pad_y) = conv_info.pad(); + std::tie(stride_x, stride_y) = conv_info.stride(); + + const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4) + && (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)) + && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0)); + + if(!run_img2col_reduced) + { + // this path is currently not used and not validated + build_opts.insert("#define IM2COL_GENERIC"); + _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), + kernel_dims.first, kernel_dims.second, + conv_info); + _num_elems_processed_per_iteration = output->info()->dimension(0); + + build_opts.emplace("#define KERNEL_WIDTH " + support::cpp11::to_string(kernel_dims.first)); + build_opts.emplace("#define KERNEL_HEIGHT " + support::cpp11::to_string(kernel_dims.second)); + build_opts.emplace("#define KERNEL_DEPTH " + support::cpp11::to_string(input->info()->dimension(2))); + build_opts.emplace("#define CONVOLVED_WIDTH " + support::cpp11::to_string(_convolved_dims.first)); + build_opts.emplace("#define CONVOLVED_HEIGHT " + support::cpp11::to_string(_convolved_dims.second)); + build_opts.emplace("#define STRIDE_X " + support::cpp11::to_string(conv_info.stride().first)); + build_opts.emplace("#define STRIDE_Y " + support::cpp11::to_string(conv_info.stride().second)); + build_opts.emplace("#define PAD_X " + support::cpp11::to_string(conv_info.pad().first)); + build_opts.emplace("#define PAD_Y " + support::cpp11::to_string(conv_info.pad().second)); + build_opts.emplace("#define SRC_WIDTH " + support::cpp11::to_string(input->info()->dimension(0))); + build_opts.emplace("#define SRC_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1))); + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel("im2col_generic", build_opts)); + + _run_func = &GCIm2ColKernel::run_generic; + } + else + { + build_opts.insert("#define IM2COL_REDUCED"); + _num_elems_processed_per_iteration = 4 / input->info()->element_size(); + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel("im2col_reduced", build_opts)); + + _run_func = &GCIm2ColKernel::run_reduced; + } + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration)); + + if(input->info()->data_type() == DataType::F16) + { + // Calculate input right and bottom border + AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration); + + // Calculate output right and bottom border + const int output_width = output->info()->dimension(0); + const int output_height = output->info()->dimension(1); + const int output_padding_right = ceil_to_multiple(output_width, _num_elems_processed_per_iteration) - output_width; + AccessWindowStatic output_access(output->info(), 0, 0, output_width + 
output_padding_right, output_height); + + update_window_and_padding(win, input_access, output_access); + } + + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + if(!run_img2col_reduced) + { + // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension + win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start()); + } + + // set shader params binding point + _kernel.set_shader_params_binding_point(0); + IGCKernel::configure(win); +} + +void GCIm2ColKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON(_run_func == nullptr); + (this->*_run_func)(window); +} + +void GCIm2ColKernel::run_generic(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window); + + // Get initial windows + Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ); + // Change the Z dimension's step back to 1 + window_collapsed.set_dimension_step(Window::DimZ, 1); + + Window slice = window_collapsed.first_slice_window_3D(); + Window slice_in = window_collapsed.first_slice_window_3D(); + Window slice_out = window_collapsed.first_slice_window_3D(); + + // Setup slice + slice.set(Window::DimX, Window::Dimension(0, static_cast(_convolved_dims.first), 1)); + slice.set(Window::DimY, Window::Dimension(0, static_cast(_convolved_dims.second), 1)); + + // Setup input slice + // The first three dimensions of the input are increased by the inner loops + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Setup output slice + slice_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _num_elems_processed_per_iteration)); + slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1)); + slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + _kernel.use(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, 1, slice_in); + add_2D_tensor_argument(idx, _output, 2, slice_out); + + _kernel.set_params(idx++, static_cast(_input->info()->dimension(2))); + _kernel.set_params(idx++, static_cast(_input->info()->strides_in_bytes()[3])); + _kernel.set_params(idx++, static_cast(_output->info()->strides_in_bytes()[3])); + _kernel.update_shader_params(); + + enqueue(*this, slice); + } + while(window_collapsed.slide_window_slice_3D(slice) && window_collapsed.slide_window_slice_3D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in)); +} + +void GCIm2ColKernel::run_reduced(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IGCKernel::window(), window); + + Window out_window; + out_window.use_tensor_dimensions(_output->info()->tensor_shape()); + + Window out_slice = out_window.first_slice_window_1D(); + Window in_slice = window.first_slice_window_3D(); + + _kernel.use(); + + // Run kernel + do + { + // Set arguments + unsigned int idx = 0; + + add_3D_tensor_argument(idx, _input, 1, in_slice); + add_1D_tensor_argument(idx, _output, 2, out_slice); + _kernel.set_params(idx++, _input->info()->dimension(0)); + _kernel.set_params(idx++, _input->info()->dimension(1)); + _kernel.update_shader_params(); + + enqueue(*this, in_slice); + } + while(window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice)); +} diff 
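For reference, a minimal usage sketch of the reduced im2col path (the only path validated here, per the comment above); the helper function, parameter values and set-up assumptions below are illustrative and not code from the patch:

#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"

#include <utility>

// Flatten a W x H x C input into a single row, as the fully connected layer needs.
// Assumes: GCScheduler::get().default_init() has been called so the kernel library
// can compile shaders, and output->info() already describes the flattened shape
// (W * H * C elements), so configure() selects the IM2COL_REDUCED path
// (unit strides, zero padding, no bias). kernel_dims is ignored on this path,
// so the (1, 1) below is a placeholder.
void flatten_input(const arm_compute::IGCTensor *input, arm_compute::IGCTensor *output)
{
    arm_compute::GCIm2ColKernel im2col;
    im2col.configure(input, output, std::make_pair(1U, 1U), arm_compute::PadStrideInfo(1, 1, 0, 0), /* has_bias = */ false);
    arm_compute::GCScheduler::get().enqueue(im2col, true);
}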
--git a/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp new file mode 100644 index 0000000000..65e54f538c --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h" + +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include + +using namespace arm_compute; + +GCNormalizationLayerKernel::GCNormalizationLayerKernel() + : _input(nullptr), _squared_input(nullptr), _output(nullptr), _border_size(0) +{ +} + +BorderSize GCNormalizationLayerKernel::border_size() const +{ + return _border_size; +} + +void GCNormalizationLayerKernel::configure(const IGCTensor *input, const IGCTensor *squared_input, IGCTensor *output, NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); + ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented"); + + // Set build options + std::set build_opts; + + _input = input; + _squared_input = squared_input; + _output = output; + + const bool is_in_map = (norm_info.type() == NormType::IN_MAP_1D); + const unsigned int border_width = is_in_map ? std::min(norm_info.norm_size() / 2, 3U) : 0; + _border_size = BorderSize(0, border_width); + + // Set kernel static arguments + std::string func_name = ((norm_info.type() == NormType::IN_MAP_1D) ? 
"IN_MAP_1D" : "CROSS_MAP"); + build_opts.emplace(("#define " + func_name)); + build_opts.emplace(("#define COEFF " + float_to_string_with_full_precision(norm_info.scale_coeff()))); + build_opts.emplace(("#define BETA " + float_to_string_with_full_precision(norm_info.beta()))); + build_opts.emplace(("#define KAPPA " + float_to_string_with_full_precision(norm_info.kappa()))); + build_opts.emplace(("#define RADIUS " + support::cpp11::to_string(norm_info.norm_size() / 2))); + build_opts.emplace(("#define LOCAL_SIZE_X " + support::cpp11::to_string(1))); + build_opts.emplace(("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1))); + build_opts.emplace(("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1))); + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel("normalization_layer", build_opts)); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = 1; + const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2); + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), -_border_size.left, num_elems_read_per_iteration); + AccessWindowHorizontal squared_input_access(squared_input->info(), -_border_size.left, num_elems_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, squared_input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + _kernel.clear_params(); + + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCNormalizationLayerKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + _kernel.use(); + + Window slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + unsigned int binding = 1; + add_3D_tensor_argument(idx, _input, binding++, slice); + add_3D_tensor_argument(idx, _squared_input, binding++, slice); + add_3D_tensor_argument(idx, _output, binding++, slice); + + _kernel.update_shader_params(); + + enqueue(*this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp new file mode 100644 index 0000000000..2b5cee455c --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include +#include +using namespace arm_compute; + +GCPixelWiseMultiplicationKernel::GCPixelWiseMultiplicationKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void GCPixelWiseMultiplicationKernel::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative. "); + + // Auto initialize output if not initialized + { + set_shape_if_empty(*output->info(), input1->info()->tensor_shape()); + set_format_if_unknown(*output->info(), Format::F32); + } + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output); + ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative. 
"); + + _input1 = input1; + _input2 = input2; + _output = output; + + std::string data_type; + std::string compute_type; + + // Set kernel build options + std::set build_opts; + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + + build_opts.emplace("#define SCALE " + support::cpp11::to_string(scale)); + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel("pixelwise_mul_float", build_opts)); + + _kernel.clear_params(); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 1; + + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input1_access, input2_access, output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + output_access.set_valid_region(win, valid_region); + + // set shader params binding point + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCPixelWiseMultiplicationKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window); + + _kernel.use(); + + Window slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + unsigned int binding = 1; + add_3D_tensor_argument(idx, _input1, binding++, slice); + add_3D_tensor_argument(idx, _input2, binding++, slice); + add_3D_tensor_argument(idx, _output, binding++, slice); + + _kernel.update_shader_params(); + enqueue(*this, slice); + } + while(window.slide_window_slice_3D(slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp new file mode 100644 index 0000000000..c877da3783 --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include + +using namespace arm_compute; + +GCPoolingLayerKernel::GCPoolingLayerKernel() + : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1) +{ +} + +BorderSize GCPoolingLayerKernel::border_size() const +{ + return _border_size; +} + +void GCPoolingLayerKernel::configure(const IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info) +{ + int pool_pad_x = 0; + int pool_pad_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; + unsigned int pooled_w = 0; + unsigned int pooled_h = 0; + const PoolingType pool_type = pool_info.pool_type(); + const int pool_size = pool_info.pool_size(); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); + std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad(); + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size); + ARM_COMPUTE_ERROR_ON(pool_size > 7 && is_data_type_fixed_point(input->info()->data_type())); + + // Check output dimensions + std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), + input->info()->dimension(1), + pool_size, + pool_size, + pool_info.pad_stride_info()); + + // Output auto initialization if not yet initialized + { + TensorShape output_shape{ input->info()->tensor_shape() }; + output_shape.set(0, pooled_w); + output_shape.set(1, pooled_h); + + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + } + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h)); + + const int input_width = input->info()->dimension(0); + const int input_height = input->info()->dimension(1); + + // Set instance variables + _input = input; + _output = output; + _pool_info = pool_info; + _border_size = BorderSize(pool_pad_y, pool_pad_x); + + // Set build options + std::set build_opts; + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + if(input->info()->data_type() == DataType::F32) + { + build_opts.insert("#define DATA_TYPE_FP32"); + } + else + { + build_opts.insert("#define DATA_TYPE_FP16"); + } + build_opts.emplace(("#define POOL_" + string_from_pooling_type(pool_type))); + build_opts.emplace(("#define STRIDE_X " + support::cpp11::to_string(pool_stride_x))); + build_opts.emplace(("#define MAX_WIDTH " + support::cpp11::to_string(input->info()->dimension(0) + pool_pad_x))); + build_opts.emplace(("#define MAX_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1) + pool_pad_y))); + 
build_opts.emplace(("#define STRIDE_Y " + support::cpp11::to_string(pool_stride_y))); + build_opts.emplace(("#define PAD_X " + support::cpp11::to_string(pool_pad_x))); + build_opts.emplace(("#define PAD_Y " + support::cpp11::to_string(pool_pad_y))); + + // Create kernel + if((pool_size == 2) || (pool_size == 3) || (pool_size == 7)) + { + // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenGLES kernel where + // each thread computes 4 output elements + const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(input->info()->data_type()); + + int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size; + + if(input->info()->data_type() == DataType::F32) + { + if(is_pool3x3_stride_le3) + { + // Change the number of elements processed and number of elements read per iteration for pooling 3x3 with stride less equal than 3 + _num_elems_processed_per_iteration = 4; + num_elements_read_per_iteration = pool_size * (pool_stride_x + 1); + } + } + else + { + num_elements_read_per_iteration = pool_size; + if(is_pool3x3_stride_le3) + { + _num_elems_processed_per_iteration = 4; + } + else + { + _num_elems_processed_per_iteration = 2; + } + } + + const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; + + _border_size.right = std::max(upper_bound_w, pool_pad_x); + _border_size.bottom = std::max(upper_bound_h, pool_pad_y); + + std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size); + if(is_pool3x3_stride_le3) + { + build_opts.insert("#define POOLING_LAYER_3_OPTIMIZED"); + _kernel = static_cast(GCKernelLibrary::get().create_kernel(kernel_name + "_optimized", build_opts)); + } + else + { + build_opts.insert("#define POOLING_LAYER_" + support::cpp11::to_string(pool_size)); + _kernel = static_cast(GCKernelLibrary::get().create_kernel(kernel_name, build_opts)); + } + } + else // Run general case + { + if(input->info()->data_type() == DataType::F32) + { + _num_elems_processed_per_iteration = 1; + } + else + { + _num_elems_processed_per_iteration = 2; + } + const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; + + _border_size.right = std::max(upper_bound_w, pool_pad_x); + _border_size.bottom = std::max(upper_bound_h, pool_pad_y); + + build_opts.emplace(("#define POOL_SIZE " + support::cpp11::to_string(pool_size))); + + build_opts.insert("#define POOLING_LAYER_N"); + _kernel = static_cast(GCKernelLibrary::get().create_kernel("pooling_layer_n", build_opts)); + } + + Window win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration)); + + if(input->info()->data_type() == DataType::F32) + { + AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom); + AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + } + else + { + // Calculate output right and bottom border + const int output_width = output->info()->dimension(0); + const int output_height = 
output->info()->dimension(1); + const int output_padding_right = ceil_to_multiple(output_width, _num_elems_processed_per_iteration) - output_width; + const int output_padding_bottom = ceil_to_multiple(output_height, 1) - output_height; + const int input_padding_right = ceil_to_multiple(input_width + 2 * _border_size.right, _num_elems_processed_per_iteration) - (input_width + 2 * _border_size.right); + const int input_padding_bottom = ceil_to_multiple(input_height + 2 * _border_size.bottom, 1) - (input_height + 2 * _border_size.bottom); + + // Configure kernel window + AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right + input_padding_right, input_height + _border_size.bottom + input_padding_bottom); + AccessWindowStatic output_access(output->info(), 0, 0, output_width + output_padding_right, output_height + output_padding_bottom); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + } + + _kernel.clear_params(); + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCPoolingLayerKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + unsigned int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0; + std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad(); + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + + _kernel.use(); + + Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ); + Window slice = window_collapsed.first_slice_window_3D(); + + do + { + // Upsample input by pool size + Window in_slice(slice); + in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x * _num_elems_processed_per_iteration)); + in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y)); + + // Set inputs + unsigned int idx = 0; + add_3D_tensor_argument(idx, _input, 1, in_slice); + add_3D_tensor_argument(idx, _output, 2, slice); + + _kernel.update_shader_params(); + enqueue(*this, slice); + } + while(window_collapsed.slide_window_slice_3D(slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp new file mode 100644 index 0000000000..09a0f79ab2 --- /dev/null +++ b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include + +using namespace arm_compute; + +void GCLogits1DMaxKernel::configure(const IGCTensor *input, IGCTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + + // Softmax across the x dimension + TensorShape output_shape{ input->info()->tensor_shape() }; + output_shape.set(0, 1); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); + + _input = input; + _output = output; + + // Set build options + std::set build_opts; + std::string dt_name = (input->info()->data_type() == DataType::F32) ? 
"DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + build_opts.insert("#define " + dt_name); + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + build_opts.insert("#define SOFTMAX_LAYER_MAX"); + + // Tell the kernel that the width is not a multiple of 4 + if((input->info()->dimension(0) % 4) != 0) + { + build_opts.insert("#define NON_MULTIPLE_OF_4"); + } + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel("softmax_layer_max", build_opts)); + + _kernel.clear_params(); + + // Set fixed arguments + unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the input and output parameters + _kernel.set_params(idx++, input->info()->dimension(0)); + + // Configure kernel window + // The kernel loops over all elements in steps of 4 + const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 4); + unsigned int num_elems_written_per_iteration = 1; + if(input->info()->data_type() == DataType::F16) + { + num_elems_written_per_iteration = 2; + } + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + // set shader params binding point + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCLogits1DMaxKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window slice = window.first_slice_window_3D(); + + _kernel.use(); + + do + { + unsigned int idx1 = 0; + switch(_input->info()->data_type()) + { + case DataType::F16: + add_3D_tensor_argument(idx1, _input, BufferParam(1, 2), slice); + add_3D_tensor_argument(idx1, _output, BufferParam(2, 2), slice); + break; + + case DataType::F32: + add_3D_tensor_argument(idx1, _input, BufferParam(1, 2), slice); + add_3D_tensor_argument(idx1, _output, BufferParam(2, 2), slice); + break; + + default: + ARM_COMPUTE_ERROR("Current data type is mot supported"); + break; + } + + _kernel.update_shader_params(); + enqueue(*this, slice); + } + while(window.slide_window_slice_3D(slice)); +} + +GCLogits1DShiftExpSumKernel::GCLogits1DShiftExpSumKernel() + : _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr) +{ +} + +void GCLogits1DShiftExpSumKernel::configure(const IGCTensor *input, const IGCTensor *max, IGCTensor *output, IGCTensor *sum) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*sum->info(), max->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum); + + _input = input; + _max = max; + _output = output; + _sum = sum; 
+ + // Set build options + std::set build_opts; + std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + build_opts.insert("#define " + dt_name); + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + build_opts.insert("#define SOFTMAX_LAYER_SHIFT_EXP_SUM"); + + // Tell the kernel that the width is not a multiple of 4 + if((input->info()->dimension(0) % 4) != 0) + { + build_opts.insert("#define NON_MULTIPLE_OF_4"); + } + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts)); + + _kernel.clear_params(); + + // Set fixed arguments + unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters + _kernel.set_params(idx++, input->info()->dimension(0)); + + // Configure window + // The kernel loops over all elements in steps of 4 + const unsigned int num_elems_processed_per_iteration = ceil_to_multiple(input->info()->dimension(0), 4); + unsigned int num_elems_written_per_iteration = 1; + if(input->info()->data_type() == DataType::F16) + { + num_elems_written_per_iteration = 2; + } + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal max_access(max->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal sum_access(sum->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, input_access, max_access, output_access, sum_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape())); + + // set shader params binding point + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCLogits1DShiftExpSumKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ); + Window slice = window_collapsed.first_slice_window_3D(); + + _kernel.use(); + + do + { + unsigned int idx = 0; + switch(_input->info()->data_type()) + { + case DataType::F16: + add_3D_tensor_argument(idx, _input, BufferParam(1, 2), slice); + add_3D_tensor_argument(idx, _max, BufferParam(2, 2), slice); + add_3D_tensor_argument(idx, _output, BufferParam(3, 2), slice); + add_3D_tensor_argument(idx, _sum, BufferParam(4, 2), slice); + break; + + case DataType::F32: + add_3D_tensor_argument(idx, _input, BufferParam(1, 2), slice); + add_3D_tensor_argument(idx, _max, BufferParam(2, 2), slice); + add_3D_tensor_argument(idx, _output, BufferParam(3, 2), slice); + add_3D_tensor_argument(idx, _sum, BufferParam(4, 2), slice); + break; + + default: + ARM_COMPUTE_ERROR("Current data type is mot supported"); + break; + } + + _kernel.update_shader_params(); + enqueue(*this, slice); + } + while(window_collapsed.slide_window_slice_3D(slice)); +} + +GCLogits1DNormKernel::GCLogits1DNormKernel() + : _input(nullptr), _sum(nullptr), _output(nullptr) +{ +} + +void GCLogits1DNormKernel::configure(const IGCTensor *input, const 
IGCTensor *sum, IGCTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(sum, output); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, sum, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + + _input = input; + _sum = sum; + _output = output; + + // Set build options + std::set build_opts; + std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + build_opts.insert("#define " + dt_name); + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + build_opts.insert("#define SOFTMAX_LAYER_NORM"); + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts)); + + // Configure window + constexpr unsigned int num_elems_processed_per_iteration = 4; + unsigned int num_elems_written_per_iteration = 1; + if(input->info()->data_type() == DataType::F16) + { + num_elems_written_per_iteration = 2; + } + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowStatic sum_access(sum->info(), 0, 0, num_elems_written_per_iteration, sum->info()->dimension(1)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, sum_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + _kernel.clear_params(); + + // set shader params binding point + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCLogits1DNormKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window window_collapsed = window.collapse_if_possible(IGCKernel::window(), Window::DimZ); + Window slice = window_collapsed.first_slice_window_3D(); + + _kernel.use(); + + do + { + Window sum_slice = slice; + sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1)); + + unsigned int idx1 = 0; + switch(_input->info()->data_type()) + { + case DataType::F16: + add_3D_tensor_argument(idx1, _input, BufferParam(1, 2), slice); + add_3D_tensor_argument(idx1, _sum, BufferParam(2, 2), slice); + add_3D_tensor_argument(idx1, _output, BufferParam(3, 2), slice); + break; + + case DataType::F32: + add_3D_tensor_argument(idx1, _input, BufferParam(1, 2), slice); + add_3D_tensor_argument(idx1, _sum, BufferParam(2, 2), slice); + add_3D_tensor_argument(idx1, _output, BufferParam(3, 2), slice); + break; + + default: + ARM_COMPUTE_ERROR("Current data type is mot supported"); + break; + } + + _kernel.update_shader_params(); + enqueue(*this, slice); + } + while(window_collapsed.slide_window_slice_3D(slice)); +} diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp new file mode 100644 index 0000000000..b891b42ef8 --- /dev/null +++ 
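For reference, a minimal sketch of how the three softmax kernels above are intended to be chained (the GCSoftmaxLayer function added elsewhere in this patch presumably wires them up along these lines); the helper function, tensor set-up and explicit barriers below are assumptions of this sketch, not code from the patch:

#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"

// Assumes the caller has created and allocated all five tensors with compatible
// shapes (max and sum have a single element along x, tmp matches input), and that
// the GC kernel library has been initialised via GCScheduler.
void softmax_1d(const arm_compute::IGCTensor *input, arm_compute::IGCTensor *max,
                arm_compute::IGCTensor *tmp, arm_compute::IGCTensor *sum,
                arm_compute::IGCTensor *output)
{
    using arm_compute::GCScheduler;

    arm_compute::GCLogits1DMaxKernel         max_kernel;
    arm_compute::GCLogits1DShiftExpSumKernel shift_exp_sum_kernel;
    arm_compute::GCLogits1DNormKernel        norm_kernel;

    max_kernel.configure(input, max);                     // row-wise maximum of the logits
    shift_exp_sum_kernel.configure(input, max, tmp, sum); // tmp = exp(input - max), sum = row-wise sum of tmp
    norm_kernel.configure(tmp, sum, output);              // output = tmp / sum

    GCScheduler::get().enqueue(max_kernel, false);
    GCScheduler::get().sync();                            // memory barrier between dependent dispatches
    GCScheduler::get().enqueue(shift_exp_sum_kernel, false);
    GCScheduler::get().sync();
    GCScheduler::get().enqueue(norm_kernel, true);
}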
b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h" + +#include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" + +#include +#include + +using namespace arm_compute; + +void GCTransposeKernel::configure(const IGCTensor *input, IGCTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + + TensorShape output_shape{ input->info()->tensor_shape() }; + const size_t w_out = input->info()->dimension(1); + const size_t h_out = input->info()->dimension(0); + output_shape.set(0, w_out); + output_shape.set(1, h_out); + + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + + std::set build_opts; + std::string dt_name = (input->info()->data_type() == DataType::F32) ? 
"DATA_TYPE_FP32" : "DATA_TYPE_FP16"; + build_opts.emplace(("#define " + dt_name)); + build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1)); + build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1)); + + // Create kernel + _kernel = static_cast(GCKernelLibrary::get().create_kernel("transpose", build_opts)); + + _kernel.clear_params(); + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = 4; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration)); + + AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); + AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + // set shader params binding point + _kernel.set_shader_params_binding_point(0); + + IGCKernel::configure(win); +} + +void GCTransposeKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IGCKernel::window(), window); + + _kernel.use(); + + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + if(_input->info()->data_type() == DataType::F32) + { + add_2D_tensor_argument(idx, _input, 1, slice); + add_2D_tensor_argument(idx, _output, 2, slice); + } + else if(_input->info()->data_type() == DataType::F16) + { + add_2D_tensor_argument(idx, _input, BufferParam(1, 3), slice); + add_2D_tensor_argument(idx, _output, BufferParam(2, 3), slice); + } + + _kernel.update_shader_params(); + enqueue(*this, slice); + } + while(window.slide_window_slice_2D(slice)); +} diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp index fc0b6e9361..151d7de9a4 100644 --- a/src/core/Helpers.cpp +++ b/src/core/Helpers.cpp @@ -106,6 +106,13 @@ Window arm_compute::calculate_max_enlarged_window(const ITensorInfo &info, const ++n; } + if(tensor_shape.num_dimensions() > 2) + { + window.set(2, Window::Dimension(0, std::max(1, tensor_shape[n]), steps[2])); + + ++n; + } + for(; n < Coordinates::num_max_dimensions; ++n) { window.set(n, Window::Dimension(0, std::max(1, tensor_shape[n]))); diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index bd6911fd2b..af864f57f7 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -353,6 +353,7 @@ void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); break; case DataType::F16: + print_consecutive_elements_impl(s, reinterpret_cast(ptr), n, stream_width, element_delim); break; default: ARM_COMPUTE_ERROR("Undefined element size for given data type"); @@ -380,7 +381,7 @@ int arm_compute::max_consecutive_elements_display_width(std::ostream &s, DataTyp case DataType::F32: return max_consecutive_elements_display_width_impl(s, reinterpret_cast(ptr), n); case DataType::F16: - return 0; + return max_consecutive_elements_display_width_impl(s, reinterpret_cast(ptr), n); default: ARM_COMPUTE_ERROR("Undefined element size for given data type"); } diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp index f4bd49406c..648ce6b3a6 100644 --- 
a/src/runtime/CL/functions/CLNormalizationLayer.cpp +++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp @@ -37,7 +37,7 @@ CLNormalizationLayer::CLNormalizationLayer() { } -void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info) +void CLNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_ERROR_ON(input == nullptr); diff --git a/src/runtime/GLES_COMPUTE/GCScheduler.cpp b/src/runtime/GLES_COMPUTE/GCScheduler.cpp new file mode 100644 index 0000000000..b2235ea6f9 --- /dev/null +++ b/src/runtime/GLES_COMPUTE/GCScheduler.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" + +#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h" + +using namespace arm_compute; + +GCScheduler::GCScheduler() = default; + +void GCScheduler::default_init() +{ + GCKernelLibrary::get().init("./cs_shaders/"); +} + +void GCScheduler::init(EGLDisplay dpy, EGLContext ctx) +{ + GCKernelLibrary::get().init("./cs_shaders/", dpy, ctx); +} + +GCScheduler &GCScheduler::get() +{ + static GCScheduler scheduler; + return scheduler; +} + +void GCScheduler::enqueue(IGCKernel &kernel, bool flush) +{ + kernel.run(kernel.window()); + if(flush) + { + ARM_COMPUTE_GL_CHECK(glFlush()); + } +} + +void GCScheduler::sync() +{ + ARM_COMPUTE_GL_CHECK(glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT)); +} diff --git a/src/runtime/GLES_COMPUTE/GCTensor.cpp b/src/runtime/GLES_COMPUTE/GCTensor.cpp new file mode 100644 index 0000000000..edbd16dc1d --- /dev/null +++ b/src/runtime/GLES_COMPUTE/GCTensor.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" + +using namespace arm_compute; + +GCTensor::GCTensor() + : _allocator() +{ +} + +ITensorAllocator *GCTensor::allocator() +{ + return &_allocator; +} + +TensorInfo *GCTensor::info() const +{ + return &_allocator.info(); +} + +TensorInfo *GCTensor::info() +{ + return &_allocator.info(); +} + +uint8_t *GCTensor::buffer() const +{ + return _allocator.data(); +} + +GLuint GCTensor::gc_buffer() const +{ + return _allocator.get_gl_ssbo_name(); +} + +void GCTensor::map(bool blocking) +{ + IGCTensor::map(blocking); +} + +void GCTensor::unmap() +{ + IGCTensor::unmap(); +} + +uint8_t *GCTensor::do_map(bool blocking) +{ + return _allocator.map(blocking); +} + +void GCTensor::do_unmap() +{ + _allocator.unmap(); +} \ No newline at end of file diff --git a/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp new file mode 100644 index 0000000000..694b34f1ec --- /dev/null +++ b/src/runtime/GLES_COMPUTE/GCTensorAllocator.cpp @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +GCTensorAllocator::GCTensorAllocator() + : _gl_buffer(), _mapping(nullptr) +{ +} + +uint8_t *GCTensorAllocator::data() +{ + return _mapping; +} + +void GCTensorAllocator::allocate() +{ + _gl_buffer = support::cpp14::make_unique(); + ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name)); + ARM_COMPUTE_GL_CHECK(glBufferData(GL_SHADER_STORAGE_BUFFER, static_cast(info().total_size()), nullptr, GL_STATIC_DRAW)); + ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0)); + info().set_is_resizable(false); +} + +void GCTensorAllocator::free() +{ + _gl_buffer.reset(); + info().set_is_resizable(true); +} + +uint8_t *GCTensorAllocator::lock() +{ + return map(true); +} + +void GCTensorAllocator::unlock() +{ + unmap(); +} + +GLuint GCTensorAllocator::get_gl_ssbo_name() const +{ + return _gl_buffer->_ssbo_name; +} + +uint8_t *GCTensorAllocator::map(bool blocking) +{ + ARM_COMPUTE_ERROR_ON(_mapping != nullptr); + ARM_COMPUTE_UNUSED(blocking); + + ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name)); + void *p = ARM_COMPUTE_GL_CHECK(glMapBufferRange(GL_SHADER_STORAGE_BUFFER, 0, static_cast(info().total_size()), GL_MAP_READ_BIT | GL_MAP_WRITE_BIT)); + _mapping = reinterpret_cast(p); + + return _mapping; +} + +void GCTensorAllocator::unmap() +{ + ARM_COMPUTE_ERROR_ON(_mapping == nullptr); + + ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, _gl_buffer->_ssbo_name)); + ARM_COMPUTE_GL_CHECK(glUnmapBuffer(GL_SHADER_STORAGE_BUFFER)); + ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0)); + _mapping = nullptr; +} \ No newline at end of file diff --git a/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp b/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp new file mode 100644 index 0000000000..19f178f445 --- /dev/null +++ b/src/runtime/GLES_COMPUTE/IGCSimpleFunction.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" + +using namespace arm_compute; + +IGCSimpleFunction::IGCSimpleFunction() //NOLINT + : _kernel(), + _border_handler() +{ +} + +void IGCSimpleFunction::run() +{ + ARM_COMPUTE_ERROR_ON_MSG(!_kernel, "The child class didn't set the GLES kernel or function isn't configured"); + + // FIXME(APPBROWSER-300): We may need to rename "enqueue" to "dispatch" and "sync" to "memory_barrier". + GCScheduler::get().enqueue(_border_handler, false); + GCScheduler::get().sync(); + GCScheduler::get().enqueue(*_kernel); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp b/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp new file mode 100644 index 0000000000..781b357ce7 --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h" + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h" +#include "arm_compute/core/Helpers.h" +#include "support/ToolchainSupport.h" + +#include + +using namespace arm_compute; + +void GCAbsoluteDifference::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input1, input2, output); + _kernel = std::move(k); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp new file mode 100644 index 0000000000..8686416616 --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCActivationLayer.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h" + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h" +#include "arm_compute/core/Helpers.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void GCActivationLayer::configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, output, act_info); + _kernel = std::move(k); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp new file mode 100755 index 0000000000..2e546a663a --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" + +using namespace arm_compute; + +GCBatchNormalizationLayer::GCBatchNormalizationLayer() + : _norm_kernel() +{ +} + +void GCBatchNormalizationLayer::configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma, float epsilon) +{ + _norm_kernel.configure(input, output, mean, var, beta, gamma, epsilon); +} + +void GCBatchNormalizationLayer::run() +{ + GCScheduler::get().enqueue(_norm_kernel, true); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.cpp b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.cpp new file mode 100755 index 0000000000..ed756cf261 --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthConcatenate.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +GCDepthConcatenate::GCDepthConcatenate() //NOLINT + : _concat_kernels_vector(), + _border_handlers_vector(), + _num_inputs(0) +{ +} + +void GCDepthConcatenate::configure(std::vector inputs_vector, IGCTensor *output) //NOLINT +{ + ARM_COMPUTE_ERROR_ON(inputs_vector.size() < 2); + + _num_inputs = inputs_vector.size(); + + unsigned int depth_offset = 0; + + _concat_kernels_vector = arm_compute::support::cpp14::make_unique(_num_inputs); + _border_handlers_vector = arm_compute::support::cpp14::make_unique(_num_inputs); + + for(unsigned int i = 0; i < _num_inputs; i++) + { + _concat_kernels_vector[i].configure(inputs_vector.at(i), depth_offset, output); + _border_handlers_vector[i].configure(inputs_vector.at(i), _concat_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0)); + + depth_offset += inputs_vector.at(i)->info()->dimension(2); + } +} + +void GCDepthConcatenate::run() +{ + for(unsigned i = 0; i < _num_inputs; i++) + { + GCScheduler::get().enqueue(_border_handlers_vector[i], false); + GCScheduler::get().enqueue(_concat_kernels_vector[i], true); + } +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp new file mode 100644 index 0000000000..ae9dd51b8e --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h" + +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void GCDirectConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info) +{ + int kernel_size = weights->info()->dimension(0); + + if(kernel_size == 1) + { + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, weights, biases, output, conv_info); + _kernel = std::move(k); + } + else if(kernel_size == 3) + { + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, weights, biases, output, conv_info); + _kernel = std::move(k); + } + else if(kernel_size == 5) + { + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, weights, biases, output, conv_info); + _kernel = std::move(k); + } + else + { + ARM_COMPUTE_ERROR("kernel size unsupported!"); + return; + } + + _border_handler.configure(input, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0)); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp new file mode 100644 index 0000000000..032c2fdb1e --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCDropoutLayer.cpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h" + +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h" + +using namespace arm_compute; + +GCDropoutLayer::GCDropoutLayer() + : _dropout_kernel() +{ +} + +void GCDropoutLayer::configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, mask, output); + + // Configure kernel + _dropout_kernel.configure(input, mask, output, ratio, forward); +} + +void GCDropoutLayer::run() +{ + GCScheduler::get().enqueue(_dropout_kernel); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp b/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp new file mode 100644 index 0000000000..5c2431fa13 --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCFillBorder.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h" + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h" +#include "arm_compute/core/Helpers.h" +#include "support/ToolchainSupport.h" + +#include + +using namespace arm_compute; + +void GCFillBorder::configure(IGCTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(tensor, BorderSize(border_width), border_mode, constant_border_value); + _kernel = std::move(k); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp new file mode 100644 index 0000000000..63cb40e616 --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h" + +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" +#include "support/ToolchainSupport.h" + +#include + +using namespace arm_compute; + +void GCFullyConnectedLayerReshapeWeights::configure(const IGCTensor *input, IGCTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} + +GCFullyConnectedLayer::GCFullyConnectedLayer() + : _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(), _are_weights_reshaped(true), _is_fc_after_conv(true), + _accumulate_biases(false) +{ +} + +void GCFullyConnectedLayer::configure_conv_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output) +{ + ARM_COMPUTE_ERROR_ON((weights->info()->dimension(1) != (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)))); + + const DataType dt = input->info()->data_type(); + + // If the fully connected layer is called after a convolution layer, the input tensor must be linearized + + // Initialize output tensor for im2col + TensorShape shape_im2col; + shape_im2col.set(0, input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2)); + shape_im2col.set(1, input->info()->dimension(3)); + shape_im2col.set(2, input->info()->dimension(4)); + shape_im2col.set(3, input->info()->dimension(5)); + _im2col_output.allocator()->init(TensorInfo(shape_im2col, 1, dt)); + + // Configure im2col kernel + _im2col_kernel.configure(input, &_im2col_output, std::make_pair(1, 1), PadStrideInfo(1, 1, 0, 0), false); + + // Configure matrix multiply kernel + _mm_kernel.configure(&_im2col_output, weights, output, 1.0f, false); + + // Allocate the output tensor for im2col once all the configure methods have been called + _im2col_output.allocator()->allocate(); +} + +void GCFullyConnectedLayer::configure_fc_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output) +{ + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != weights->info()->dimension(1)); + + // Configure matrix multiply kernel + _mm_kernel.configure(input, weights, output, 1.0f, false); +} + +void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor 
*output, bool transpose_weights, bool are_weights_reshaped) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output); + ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2); + + _are_weights_reshaped = transpose_weights ? are_weights_reshaped : true; + _is_fc_after_conv = true; + _accumulate_biases = false; + + if(biases != nullptr) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + + _accumulate_biases = true; + + // Configure accumulate biases kernel + _accumulate_biases_kernel.configure(output, biases); + } + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const IGCTensor *weights_to_use = weights; + + if(!_are_weights_reshaped) + { + weights_to_use = &_reshape_weights_output; + + // Reshape the weights + _reshape_weights_kernel.configure(weights, &_reshape_weights_output); + } + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = output->info()->dimension(1) > 1; + + if(is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = input->info()->num_dimensions() > 1; + } + + if(_is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + configure_conv_fc(input, weights_to_use, output); + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + configure_fc_fc(input, weights_to_use, output); + } + + // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called + if(!_are_weights_reshaped) + { + // Allocate the tensor for the weights reshaped + _reshape_weights_output.allocator()->allocate(); + } +} + +void GCFullyConnectedLayer::run() +{ + // Reshape of the weights (happens only once) + if(!_are_weights_reshaped) + { + _are_weights_reshaped = true; + _reshape_weights_kernel.run(); + } + + // Linearize input if it comes from a convolutional layer + if(_is_fc_after_conv) + { + GCScheduler::get().enqueue(_im2col_kernel, false); + } + + GCScheduler::get().sync(); + + // Run matrix multiply + GCScheduler::get().enqueue(_mm_kernel, !_accumulate_biases); + + // Accumulate biases if provided + if(_accumulate_biases) + { + GCScheduler::get().sync(); + + GCScheduler::get().enqueue(_accumulate_biases_kernel); + } +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp new file mode 100644 index 0000000000..c47a0e71fb --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" +#include "arm_compute/runtime/ITensorAllocator.h" + +using namespace arm_compute; + +GCGEMM::GCGEMM() + : _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false) +{ +} + +void GCGEMM::configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *c, IGCTensor *output, float alpha, float beta) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b, output); + + if(c != nullptr) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, c); + ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(1) != c->info()->dimension(1), "The C matrix must have the same number of rows as the matrix A"); + ARM_COMPUTE_ERROR_ON_MSG(b->info()->dimension(0) != c->info()->dimension(0), "The C matrix must have the same number of columns as the matrix B"); + ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(0) != output->info()->dimension(0), "The C matrix must have the same number of columns as the output matrix"); + ARM_COMPUTE_ERROR_ON_MSG(c->info()->dimension(1) != output->info()->dimension(1), "The C matrix must have the same number of rows as the output matrix"); + } + + ARM_COMPUTE_ERROR_ON_MSG(a->info()->dimension(0) != b->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B"); + + // If the input tensor has less than 16 rows, we run a special version of GEMM without reshaping the input tensors + _is_interleaved_transposed = a->info()->dimension(1) > 16; + + const IGCTensor *matrix_a = a; + const IGCTensor *matrix_b = b; + 
if(_is_interleaved_transposed) + { + matrix_a = &_tmp_a; + matrix_b = &_tmp_b; + + TensorShape shape_tmp_a = a->info()->tensor_shape(); + TensorShape shape_tmp_b = b->info()->tensor_shape(); + + shape_tmp_a.set(0, a->info()->dimension(0) * 4); + shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.0f)); + + const unsigned int transpose_w = max_gc_vector_width / data_size_from_type(b->info()->data_type()); + shape_tmp_b.set(0, b->info()->dimension(1) * transpose_w); + shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / static_cast(transpose_w))); + + TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type(), a->info()->fixed_point_position()); + _tmp_a.allocator()->init(info_a); + + TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type(), b->info()->fixed_point_position()); + _tmp_b.allocator()->init(info_b); + + // Configure interleave kernel + _interleave_kernel.configure(a, &_tmp_a); + + // Configure transpose kernel + _transpose_kernel.configure(b, &_tmp_b); + } + + _mm_kernel.configure(matrix_a, matrix_b, output, alpha, _is_interleaved_transposed); + + if(_is_interleaved_transposed) + { + // Allocate intermediate tensors + _tmp_a.allocator()->allocate(); + _tmp_b.allocator()->allocate(); + } + + // Configure matrix addition kernel + if(beta != 0 && c != nullptr) + { + _ma_kernel.configure(c, output, beta); + _run_addition = true; + } +} + +void GCGEMM::run() +{ + if(_is_interleaved_transposed) + { + // Run interleave kernel + GCScheduler::get().enqueue(_interleave_kernel, false); + + // Run transpose kernel + GCScheduler::get().enqueue(_transpose_kernel, false); + } + + // Run matrix multiply kernel + GCScheduler::get().enqueue(_mm_kernel, !_run_addition); + + // Run matrix addition kernel + if(_run_addition) + { + GCScheduler::get().enqueue(_ma_kernel); + } +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp new file mode 100644 index 0000000000..44c940e126 --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h" + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void GCGEMMInterleave4x4::configure(const IGCTensor *input, IGCTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp new file mode 100644 index 0000000000..893fa5572b --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h" + +#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" +#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h" +#include "arm_compute/core/Types.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void GCGEMMTranspose1xW::configure(const IGCTensor *input, IGCTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp new file mode 100644 index 0000000000..d30ed52d5c --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" + +using namespace arm_compute; + +GCNormalizationLayer::GCNormalizationLayer() + : _squared_input(), _norm_kernel(), _multiply_kernel(), _border_handler() +{ +} + +void GCNormalizationLayer::configure(const IGCTensor *input, IGCTensor *output, const NormalizationLayerInfo &norm_info) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr); + + _squared_input.allocator()->init(TensorInfo(input->info()->tensor_shape(), 1, input->info()->data_type())); + + _norm_kernel.configure(input, &_squared_input, output, norm_info); + _multiply_kernel.configure(input, input, &_squared_input, 1.0f); + // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel + _border_handler.configure(&_squared_input, _norm_kernel.border_size(), BorderMode::CONSTANT, PixelValue(0)); + + // Allocate intermediate buffers + _squared_input.allocator()->allocate(); +} + +void GCNormalizationLayer::run() +{ + GCScheduler::get().enqueue(_multiply_kernel, false); + GCScheduler::get().enqueue(_border_handler, false); + GCScheduler::get().enqueue(_norm_kernel, false); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp b/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp new file mode 100755 index 0000000000..0cd87ea875 --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h" + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h" +#include "support/ToolchainSupport.h" + +#include + +using namespace arm_compute; + +void GCPixelWiseMultiplication::configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input1, input2, output, scale); + _kernel = std::move(k); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp new file mode 100644 index 0000000000..46a60cddef --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCPoolingLayer.cpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h" + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h" +#include "arm_compute/core/PixelValue.h" +#include "support/ToolchainSupport.h" + +using namespace arm_compute; + +void GCPoolingLayer::configure(IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info) +{ + // Configure pooling kernel + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, output, pool_info); + _kernel = std::move(k); + + // Configure border depending on operation required + BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT; + _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0.0f)); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp new file mode 100644 index 0000000000..d7d47d2802 --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h" + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h" + +using namespace arm_compute; + +GCSoftmaxLayer::GCSoftmaxLayer() + : _max_kernel(), _shift_exp_sum_kernel(), _norm_kernel(), _max(), _sum(), _tmp() +{ +} + +void GCSoftmaxLayer::configure(const IGCTensor *input, IGCTensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + + // Create intermediate tensors shapes + _tmp.allocator()->init(TensorInfo(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position())); + + TensorShape shape = input->info()->tensor_shape(); + shape.set(0, 1); + TensorInfo tensor_info_max_sum(shape, input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()); + _max.allocator()->init(tensor_info_max_sum); + _sum.allocator()->init(tensor_info_max_sum); + + // Configure Kernels + _max_kernel.configure(input, &_max); + _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum); + _norm_kernel.configure(&_tmp, &_sum, output); + + // Allocate intermediate buffers + _tmp.allocator()->allocate(); + _max.allocator()->allocate(); + _sum.allocator()->allocate(); +} + +void GCSoftmaxLayer::run() +{ + GCScheduler::get().enqueue(_max_kernel, false); + GCScheduler::get().enqueue(_shift_exp_sum_kernel, false); + GCScheduler::get().enqueue(_norm_kernel); +} diff --git a/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp b/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp new file mode 100644 index 0000000000..c2dc122e64 --- /dev/null +++ b/src/runtime/GLES_COMPUTE/functions/GCTranspose.cpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h" + +#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h" +#include "support/ToolchainSupport.h" + +#include + +using namespace arm_compute; + +void GCTranspose::configure(const IGCTensor *input, IGCTensor *output) +{ + auto k = arm_compute::support::cpp14::make_unique(); + k->configure(input, output); + _kernel = std::move(k); +} diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp index e01ef6660d..da4314b5ed 100644 --- a/src/runtime/NEON/functions/NENormalizationLayer.cpp +++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp @@ -37,7 +37,7 @@ NENormalizationLayer::NENormalizationLayer(std::shared_ptr memor { } -void NENormalizationLayer::configure(const ITensor *input, ITensor *output, NormalizationLayerInfo norm_info) +void NENormalizationLayer::configure(const ITensor *input, ITensor *output, const NormalizationLayerInfo &norm_info) { ARM_COMPUTE_ERROR_ON(input == nullptr); -- cgit v1.2.1
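A minimal usage sketch follows (illustrative only, not part of the patch). It shows how the GLES compute runtime pieces introduced above — GCScheduler, GCTensor/GCTensorAllocator and a GC function such as GCSoftmaxLayer — are expected to fit together. The class and method names are taken from the files added in this commit; the tensor shape, the data type, and the main() scaffolding are assumptions made only for the example, and default_init() assumes the cs_shaders directory is reachable from the working directory.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"

using namespace arm_compute;

int main()
{
    // Create the GLES compute context and load the compute shaders.
    GCScheduler::get().default_init();

    // Example 2D input: 128 values per row, 4 rows (shape chosen arbitrarily).
    GCTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::F32));

    // Configure the function before allocating the SSBO backing stores.
    GCSoftmaxLayer softmax;
    softmax.configure(&input, &output);

    input.allocator()->allocate();
    output.allocator()->allocate();

    // ... fill `input` between input.map(true) and input.unmap() ...

    softmax.run();
    GCScheduler::get().sync(); // make the shader writes visible before reading back

    // Read back the result through the mapped buffer.
    output.map(true);
    // ... consume output.buffer() ...
    output.unmap();

    return 0;
}

The configure-then-allocate ordering mirrors how the functions in this patch size their intermediate tensors (e.g. GCSoftmaxLayer allocates _tmp, _max and _sum only after all kernels are configured).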