aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoel Liang <joel.liang@arm.com>2017-11-10 09:59:19 +0800
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:35:24 +0000
commitf1f3ebd517089e934cf3f06e64d90619a395ad87 (patch)
tree8dac05909b5f522a1c78e0ac4423cb6f65254391
parent283c1790da45ab562ecfb2aa7741297191886d85 (diff)
downloadComputeLibrary-f1f3ebd517089e934cf3f06e64d90619a395ad87.tar.gz
APPBROWSER-298, APPBROWSER-306: Reimplement the common code of compute shader
The new common code of compute shader is in file helpers_cs.h Rewrite the direct_convolution1x1.cs and softmax_layer.cs to use the new common code. It will also remove the dependence of the token pasting operator (##). We'll remove the "##" support after we rewrite all of the compute shader code. Change-Id: Icd8553ef6b61ad484a8507590ac8ed499bd47061 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95455 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Reviewed-by: Frank Lei <frank.lei@arm.com> (cherry picked from commit 0a4f83570d261f839d9866b68979efe8d7a95883) Reviewed-on: http://mpd-gerrit.cambridge.arm.com/95601 Reviewed-by: Jim He <jim.he@arm.com>
-rw-r--r--arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h26
-rw-r--r--arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h3
-rwxr-xr-xscripts/format_doxygen.py1
-rw-r--r--src/core/CL/cl_kernels/direct_convolution1x1.cl4
-rw-r--r--src/core/CL/cl_kernels/direct_convolution3x3.cl4
-rw-r--r--src/core/CL/cl_kernels/direct_convolution5x5.cl4
-rw-r--r--src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl2
-rw-r--r--src/core/CL/cl_kernels/gemv.cl2
-rw-r--r--src/core/GLES_COMPUTE/GCKernelLibrary.cpp31
-rw-r--r--src/core/GLES_COMPUTE/IGCKernel.cpp19
-rw-r--r--src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs64
-rw-r--r--src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs128
-rw-r--r--src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs220
-rw-r--r--src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs18
-rw-r--r--src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs4
-rwxr-xr-xsrc/core/GLES_COMPUTE/cs_shaders/gemm.cs4
-rw-r--r--src/core/GLES_COMPUTE/cs_shaders/helpers.h4
-rw-r--r--src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h396
-rwxr-xr-xsrc/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs96
-rw-r--r--src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs596
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp5
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp4
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp4
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp7
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp2
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp9
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCDropoutKernel.cpp3
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp14
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp5
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp4
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp4
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp2
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp4
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp13
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp4
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp5
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp3
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp107
-rw-r--r--src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp5
39 files changed, 908 insertions, 922 deletions
diff --git a/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h b/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h
index e601b529ed..9a5376c876 100644
--- a/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h
+++ b/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h
@@ -129,28 +129,28 @@ public:
* @return program id.
*/
void unuse();
- /** Set value at uniform idx.
+ /** Set argument value at index of shader params.
*
- * @param[in] idx Index in vector.
- * @param[in] value Set value.
+ * @param[in] idx Index in shader params.
+ * @param[in] value Argument value to be set.
*/
template <class T>
- void set_params(unsigned int idx, T value)
+ void set_argument(unsigned int idx, T value)
{
- if(idx >= _params.size())
+ if(idx >= _shader_arguments.size())
{
- _params.resize(idx + 1, 0);
+ _shader_arguments.resize(idx + 1, 0);
}
- unsigned int *p = reinterpret_cast<unsigned int *>(&value);
- _params[idx] = *p;
+ unsigned int *p = reinterpret_cast<unsigned int *>(&value);
+ _shader_arguments[idx] = *p;
}
- /** Clear params.
+ /** Clear shader arguments.
*
*/
- void clear_params()
+ void clear_arguments()
{
- _params.clear();
+ _shader_arguments.clear();
}
/** Set shader params binding point.
*
@@ -172,8 +172,8 @@ public:
private:
std::string _name; /**< Kernel name */
GLuint _program; /**< Linked program id */
- std::vector<unsigned int> _params; /**< Store all the values of the shader parameters */
- GLuint _shader_params; /**< Uniform buffer object name for shader parameters */
+ std::vector<unsigned int> _shader_arguments; /**< Store all the values of the shader arguments */
+ GLuint _shader_params_ubo_name; /**< Uniform buffer object name for shader parameters */
GLuint _shader_params_binding_point; /**< The binding point of the uniform block for shader parameters */
GLuint _shader_params_index; /**< The index of the uniform block */
GLint _shader_params_size; /**< The uniform block data size in the shader */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h
index b9eb305bab..483e19b213 100644
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h
+++ b/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h
@@ -40,9 +40,6 @@ public:
* @param[out] output Destination tensor. Data types supported: same as @p input
*/
void configure(const IGCTensor *input, IGCTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
};
/** Interface for shifting the logits values around the max value and exponentiating the result */
diff --git a/scripts/format_doxygen.py b/scripts/format_doxygen.py
index 43c0ff49a3..423c2bfbf9 100755
--- a/scripts/format_doxygen.py
+++ b/scripts/format_doxygen.py
@@ -84,6 +84,7 @@ if __name__ == "__main__":
for path in paths:
if (path[-3:] not in ("cpp", "inl") and
path[-2:] not in ("cl") and
+ path[-2:] not in ("cs") and
path[-1] not in ("h")):
continue
diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/direct_convolution1x1.cl
index 484bc35ef1..817c261ba2 100644
--- a/src/core/CL/cl_kernels/direct_convolution1x1.cl
+++ b/src/core/CL/cl_kernels/direct_convolution1x1.cl
@@ -153,7 +153,7 @@ inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_T
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -241,7 +241,7 @@ __kernel void direct_convolution1x1(
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/direct_convolution3x3.cl
index e6e3007c95..a7abc9ff1d 100644
--- a/src/core/CL/cl_kernels/direct_convolution3x3.cl
+++ b/src/core/CL/cl_kernels/direct_convolution3x3.cl
@@ -102,7 +102,7 @@ MULQ_SAT_IMPL(qs32x8, qs32x8)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -198,7 +198,7 @@ __kernel void direct_convolution3x3(
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/direct_convolution5x5.cl
index 12cf0fb68e..e678f6f51b 100644
--- a/src/core/CL/cl_kernels/direct_convolution5x5.cl
+++ b/src/core/CL/cl_kernels/direct_convolution5x5.cl
@@ -91,7 +91,7 @@
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -197,7 +197,7 @@ __kernel void direct_convolution5x5(
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
index 7a860f2008..c94f81e390 100644
--- a/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
+++ b/src/core/CL/cl_kernels/direct_convolution_1x1_3x3_5x5_quantized.cl
@@ -168,7 +168,7 @@ inline uchar8 extract_input_stride3(__global const uchar *input_pixel)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p weights_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/gemv.cl
index 76128f7033..3e38c735fe 100644
--- a/src/core/CL/cl_kernels/gemv.cl
+++ b/src/core/CL/cl_kernels/gemv.cl
@@ -35,7 +35,7 @@
* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
diff --git a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
index fd362f1665..f446859a8d 100644
--- a/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
+++ b/src/core/GLES_COMPUTE/GCKernelLibrary.cpp
@@ -116,7 +116,8 @@ GLuint GCProgram::compile_shader(const std::string &build_options)
output_stream << std::setw(6) << line_num << ": " << line << std::endl;
line_num++;
}
- ARM_COMPUTE_LOG_INFO_STREAM_CORE("GLES Shader source code:" << output_stream.rdbuf());
+ ARM_COMPUTE_LOG_INFO_STREAM_CORE("GLES Shader source code:\n"
+ << output_stream.rdbuf());
#endif /* ARM_COMPUTE_DEBUG_ENABLED */
ARM_COMPUTE_ERROR("Error: Compiler log:\n%s\n", log.data());
@@ -128,22 +129,22 @@ GLuint GCProgram::compile_shader(const std::string &build_options)
}
GCKernel::GCKernel()
- : _name(), _program(), _params(), _shader_params(), _shader_params_binding_point(), _shader_params_index(), _shader_params_size()
+ : _name(), _program(), _shader_arguments(), _shader_params_ubo_name(), _shader_params_binding_point(), _shader_params_index(), _shader_params_size()
{
}
GCKernel::GCKernel(std::string name, GLuint program)
: _name(std::move(name)),
_program(program),
- _params(),
- _shader_params(0),
+ _shader_arguments(),
+ _shader_params_ubo_name(0),
_shader_params_binding_point(0),
_shader_params_index(0),
_shader_params_size(0)
{
- _params.clear();
+ _shader_arguments.clear();
- ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_shader_params));
+ ARM_COMPUTE_GL_CHECK(glGenBuffers(1, &_shader_params_ubo_name));
_shader_params_index = ARM_COMPUTE_GL_CHECK(glGetUniformBlockIndex(_program, _shader_params_name));
ARM_COMPUTE_ERROR_ON_MSG((_shader_params_index == GL_INVALID_INDEX), "Failed to get index of %s", _shader_params_name);
@@ -153,7 +154,7 @@ GCKernel::GCKernel(std::string name, GLuint program)
void GCKernel::cleanup()
{
- ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, &_shader_params));
+ ARM_COMPUTE_GL_CHECK(glDeleteBuffers(1, &_shader_params_ubo_name));
ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, 0));
ARM_COMPUTE_GL_CHECK(glDeleteProgram(_program));
ARM_COMPUTE_GL_CHECK(glUseProgram(0));
@@ -171,13 +172,13 @@ void GCKernel::unuse()
void GCKernel::update_shader_params()
{
- ARM_COMPUTE_ERROR_ON_MSG((_shader_params_size != (int)(_params.size() * sizeof(_params[0]))), "Params size (%d) is not equal to shader params block size (%d)", _params.size() * sizeof(_params[0]),
- _shader_params_size);
+ ARM_COMPUTE_ERROR_ON_MSG((_shader_params_size != (int)(_shader_arguments.size() * sizeof(_shader_arguments[0]))), "Arguments size (%d) is not equal to shader params block size (%d)",
+ _shader_arguments.size() * sizeof(_shader_arguments[0]), _shader_params_size);
ARM_COMPUTE_GL_CHECK(glUniformBlockBinding(_program, _shader_params_index, _shader_params_binding_point));
- ARM_COMPUTE_GL_CHECK(glBindBufferBase(GL_UNIFORM_BUFFER, _shader_params_binding_point, _shader_params));
- ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, _shader_params));
- ARM_COMPUTE_GL_CHECK(glBufferData(GL_UNIFORM_BUFFER, _shader_params_size, _params.data(), GL_DYNAMIC_DRAW));
+ ARM_COMPUTE_GL_CHECK(glBindBufferBase(GL_UNIFORM_BUFFER, _shader_params_binding_point, _shader_params_ubo_name));
+ ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, _shader_params_ubo_name));
+ ARM_COMPUTE_GL_CHECK(glBufferData(GL_UNIFORM_BUFFER, _shader_params_size, _shader_arguments.data(), GL_DYNAMIC_DRAW));
ARM_COMPUTE_GL_CHECK(glBindBuffer(GL_UNIFORM_BUFFER, 0));
}
@@ -319,7 +320,6 @@ GCKernel GCKernelLibrary::create_kernel(const std::string &shader_name, const St
{
// If program has been built, retrieve to create kernel from it
kernel = built_program_it->second;
- kernel.use();
}
else
{
@@ -340,6 +340,11 @@ GCKernel GCKernelLibrary::create_kernel(const std::string &shader_name, const St
_built_programs_map.emplace(built_program_name, kernel);
}
+ kernel.use();
+ kernel.clear_arguments();
+ // set shader params binding point
+ kernel.set_shader_params_binding_point(0);
+
return kernel;
}
diff --git a/src/core/GLES_COMPUTE/IGCKernel.cpp b/src/core/GLES_COMPUTE/IGCKernel.cpp
index 154a2c0c66..d6ad6c47d9 100644
--- a/src/core/GLES_COMPUTE/IGCKernel.cpp
+++ b/src/core/GLES_COMPUTE/IGCKernel.cpp
@@ -74,7 +74,8 @@ GCKernel &IGCKernel::kernel()
template <unsigned int dimension_size>
unsigned int IGCKernel::num_arguments_per_tensor() const
{
- return 2 + 2 * dimension_size;
+ // Rounding up the tensor attributes structure in compute shader to a multiple of a vec4
+ return ceil_to_multiple(1 + 2 * dimension_size, 4);
}
template <unsigned int dimension_size>
@@ -97,12 +98,20 @@ void IGCKernel::add_tensor_argument(unsigned int &idx, const IGCTensor *tensor,
for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
{
- _kernel.set_params(idx++, strides[dimension]);
- _kernel.set_params(idx++, strides[dimension] * window[dimension].step());
+ _kernel.set_argument(idx++, strides[dimension]);
+ _kernel.set_argument(idx++, strides[dimension] * window[dimension].step());
}
- _kernel.set_params(idx++, offset_first_element);
- _kernel.set_params(idx++, param.buffer_data_type_shift);
+ _kernel.set_argument(idx++, offset_first_element);
+ _kernel.set_argument(idx++, param.buffer_data_type_shift);
+
+ // Rounding up the tensor attributes structure in compute shader to a multiple of a vec4
+ unsigned int idx_end = ceil_to_multiple(idx, 4);
+ for(unsigned int i = idx; i < idx_end; ++i)
+ {
+ _kernel.set_argument(i, 0);
+ }
+ idx = idx_end;
ARM_COMPUTE_GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, param.binding_point, tensor->gc_buffer()));
diff --git a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
index fc9da114f7..38ba183d2a 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/activation_layer.cs
@@ -129,22 +129,22 @@ BUFFER_DECLARATION(dst, 2, float, writeonly);
* @note Activation function should be given as a preprocessor argument using "#define act_name". e.g. "#define TANH"
* @note A, B variables required by some activation functions are set using A_VAL= and B_VAL= respectively.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y ride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y ride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
*/
void main(void)
{
@@ -193,22 +193,22 @@ BUFFER_DECLARATION(dst, 2, uint, writeonly);
* @note Activation function should be given as a preprocessor argument using "#define act_name". e.g. "#define TANH"
* @note A, B variables required by some activation functions are set using A_VAL= and B_VAL= respectively.
*
- * @param[in] src_ptr Pointer to the source image. Supported data types: F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y ride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y ride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
*/
void main(void)
{
diff --git a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
index 54880926cc..c3df5d5c4d 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/batchnormalization_layer.cs
@@ -60,38 +60,38 @@ BUFFER_DECLARATION(gamma, 6, float, readonly);
*
* @note Epsilon parameter in the batch normalization equation should be given as a preprocessor argument using "#define EPSILON". e.g. "#define EPSILON 0.1"
*
- * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p src_ptr
- * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
- * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
- * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p src_ptr
- * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
- * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
- * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
- * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p src_ptr
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p src_ptr
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
*/
void main(void)
{
@@ -138,38 +138,38 @@ BUFFER_DECLARATION(gamma, 6, uint, );
*
* @note Epsilon parameter in the batch normalization equation should be given as a preprocessor argument using "#define EPSILON". e.g. "#define EPSILON 0.1"
*
- * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p src_ptr
- * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
- * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
- * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p src_ptr
- * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
- * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
- * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
- * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p src_ptr
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p src_ptr
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p src_ptr
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
*/
void main(void)
{
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
index 3a31cb80a7..071c1858bc 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution1x1.cs
@@ -24,107 +24,88 @@
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
+#include "helpers_cs.h"
-layout(std140) uniform shader_params
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note The convolution stride x must be passed at compile time using "#define STRIDE_X n" e.g. "#define STRIDE_X 1"
+ * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_attrs The attributes of the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
+ * @param[in] biases_attrs The attributes of the weights tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ * @param[in] weights_depth The third dimensions of the weights tensors
+ */
+SHADER_PARAMS_DECLARATION
{
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(weights);
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes dst_attrs;
+ Tensor3DAttributes weights_attrs;
#ifdef BIAS
- VECTOR_PARAM_DECLARATION(biases);
+ VectorAttributes biases_attrs;
#endif /* BIAS */
uint weights_stride_w;
uint weights_depth;
};
#if defined(DATA_TYPE_FP32)
-precision highp float;
-
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-BUFFER_DECLARATION(weights, 3, float, readonly);
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, float, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, float, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, float, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP32"
- * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1"
- * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
- float pixels = CONVERT(0, float);
+ float pixels = 0.f;
uint z_index = gl_GlobalInvocationID.z;
- weights.current_offset += z_index * weights_stride_w >> 2;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
+
float temp;
float temp_weight;
-
for(int d = 0; d < int(weights_depth); ++d)
{
- temp = LOAD4(src, CURRENT_OFFSET(src));
- temp_weight = LOAD4(weights, CURRENT_OFFSET(weights));
+ temp = LOAD_CURRENT_ITEM(src_ptr, src_iter);
+ temp_weight = LOAD_CURRENT_ITEM(weights_ptr, weights_iter);
pixels += temp * temp_weight;
- src.current_offset += (src_stride_z >> 2);
- weights.current_offset += (weights_stride_z >> 2);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- pixels += LOAD4(biases, vector_offset(biases, int(z_index)));
+ pixels += LOAD(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
#endif /* BIAS */
- STORE4(dst, CURRENT_OFFSET(dst), pixels);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, pixels);
}
#elif defined(DATA_TYPE_FP16)
-precision mediump float;
-BUFFER_DECLARATION(src, 1, uvec4, readonly);
-BUFFER_DECLARATION(dst, 2, uvec4, writeonly);
-BUFFER_DECLARATION(weights, 3, uint, readonly);
+TENSOR_DECLARATION(1, srcBuffer, uvec4, src_ptr, src_shift, 4, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uvec4, dst_ptr, dst_shift, 4, writeonly);
+TENSOR_DECLARATION(3, weightsBuffer, uint, weights_ptr, weights_shift, 2, readonly);
#ifdef BIAS
-BUFFER_DECLARATION(biases, 4, uint, readonly);
+TENSOR_DECLARATION(4, biasesBuffer, uint, biases_ptr, biases_shift, 2, readonly);
#endif /* BIAS */
#if STRIDE_X == 2
@@ -135,15 +116,10 @@ BUFFER_DECLARATION(biases, 4, uint, readonly);
#error STRIDE_X larger than 2 is not supported
#endif /* STRIDE_X == 2 */
-vec4[2] convolve_stride1(Image src, float w)
+vec4[2] convolve_stride1(ImageIterator src_iter, float w)
{
- uvec4 packed_s;
- vec4 s[2];
-
- GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0);
-
- s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
- s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
+ vec4 s[2];
+ s = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
s[0] *= w;
s[1] *= w;
@@ -151,22 +127,14 @@ vec4[2] convolve_stride1(Image src, float w)
return s;
}
-vec4[2] convolve_stride2(Image src, float w)
+vec4[2] convolve_stride2(ImageIterator src_iter, float w)
{
- uvec4 packed_s;
- vec4 s[2];
- vec4 r[2];
-
- GC_LOAD1_2D_OFFSET(packed_s, src, 0, 0);
- s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
- s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
+ vec4 s[2];
+ vec4 r[2];
+ s = LOAD_UNPACK8_CURRENT_ITEM_HALF(src_ptr, src_iter);
r[0] = vec4(s[0].xz, s[1].xz);
-
- GC_LOAD1_2D_OFFSET(packed_s, src, 8, 0);
- s[0] = vec4(unpackHalf2x16(packed_s.x), unpackHalf2x16(packed_s.y));
- s[1] = vec4(unpackHalf2x16(packed_s.z), unpackHalf2x16(packed_s.w));
-
+ s = LOAD_UNPACK8_HALF(src_ptr, IMAGE_OFFSET(src_iter, 8, 0));
r[1] = vec4(s[0].xz, s[1].xz);
r[0] *= w;
@@ -175,51 +143,14 @@ vec4[2] convolve_stride2(Image src, float w)
return r;
}
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using "#define DATA_TYPE_FP16"
- * @note The convolution stride x must be passed at compile time using "#define STRIDE_X" e.g. "#define STRIDE_X 1"
- * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- * @param[in] weights_depth The third dimensions of the weights tensors
- */
void main()
{
- Image src = GC_CONVERT_TO_IMAGE_STRUCT(src);
- Tensor3D weights = GC_CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
- Tensor3D dst = GC_CONVERT_TO_TENSOR3D_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ Tensor3DIterator weights_iter = CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(weights_attrs, weights_shift);
+ Tensor3DIterator dst_iter = CONVERT_TO_TENSOR3D_ITERATOR(dst_attrs, dst_shift);
#ifdef BIAS
- Vector biases = GC_CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+ VectorIterator biases_iter = CONVERT_TO_VECTOR_ITERATOR_NO_STEP(biases_attrs, biases_shift);
#endif /* BIAS */
vec4 pixels[2];
@@ -227,48 +158,41 @@ void main()
pixels[1] = vec4(0.f);
uint z_index = gl_GlobalInvocationID.z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, z_index * weights_stride_w);
- weights.current_offset += z_index * weights_stride_w;
-
- uint packed_w;
float w;
-
for(int d = 0; d < int(weights_depth); ++d)
{
- GC_LOAD1_3D_OFFSET(packed_w, weights, 0, 0, 0);
- w = unpackHalf2x16(packed_w).x;
+ w = LOAD_UNPACK2_CURRENT_ITEM_HALF(weights_ptr, weights_iter).x;
- vec4 r[2] = CONVOLVE(src, w);
+ vec4 r[2] = CONVOLVE(src_iter, w);
pixels[0] += r[0];
pixels[1] += r[1];
- src.current_offset += src_stride_z;
- weights.current_offset += weights_stride_z;
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src_iter, src_attrs.stride_z);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(weights_iter, weights_attrs.stride_z);
}
#ifdef BIAS
- uint packed_b;
+ vec2 vec2_b;
float b;
- GC_LOAD1_1D_OFFSET(packed_b, biases, z_index);
+ vec2_b = LOAD_UNPACK2_HALF(biases_ptr, VECTOR_OFFSET(biases_iter, z_index));
if(z_index % uint(2) == uint(0))
{
- b = unpackHalf2x16(packed_b).x;
+ b = vec2_b.x;
}
else
{
- b = unpackHalf2x16(packed_b).y;
+ b = vec2_b.y;
}
- pixels[0] += vec4(b);
- pixels[1] += vec4(b);
+ pixels[0] += b;
+ pixels[1] += b;
#endif /* BIAS */
- uvec4 packed_d;
- packed_d = uvec4(packHalf2x16(pixels[0].xy), packHalf2x16(pixels[0].zw),
- packHalf2x16(pixels[1].xy), packHalf2x16(pixels[1].zw));
- GC_STORE1_3D_OFFSET(packed_d, dst, 0, 0, 0);
+ STORE_PACK8_CURRENT_ITEM_HALF(dst_ptr, dst_iter, pixels);
}
#else /* DATA_TYPE_FP32 */
#error Data type not supported
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
index 67b92cb8cf..d450ac17e1 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution3x3.cs
@@ -82,7 +82,7 @@ BUFFER_DECLARATION(biases, 4, float, readonly);
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -230,7 +230,7 @@ vec4[2] convolve1x3_stride2(uint offset, vec3 w)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -376,7 +376,7 @@ vec4 convolve1x3_stride2(uint offset, vec3 w)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -481,7 +481,7 @@ vec4 convolve1x3_stride1(vec4 left, vec4 middle, vec4 right, vec3 w)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -647,7 +647,7 @@ vec4[3] load_and_unpack(uint offset)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -903,7 +903,7 @@ vec4[3] load_and_unpack_stride2(uint offset)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -1063,7 +1063,7 @@ vec4[2] load_and_unpack(uint offset)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -1248,7 +1248,7 @@ vec4[2] load_and_unpack(uint offset)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -1444,7 +1444,7 @@ vec4[2] load_and_unpack(uint offset)
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
diff --git a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
index 4fdbf0d19e..f3b843de73 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/direct_convolution5x5.cs
@@ -77,7 +77,7 @@ BUFFER_DECLARATION(biases, 4, float, readonly);
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
@@ -249,7 +249,7 @@ vec4 convolve1x5_stride2(vec4 tmp[3], vec2 w[3])
* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
index 3313b88718..ffa0ebb2af 100755
--- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -618,6 +618,6 @@ void main(void)
GC_STORE1_2D_OFFSET(packed_s[0], accum, 0, 0);
}
#endif /* GEMM_ACCUMULATE_BIASES */
-#else /* DATA_TYPE_F32 */
+#else /* DATA_TYPE_FP32 */
#error Data type not supported
-#endif /* DATA_TYPE_F32 */
+#endif /* DATA_TYPE_FP32 */
diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers.h b/src/core/GLES_COMPUTE/cs_shaders/helpers.h
index 86dedf5a9c..ba27eec716 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/helpers.h
+++ b/src/core/GLES_COMPUTE/cs_shaders/helpers.h
@@ -56,7 +56,9 @@
uint name##_stride_y; \
uint name##_step_y; \
uint name##_offset_first_element_in_bytes; \
- uint name##_buffer_data_type_size
+ uint name##_buffer_data_type_size; \
+ uint name##_padding1; \
+ uint name##_padding2
#define TENSOR3D_PARAM_DECLARATION(name) \
uint name##_stride_x; \
diff --git a/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
new file mode 100644
index 0000000000..ad67681067
--- /dev/null
+++ b/src/core/GLES_COMPUTE/cs_shaders/helpers_cs.h
@@ -0,0 +1,396 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_HELPER_CS_H
+#define ARM_COMPUTE_HELPER_CS_H
+
+#define SHADER_PARAMS_DECLARATION \
+ layout(std140, binding = 0) uniform shader_params
+
+#define TENSOR_DECLARATION(location, buffer_type, type, ptr_name, shift_name, element_shift, access) \
+ layout(std430, binding = location) access buffer buffer_type \
+ { \
+ type ptr_name[]; \
+ }; \
+ const uint shift_name = uint(element_shift)
+
+struct VectorAttributes
+{
+ uint stride_x; /**< Stride of the vector in X dimension (in bytes) */
+ uint step_x; /**< stride_x * number of elements along X processed per workitem (in bytes) */
+ uint offset_first_element_in_bytes; /**< The offset of the first element in the vector (in bytes) */
+ uint padding; /**< The padding to rounding up the structure to a multiple of a vec4 */
+};
+
+struct ImageAttributes
+{
+ uint stride_x; /**< Stride of the image in X dimension (in bytes) */
+ uint step_x; /**< stride_x * number of elements along X processed per workitem (in bytes) */
+ uint stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ uint step_y; /**< stride_y * number of elements along Y processed per workitem (in bytes) */
+ uint offset_first_element_in_bytes; /**< The offset of the first element in the image (in bytes) */
+ uint padding1; /**< The padding to rounding up the structure to a multiple of a vec4 */
+ uint padding2; /**< The padding to rounding up the structure to a multiple of a vec4 */
+ uint padding3; /**< The padding to rounding up the structure to a multiple of a vec4 */
+};
+
+struct Tensor3DAttributes
+{
+ uint stride_x; /**< Stride of the tensor in X dimension (in bytes) */
+ uint step_x; /**< stride_x * number of elements along X processed per workitem (in bytes) */
+ uint stride_y; /**< Stride of the tensor in Y dimension (in bytes) */
+ uint step_y; /**< stride_y * number of elements along Y processed per workitem (in bytes) */
+ uint stride_z; /**< Stride of the tensor in Z dimension (in bytes) */
+ uint step_z; /**< stride_z * number of elements along Z processed per workitem (in bytes) */
+ uint offset_first_element_in_bytes; /**< The offset of the first element in the tensor (in bytes) */
+ uint padding; /**< The padding to rounding up the structure to a multiple of a vec4 */
+};
+
+struct VectorIterator
+{
+ int current_offset_in_bytes; /**< Current offset of vector (in bytes) */
+ int stride_x; /**< Stride of the vector in X dimension (in bytes) */
+ int element_shift; /**< The number of bits to shift by for one element */
+};
+
+struct ImageIterator
+{
+ int current_offset_in_bytes; /**< Current offset of image (in bytes) */
+ int stride_x; /**< Stride of the image in X dimension (in bytes) */
+ int stride_y; /**< Stride of the image in Y dimension (in bytes) */
+ int element_shift; /**< The number of bits to shift by for one element */
+};
+
+struct Tensor3DIterator
+{
+ int current_offset_in_bytes; /**< Current offset of tensor (in bytes) */
+ int stride_x; /**< Stride of the tensor in X dimension (in bytes) */
+ int stride_y; /**< Stride of the tensor in Y dimension (in bytes) */
+ int stride_z; /**< Stride of the tensor in Z dimension (in bytes) */
+ int element_shift; /**< The number of bits to shift by for one element */
+};
+
+#define CONVERT_TO_VECTOR_ITERATOR(attrs, element_shift) \
+ update_vector_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+ attrs.stride_x, attrs.step_x)
+
+#define CONVERT_TO_VECTOR_ITERATOR_NO_STEP(attrs, element_shift) \
+ update_vector_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+ attrs.stride_x, uint(0))
+
+#define CONVERT_TO_IMAGE_ITERATOR(attrs, element_shift) \
+ update_image_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+ attrs.stride_x, attrs.step_x, attrs.stride_y, attrs.step_y)
+
+#define CONVERT_TO_IMAGE_ITERATOR_NO_STEP(attrs, element_shift) \
+ update_image_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+ attrs.stride_x, uint(0), attrs.stride_y, uint(0))
+
+#define CONVERT_TO_TENSOR3D_ITERATOR(attrs, element_shift) \
+ update_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+ attrs.stride_x, attrs.step_x, attrs.stride_y, attrs.step_y, attrs.stride_z, attrs.step_z)
+
+#define CONVERT_TO_TENSOR3D_ITERATOR_NO_STEP(attrs, element_shift) \
+ update_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+ attrs.stride_x, uint(0), attrs.stride_y, uint(0), attrs.stride_z, uint(0))
+
+#define CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(attrs, element_shift) \
+ update_image_from_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+ attrs.stride_x, attrs.step_x, attrs.stride_y, attrs.step_y, attrs.stride_z, attrs.step_z)
+
+#define CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(attrs, element_shift) \
+ update_image_from_tensor3D_iter_offset(element_shift, attrs.offset_first_element_in_bytes, \
+ attrs.stride_x, uint(0), attrs.stride_y, uint(0), attrs.stride_z, attrs.step_z)
+
+/** Wrap vector information into a VectorIterator structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] element_shift The number of bits to shift by for one element
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[in] stride_x Stride of the vector in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem (in bytes)
+ *
+ * @return A VectorIterator object
+ */
+VectorIterator update_vector_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+{
+ VectorIterator vector_iter;
+ vector_iter.element_shift = int(element_shift);
+ vector_iter.stride_x = int(stride_x);
+ vector_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x);
+
+ return vector_iter;
+}
+
+/** Wrap image information into an ImageIterator structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] element_shift The number of bits to shift by for one element
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] stride_x Stride of the image in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] stride_y Stride of the image in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem (in bytes)
+ *
+ * @return An ImageIterator object
+ */
+ImageIterator update_image_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+{
+ ImageIterator image_iter;
+ image_iter.element_shift = int(element_shift);
+ image_iter.stride_x = int(stride_x);
+ image_iter.stride_y = int(stride_y);
+ image_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y);
+
+ return image_iter;
+}
+
+/** Wrap 3D tensor information into a Tensor3DIterator structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] element_shift The number of bits to shift by for one element
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tersor
+ * @param[in] stride_x Stride of the tersor in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] stride_y Stride of the tersor in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] stride_z Stride of the tersor in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per workitem (in bytes)
+ *
+ * @return A 3D Tensor3DIterator object
+ */
+Tensor3DIterator update_tensor3D_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+ Tensor3DIterator tensor_iter;
+ tensor_iter.element_shift = int(element_shift);
+ tensor_iter.stride_x = int(stride_x);
+ tensor_iter.stride_y = int(stride_y);
+ tensor_iter.stride_z = int(stride_z);
+ tensor_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z);
+
+ return tensor_iter;
+}
+
+/** Wrap 3D tensor information into an ImageIterator structure, and make the offset to be this workitem's position.
+ *
+ * @param[in] element_shift The number of bits to shift by for one element
+ * @param[in] offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] stride_x Stride of the tensor in X dimension (in bytes)
+ * @param[in] step_x stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] stride_y Stride of the tensor in Y dimension (in bytes)
+ * @param[in] step_y stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] stride_z Stride of the tensor in Z dimension (in bytes)
+ * @param[in] step_z stride_z * number of elements along Z processed per workitem (in bytes)
+ *
+ * @return An ImageIterator object
+ */
+ImageIterator update_image_from_tensor3D_iter_offset(uint element_shift, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+{
+ ImageIterator image_iter;
+ image_iter.element_shift = int(element_shift);
+ image_iter.stride_x = int(stride_x);
+ image_iter.stride_y = int(stride_y);
+ image_iter.current_offset_in_bytes = int(offset_first_element_in_bytes + gl_GlobalInvocationID.x * step_x + gl_GlobalInvocationID.y * step_y + gl_GlobalInvocationID.z * step_z);
+
+ return image_iter;
+}
+
+#define VECTOR_OFFSET(tensor_iter, x) \
+ uint(vector_offset_in_bytes(tensor_iter, int(x)) >> tensor_iter.element_shift)
+
+#define IMAGE_OFFSET(tensor_iter, x, y) \
+ uint(image_offset_in_bytes(tensor_iter, int(x), int(y)) >> tensor_iter.element_shift)
+
+#define TENSOR3D_OFFSET(tensor_iter, x, y, z) \
+ uint(tensor3D_offset_in_bytes(tensor_iter, int(x), int(y), int(z)) >> tensor_iter.element_shift)
+
+#define CURRENT_ITEM_OFFSET(tensor_iter) \
+ uint(tensor_iter.current_offset_in_bytes >> tensor_iter.element_shift)
+
+#define CURRENT_ITEM_OFFSET_IN_BYTES(tensor_iter) \
+ uint(tensor_iter.current_offset_in_bytes)
+
+#define TENSOR_ITERATOR_ADVANCE_IN_BYTES(tensor_iter, n) \
+ tensor_iter.current_offset_in_bytes += int(n)
+
+/** Get the offset of a VectorIterator
+ *
+ * @param[in] vector_iter The VectorIterator object pointed to the starting position of the buffer
+ * @param[in] x Relative X position
+ *
+ * @return The relative offset of the VectorIterator object (in bytes)
+ */
+uint vector_offset_in_bytes(VectorIterator vector_iter, int x)
+{
+ return uint(vector_iter.current_offset_in_bytes + x * vector_iter.stride_x);
+}
+
+/** Get the offset of an ImageIterator
+ *
+ * @param[in] vector_iter The ImageIterator object pointed to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ *
+ * @return The relative offset of the ImageIterator object (in bytes)
+ */
+uint image_offset_in_bytes(ImageIterator image_iter, int x, int y)
+{
+ return uint(image_iter.current_offset_in_bytes + x * image_iter.stride_x + y * image_iter.stride_y);
+}
+
+/** Get the offset of a Tensor3DIterator
+ *
+ * @param[in] vector_iter The Tensor3DIterator object pointed to the starting position of the buffer
+ * @param[in] x Relative X position
+ * @param[in] y Relative Y position
+ * @param[in] z Relative Z position
+ *
+ * @return The relative offset of the Tensor3DIterator object (in bytes)
+ */
+uint tensor3D_offset_in_bytes(Tensor3DIterator tensor_iter, int x, int y, int z)
+{
+ return uint(tensor_iter.current_offset_in_bytes + x * tensor_iter.stride_x + y * tensor_iter.stride_y + z * tensor_iter.stride_z);
+}
+
+#define LOAD(tensor_ptr, offset) tensor_ptr[offset]
+#define STORE(tensor_ptr, offset, data) tensor_ptr[offset] = data
+#define LOAD_CURRENT_ITEM(tensor_ptr, tensor_iter) tensor_ptr[CURRENT_ITEM_OFFSET(tensor_iter)]
+#define STORE_CURRENT_ITEM(tensor_ptr, tensor_iter, data) tensor_ptr[CURRENT_ITEM_OFFSET(tensor_iter)] = data
+
+#define VLOAD2(return_type, tensor_ptr, offset) \
+ return_type(LOAD(tensor_ptr, offset), \
+ LOAD(tensor_ptr, (offset) + uint(1)))
+
+#define VSTORE2(tensor_ptr, offset, data) \
+ STORE(tensor_ptr, offset, data[0]); \
+ STORE(tensor_ptr, (offset) + uint(1), data[1])
+
+#define VLOAD2_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD2(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE2_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE2(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD3(return_type, tensor_ptr, offset) \
+ return_type(LOAD(tensor_ptr, offset), \
+ LOAD(tensor_ptr, (offset) + uint(1)), \
+ LOAD(tensor_ptr, (offset) + uint(2)))
+
+#define VSTORE3(tensor_ptr, offset, data) \
+ STORE(tensor_ptr, offset, data[0]); \
+ STORE(tensor_ptr, (offset) + uint(1), data[1]); \
+ STORE(tensor_ptr, (offset) + uint(2), data[2])
+
+#define VLOAD3_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD3(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE3_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE3(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD4(return_type, tensor_ptr, offset) \
+ return_type(LOAD(tensor_ptr, offset), \
+ LOAD(tensor_ptr, (offset) + uint(1)), \
+ LOAD(tensor_ptr, (offset) + uint(2)), \
+ LOAD(tensor_ptr, (offset) + uint(3)))
+
+#define VSTORE4(tensor_ptr, offset, data) \
+ STORE(tensor_ptr, offset, data[0]); \
+ STORE(tensor_ptr, (offset) + uint(1), data[1]); \
+ STORE(tensor_ptr, (offset) + uint(2), data[2]); \
+ STORE(tensor_ptr, (offset) + uint(3), data[3])
+
+#define VLOAD4_CURRENT_ITEM(return_type, tensor_ptr, tensor_iter) VLOAD4(return_type, tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE4_CURRENT_ITEM(tensor_ptr, tensor_iter, data) VSTORE4(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+/** Converting the vec4 object to 4 half-precision (16-bits) floating point values and packing into a uvec2 object
+ *
+ * @param[in] data The vec4 object to be packed
+ *
+ * @return The packed uvec2 object
+ */
+highp uvec2 pack4_half(mediump vec4 data)
+{
+ return uvec2(packHalf2x16(data.xy), packHalf2x16(data.zw));
+}
+
+/** Unpacking the uvec2 object to 4 half-precision (16-bits) floating point values and converting to a vec4 object
+ *
+ * @param[in] packed_data The uvec2 object to be unpacked
+ *
+ * @return The unpacked vec4 object
+ */
+mediump vec4 unpack4_half(highp uvec2 packed_data)
+{
+ return vec4(unpackHalf2x16(packed_data.x), unpackHalf2x16(packed_data.y));
+}
+
+/** Converting the vec4[2] object to 8 half-precision (16-bits) floating point values and packing into a uvec4 object
+ *
+ * @param[in] data The vec4[2] object to be packed
+ *
+ * @return The packed uvec4 object
+ */
+highp uvec4 pack8_half(mediump vec4 data[2])
+{
+ return uvec4(packHalf2x16(data[0].xy), packHalf2x16(data[0].zw),
+ packHalf2x16(data[1].xy), packHalf2x16(data[1].zw));
+}
+
+/** Unpacking the uvec4 object to 8 half-precision (16-bits) floating point values and converting to a vec4[2] object
+ *
+ * @param[in] packed_data The uvec4 object to be unpacked
+ *
+ * @return The unpacked vec4[2] object
+ */
+mediump vec4[2] unpack8_half(highp uvec4 packed_data)
+{
+ return vec4[2](vec4(unpackHalf2x16(packed_data.x), unpackHalf2x16(packed_data.y)),
+ vec4(unpackHalf2x16(packed_data.z), unpackHalf2x16(packed_data.w)));
+}
+
+// For half-precision (16-bits) floating point packed into a "uint" element
+#define LOAD_UNPACK2_HALF(tensor_ptr, offset) unpackHalf2x16(LOAD(tensor_ptr, offset))
+#define STORE_PACK2_HALF(tensor_ptr, offset, data) STORE(tensor_ptr, offset, packHalf2x16(data))
+#define LOAD_UNPACK2_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) LOAD_UNPACK2_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define STORE_PACK2_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) STORE_PACK2_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD2_UNPACK4_HALF(tensor_ptr, offset) unpack4_half(VLOAD2(uvec2, tensor_ptr, offset))
+#define VSTORE2_PACK4_HALF(tensor_ptr, offset, data) VSTORE2(tensor_ptr, offset, pack4_half(data))
+#define VLOAD2_UNPACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD2_UNPACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE2_PACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE2_PACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD4_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(VLOAD4(uvec4, tensor_ptr, offset))
+#define VSTORE4_PACK8_HALF(tensor_ptr, offset, data) VSTORE4(tensor_ptr, offset, pack8_half(data))
+#define VLOAD4_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD4_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE4_PACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE4_PACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+// For half-precision (16-bits) floating point packed into a "uvec2" element
+#define LOAD_UNPACK4_HALF(tensor_ptr, offset) unpack4_half(LOAD(tensor_ptr, offset))
+#define STORE_PACK4_HALF(tensor_ptr, offset, data) STORE(tensor_ptr, offset, pack4_half(data))
+#define LOAD_UNPACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) LOAD_UNPACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define STORE_PACK4_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) STORE_PACK4_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#define VLOAD2_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(VLOAD2(uvec4, tensor_ptr, offset))
+#define VSTORE2_PACK8_HALF(tensor_ptr, offset, data) VSTORE2(tensor_ptr, offset, pack8_half(data))
+#define VLOAD2_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) VLOAD2_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define VSTORE2_PACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) VSTORE2_PACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+// For half-precision (16-bits) floating point packed into a "uvec4" element
+#define LOAD_UNPACK8_HALF(tensor_ptr, offset) unpack8_half(LOAD(tensor_ptr, offset))
+#define STORE_PACK8_HALF(tensor_ptr, offset, data) STORE(tensor_ptr, offset, pack8_half(data))
+#define LOAD_UNPACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter) LOAD_UNPACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter))
+#define STORE_PACK8_CURRENT_ITEM_HALF(tensor_ptr, tensor_iter, data) STORE_PACK8_HALF(tensor_ptr, CURRENT_ITEM_OFFSET(tensor_iter), data)
+
+#endif // ARM_COMPUTE_HELPER_CS_H
diff --git a/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
index 5699340c14..166953ffc0 100755
--- a/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/normalization_layer.cs
@@ -45,30 +45,30 @@ BUFFER_DECLARATION(dst, 3, float, writeonly);
* @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x"
* @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x"
*
- * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F32
- * @param[in] src1_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src1_step_z src1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] src2_ptr Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
- * @param[in] src2_stride_x Stride of the second source tensor in X dimension (in bytes)
- * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y Stride of the second source tensor in Y dimension (in bytes)
- * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_stride_z Stride of the second source tensor in Z dimension (in bytes)
- * @param[in] src2_step_z src2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes The offset of the second element in the second source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src1_step_z src1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] src2_ptr Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in] src2_stride_x Stride of the second source tensor in X dimension (in bytes)
+ * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_stride_z Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in] src2_step_z src2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the second element in the second source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
void main(void)
{
@@ -104,30 +104,30 @@ void main(void)
* @note KAPPA parameter in the normalization equation should be given as a preprocessor argument using "#define KAPPA x"
* @note Number of elements on the right or left side to normalize across should be given as a preprocessor argument using "#define RADIUS x"
*
- * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F32
- * @param[in] src1_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src1_step_z src1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] src2_ptr Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
- * @param[in] src2_stride_x Stride of the second source tensor in X dimension (in bytes)
- * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src2_stride_y Stride of the second source tensor in Y dimension (in bytes)
- * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src2_stride_z Stride of the second source tensor in Z dimension (in bytes)
- * @param[in] src2_step_z src2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src2_offset_first_element_in_bytes The offset of the second element in the second source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src1_ptr Pointer to the first source tensor. Supported data types: F32
+ * @param[in] src1_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src1_step_z src1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] src2_ptr Pointer to the second source tensor. Supported data types: Same as @p src1_ptr
+ * @param[in] src2_stride_x Stride of the second source tensor in X dimension (in bytes)
+ * @param[in] src2_step_x src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y Stride of the second source tensor in Y dimension (in bytes)
+ * @param[in] src2_step_y src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_stride_z Stride of the second source tensor in Z dimension (in bytes)
+ * @param[in] src2_step_z src2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes The offset of the second element in the second source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: Same as @p src1_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
void main(void)
{
diff --git a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
index 0bbabeaafc..1a2c3f7b20 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/softmax_layer.cs
@@ -24,99 +24,60 @@
layout(local_size_x = LOCAL_SIZE_X, local_size_y = LOCAL_SIZE_Y, local_size_z = LOCAL_SIZE_Z) in;
-#include "helpers.h"
+#include "helpers_cs.h"
+#if defined(DATA_TYPE_FP16)
+precision mediump float;
+#endif // DATA_TYPE_FP16
+
+// Common definitions
#define MAX_OP(x, y) max((x), (y))
#define ADD_OP(x, y) ((x) + (y))
#define SUB_OP(x, y) ((x) - (y))
#define DIV_OP(x, y) ((x) / (y))
#define EXP_OP(x) exp((x))
-#if defined(DATA_TYPE_FP32)
-const float MINVAL = -1.0 / 0.0;
-vec4 type_min = CONVERT(MINVAL, vec4);
-
-#define LOAD16(name, offset) \
- vec4(LOAD4(name, offset), \
- LOAD4(name, offset + uint(1)), \
- LOAD4(name, offset + uint(2)), \
- LOAD4(name, offset + uint(3)))
-
-#define STORE16(name, offset, value) \
- STORE4(name, offset, value.x); \
- STORE4(name, offset + uint(1), value.y); \
- STORE4(name, offset + uint(2), value.z); \
- STORE4(name, offset + uint(3), value.w)
+const float float_min = -1.0 / 0.0;
+const vec4 vec4_min = vec4(float_min);
#ifdef SOFTMAX_LAYER_MAX
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(dst, 2, float, writeonly);
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(max, 2, float, readonly);
-BUFFER_DECLARATION(dst, 3, float, writeonly);
-BUFFER_DECLARATION(sum, 4, float, writeonly);
-#elif defined(SOFTMAX_LAYER_NORM)
-BUFFER_DECLARATION(src, 1, float, readonly);
-BUFFER_DECLARATION(sum, 2, float, readonly);
-BUFFER_DECLARATION(dst, 3, float, writeonly);
-#endif // SOFTMAX_LAYER_MAX
-layout(std140) uniform shader_params
-{
-#ifdef SOFTMAX_LAYER_MAX
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- uint width;
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(max);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(sum);
- uint width;
-#elif defined(SOFTMAX_LAYER_NORM)
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(sum);
- TENSOR3D_PARAM_DECLARATION(dst);
-#endif // SOFTMAX_LAYER_MAX
-};
-
-#ifdef SOFTMAX_LAYER_MAX
/** Identifies the maximum value across the 1st dimension.
*
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed.
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] width Input image width
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[in] width Input image width
*/
+SHADER_PARAMS_DECLARATION
+{
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes dst_attrs;
+ uint width;
+};
+
+#if defined(DATA_TYPE_FP32)
+
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
void main(void)
{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
// Initialize local maximum
- vec4 max_val = CONVERT(type_min, vec4);
+ vec4 max_val = vec4_min;
// Calculate max of row
uint width2 = width >> 2;
for(int i = 0; i < int(width2); i++)
{
- vec4 data = LOAD16(src, offset(src, i << 2, 0));
+ vec4 data = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
max_val = MAX_OP(data, max_val);
}
@@ -124,7 +85,7 @@ void main(void)
// Handle non multiple of 4
for(int i = int(width2 << 2); i < int(width); i++)
{
- float data = LOAD4(src, offset(src, i, 0));
+ float data = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
max_val.x = MAX_OP(data, max_val.x);
}
#endif /* NON_MULTIPLE_OF_4 */
@@ -134,408 +95,247 @@ void main(void)
max_val.x = MAX_OP(max_val.x, max_val.y);
// Store result
- STORE4(dst, CURRENT_OFFSET(dst), max_val.x);
+ STORE_CURRENT_ITEM(dst_ptr, dst_iter, max_val.x);
}
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) // SOFTMAX_LAYER_MAX
-/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
- * then gets the exponent of each element as sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
- *
- * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in] width Input image width
- */
+#elif defined(DATA_TYPE_FP16)
+
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+
void main(void)
{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+ ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
- // Load max value of 1D logits vector (row)
- vec4 max_val = CONVERT(LOAD4(max, CURRENT_OFFSET(max)), vec4);
-
- // Set sum vector
- vec4 sum1D = CONVERT(0, vec4);
+ // Initialize local maximum
+ vec4 max_val = vec4_min;
- // Shift values, exp and sum
+ // Calculate max of row
uint width2 = width >> 2;
for(int i = 0; i < int(width2); i++)
{
- vec4 data = LOAD16(src, offset(src, i << 2, 0));
- data = SUB_OP(data, max_val);
- data = EXP_OP(data);
- STORE16(dst, offset(dst, i << 2, 0), data);
- sum1D = ADD_OP(sum1D, data);
+ vec4 data = VLOAD2_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
+ max_val = MAX_OP(data, max_val);
}
#ifdef NON_MULTIPLE_OF_4
// Handle non multiple of 4
- for(int i = int(width2 << 2); i < int(width); i++)
+ for(int i = int(width2 << 2); i < int(width); i = i + 2)
{
- float data;
- data = LOAD4(src, offset(src, i, 0));
- data = SUB_OP(data, max_val.x);
- data = EXP_OP(data);
- STORE4(dst, offset(dst, i, 0), data);
- sum1D.x = ADD_OP(sum1D.x, data);
+ vec2 data = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+ max_val.x = MAX_OP(data.x, max_val.x);
+ if((i + 1) < int(width))
+ {
+ max_val.x = MAX_OP(data.y, max_val.x);
+ }
}
-#endif /* NON_MULTIPLE_OF_4 */
+#endif /* NON_MULTIPLE_OF_4 */
- // Perform min/max reduction
- sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
- sum1D.x = ADD_OP(sum1D.x, sum1D.y);
+ // Perform max reduction
+ max_val.xy = MAX_OP(max_val.xy, max_val.zw);
+ max_val.x = MAX_OP(max_val.x, max_val.y);
- // Calculate and store result
- STORE4(sum, CURRENT_OFFSET(sum), sum1D.x);
+ STORE_PACK2_CURRENT_ITEM_HALF(dst_ptr, dst_iter, max_val.xy);
}
-#elif defined(SOFTMAX_LAYER_NORM) // SOFTMAX_LAYER_MAX
-/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
+#else // DATA_TYPE_FP32
+#error Data type not supported
+#endif // DATA_TYPE_FP32
+#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
+
+/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
+ * then gets the exponent of each element as sums all elements across each row.
*
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP32"
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
+ * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed.
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] max_attrs The attributes of the max values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
+ * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] sum_attrs The attributes of the sum values tensor
+ * @param[in] width Input image width
*/
-void main(void)
-{
- Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
-
- // Load max value of 1D logits vector (row)
- vec4 sum_val = CONVERT(LOAD4(sum, offset(sum, 0, int(gl_GlobalInvocationID.y))), vec4);
- vec4 data = LOAD16(src, CURRENT_OFFSET(src));
- STORE16(dst, CURRENT_OFFSET(dst), DIV_OP(data, sum_val));
-}
-#endif // SOFTMAX_LAYER_MAX
-
-#elif defined(DATA_TYPE_FP16)
-precision mediump float;
-
-const float MINVAL1 = -1.0 / 0.0;
-vec4 type_min1 = CONVERT(MINVAL1, vec4);
-
-#define GC_LOAD4_IMAGE(r, name, x, y) \
- load_and_unpack(r.xy, name, x, y); \
- load_and_unpack(r.zw, name, (x + 2), y)
-
-#define GC_STORE4_IMAGE(r, name, x, y) \
- GC_STORE1_2D_OFFSET(uint(packHalf2x16(r.xy)), name, x, y); \
- GC_STORE1_2D_OFFSET(uint(packHalf2x16(r.zw)), name, (x + 2), y)
-
-#ifdef SOFTMAX_LAYER_MAX
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(dst, 2, uint, writeonly);
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(max, 2, uint, readonly);
-BUFFER_DECLARATION(dst, 3, uint, writeonly);
-BUFFER_DECLARATION(sum, 4, uint, writeonly);
-#elif defined(SOFTMAX_LAYER_NORM)
-BUFFER_DECLARATION(src, 1, uint, readonly);
-BUFFER_DECLARATION(sum, 2, uint, readonly);
-BUFFER_DECLARATION(dst, 3, uint, writeonly);
-#endif // SOFTMAX_LAYER_MAX
-
-layout(std140) uniform shader_params
+SHADER_PARAMS_DECLARATION
{
-#ifdef SOFTMAX_LAYER_MAX
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(dst);
- uint width;
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM)
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(max);
- TENSOR3D_PARAM_DECLARATION(dst);
- TENSOR3D_PARAM_DECLARATION(sum);
- uint width;
-#elif defined(SOFTMAX_LAYER_NORM)
- TENSOR3D_PARAM_DECLARATION(src);
- TENSOR3D_PARAM_DECLARATION(sum);
- TENSOR3D_PARAM_DECLARATION(dst);
-#endif // SOFTMAX_LAYER_MAX
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes max_attrs;
+ Tensor3DAttributes dst_attrs;
+ Tensor3DAttributes sum_attrs;
+ uint width;
};
+#if defined(DATA_TYPE_FP32)
-#define load_and_unpack(rs, names, xs, ys) \
- do \
- { \
- uint packed_s; \
- GC_LOAD1_2D_OFFSET(packed_s, names, xs, ys); \
- rs = vec2(unpackHalf2x16(packed_s)); \
- } while(false)
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, maxBuffer, float, max_ptr, max_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(4, sumBuffer, float, sum_ptr, sum_shift, 2, writeonly);
-#ifdef SOFTMAX_LAYER_MAX
-/** Identifies the maximum value across the 1st dimension.
- *
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] width Input image width
- */
void main(void)
{
- Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
+ ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+ ImageIterator max_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(max_attrs, max_shift);
+ ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(sum_attrs, sum_shift);
- // Initialize local maximum
- vec4 max_val1 = CONVERT(type_min1, vec4);
+ // Load max value of 1D logits vector (row)
+ vec4 max_val = vec4(LOAD_CURRENT_ITEM(max_ptr, max_iter));
- // Calculate max of row
+ // Set sum vector
+ vec4 sum1D = vec4(0);
+
+ // Shift values, exp and sum
uint width2 = width >> 2;
for(int i = 0; i < int(width2); i++)
{
- vec4 data1;
- GC_LOAD4_IMAGE(data1, src, (i << 2), 0);
- max_val1 = MAX_OP(data1, max_val1);
+ vec4 data = VLOAD4(vec4, src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
+ data = SUB_OP(data, max_val);
+ data = EXP_OP(data);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, i << 2, 0), data);
+ sum1D = ADD_OP(sum1D, data);
}
#ifdef NON_MULTIPLE_OF_4
// Handle non multiple of 4
- for(int i = int(width2 << 2); i < int(width); i = i + 2)
+ for(int i = int(width2 << 2); i < int(width); i++)
{
- vec2 data;
- load_and_unpack(data, src, i, 0);
- max_val1.x = MAX_OP(data.x, max_val1.x);
- if((i + 1) < int(width))
- {
- max_val1.x = MAX_OP(data.y, max_val1.x);
- }
+ float data = LOAD(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+ data = SUB_OP(data, max_val.x);
+ data = EXP_OP(data);
+ STORE(dst_ptr, IMAGE_OFFSET(dst_iter, i, 0), data);
+ sum1D.x = ADD_OP(sum1D.x, data);
}
-#endif /* NON_MULTIPLE_OF_4 */
+#endif /* NON_MULTIPLE_OF_4 */
- // Perform max reduction
- max_val1.xy = MAX_OP(max_val1.xy, max_val1.zw);
- max_val1.x = MAX_OP(max_val1.x, max_val1.y);
- vec2 res1 = vec2(max_val1.x, 0.f);
- uint res;
- res = uint(packHalf2x16(res1));
+ // Perform min/max reduction
+ sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
+ sum1D.x = ADD_OP(sum1D.x, sum1D.y);
- // Store result
- GC_STORE1_2D_OFFSET(res, dst, 0, 0);
+ // Calculate and store result
+ STORE_CURRENT_ITEM(sum_ptr, sum_iter, sum1D.x);
}
-#elif defined(SOFTMAX_LAYER_SHIFT_EXP_SUM) // SOFTMAX_LAYER_MAX
-/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
- * then gets the exponent of each element as sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
- *
- * @note In case the input is not multiple of 4 NON_MULTIPLE_OF_4 must be passed.
- *
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
- * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
- * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes)
- * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[in] width Input image width
- */
+#elif defined(DATA_TYPE_FP16)
+
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, maxBuffer, uint, max_ptr, max_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+TENSOR_DECLARATION(4, sumBuffer, uint, sum_ptr, sum_shift, 2, writeonly);
+
void main(void)
{
- Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Image max = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
- Image sum = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+ ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+ ImageIterator max_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(max_attrs, max_shift);
+ ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(sum_attrs, sum_shift);
// Load max value of 1D logits vector (row)
- vec2 datamaxinit;
- load_and_unpack(datamaxinit, max, 0, 0);
- vec4 max_val = CONVERT(datamaxinit.x, vec4);
+ vec2 datamaxinit = LOAD_UNPACK2_CURRENT_ITEM_HALF(max_ptr, max_iter);
+ vec4 max_val = vec4(datamaxinit.x);
// Set sum vector
- vec4 sum1D1 = CONVERT(0.f, vec4);
+ vec4 sum1D = vec4(0.f);
// Shift values, exp and sum
uint width2 = width >> 2;
for(int i = 0; i < int(width2); i++)
{
- vec4 data;
- GC_LOAD4_IMAGE(data, src, (i << 2), 0);
- data = SUB_OP(data, max_val);
- data = EXP_OP(data);
- GC_STORE4_IMAGE(data, dst, (i << 2), 0);
- sum1D1 = ADD_OP(sum1D1, data);
+ vec4 data = VLOAD2_UNPACK4_HALF(src_ptr, IMAGE_OFFSET(src_iter, i << 2, 0));
+ data = SUB_OP(data, max_val);
+ data = EXP_OP(data);
+ VSTORE2_PACK4_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i << 2, 0), data);
+ sum1D = ADD_OP(sum1D, data);
}
#ifdef NON_MULTIPLE_OF_4
// Handle non multiple of 4
for(int i = int(width2 << 2); i < int(width); i = i + 2)
{
- vec2 datamiddle;
- float data1;
- load_and_unpack(datamiddle, src, i, 0);
- data1 = SUB_OP(datamiddle.x, max_val.x);
- data1 = EXP_OP(data1);
- vec2 datares1;
+ float data;
+ vec2 datamiddle = LOAD_UNPACK2_HALF(src_ptr, IMAGE_OFFSET(src_iter, i, 0));
+ data = SUB_OP(datamiddle.x, max_val.x);
+ data = EXP_OP(data);
+ vec2 datares;
if((i + 1) < int(width))
{
float data2;
- data2 = SUB_OP(datamiddle.y, max_val.x);
- data2 = EXP_OP(data2);
- datares1 = vec2(data1, data2);
- data1 = ADD_OP(data2, data1);
+ data2 = SUB_OP(datamiddle.y, max_val.x);
+ data2 = EXP_OP(data2);
+ datares = vec2(data, data2);
+ data = ADD_OP(data2, data);
}
else
{
- datares1 = vec2(data1, 0.f);
+ datares = vec2(data, 0.f);
}
- uint datares;
- datares = uint(packHalf2x16(datares1));
- GC_STORE1_2D_OFFSET(datares, dst, i, 0);
- sum1D1.x = ADD_OP(sum1D1.x, data1);
+
+ STORE_PACK2_HALF(dst_ptr, IMAGE_OFFSET(dst_iter, i, 0), datares);
+
+ sum1D.x = ADD_OP(sum1D.x, data);
}
-#endif /* NON_MULTIPLE_OF_4 */
+#endif /* NON_MULTIPLE_OF_4 */
// Perform min/max reduction
- sum1D1.xy = ADD_OP(sum1D1.xy, sum1D1.zw);
- sum1D1.x = ADD_OP(sum1D1.x, sum1D1.y);
- vec2 res1 = vec2(sum1D1.x, 0.f);
- uint res;
- res = uint(packHalf2x16(res1));
+ sum1D.xy = ADD_OP(sum1D.xy, sum1D.zw);
+ sum1D.x = ADD_OP(sum1D.x, sum1D.y);
+
// Calculate and store result
- GC_STORE1_2D_OFFSET(res, sum, 0, 0);
+ STORE_PACK2_CURRENT_ITEM_HALF(sum_ptr, sum_iter, sum1D.xy);
}
-#elif defined(SOFTMAX_LAYER_NORM) // SOFTMAX_LAYER_MAX
+#else // DATA_TYPE_FP32
+#error Data type not supported
+#endif // DATA_TYPE_FP32
+#elif defined(SOFTMAX_LAYER_NORM)
+
/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
*
- * @note Datatype must be given as a preprocessor argument using "#define DATA_TYPE_FP16"
+ * @note The data type must be passed at compile time using "#define DATA_TYPE_NAME". e.g. "#define DATA_TYPE_FP32"
*
- * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
- * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
- * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
- * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
- * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
- * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
+ * @param[in] src_attrs The attributes of the source tensor
+ * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] sum_attrs The attributes of the sum values tensor
+ * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
+ * @param[in] dst_attrs The attributes of the destination tensor
*/
+SHADER_PARAMS_DECLARATION
+{
+ Tensor3DAttributes src_attrs;
+ Tensor3DAttributes sum_attrs;
+ Tensor3DAttributes dst_attrs;
+};
+#if defined(DATA_TYPE_FP32)
+TENSOR_DECLARATION(1, srcBuffer, float, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, sumBuffer, float, sum_ptr, sum_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
void main(void)
{
- Image src = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
- Image dst = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
- Image sum = GC_CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
+ ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+ ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(sum_attrs, sum_shift);
// Load max value of 1D logits vector (row)
- vec2 sum1;
- load_and_unpack(sum1, sum, 0, int(gl_GlobalInvocationID.y));
- vec4 sum_val1 = CONVERT(sum1.x, vec4);
-
- vec4 data1;
- GC_LOAD4_IMAGE(data1, src, 0, 0);
- vec4 res = DIV_OP(data1, sum_val1);
- GC_STORE4_IMAGE(res, dst, 0, 0);
+ vec4 sum_val = vec4(LOAD(sum_ptr, IMAGE_OFFSET(sum_iter, 0, gl_GlobalInvocationID.y)));
+ vec4 data = VLOAD4_CURRENT_ITEM(vec4, src_ptr, src_iter);
+ VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, DIV_OP(data, sum_val));
}
-#endif // SOFTMAX_LAYER_MAX
-#endif // DATA_TYPE_FP32 \ No newline at end of file
+#elif defined(DATA_TYPE_FP16)
+TENSOR_DECLARATION(1, srcBuffer, uint, src_ptr, src_shift, 2, readonly);
+TENSOR_DECLARATION(2, sumBuffer, uint, sum_ptr, sum_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, uint, dst_ptr, dst_shift, 2, writeonly);
+void main(void)
+{
+ ImageIterator src_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(src_attrs, src_shift);
+ ImageIterator dst_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+ ImageIterator sum_iter = CONVERT_TENSOR3D_TO_IMAGE_ITERATOR_NO_STEP(sum_attrs, sum_shift);
+
+ // Load max value of 1D logits vector (row)
+ vec4 sum_val = vec4(LOAD_UNPACK2_HALF(sum_ptr, IMAGE_OFFSET(sum_iter, 0, gl_GlobalInvocationID.y)).x);
+ vec4 data = VLOAD2_UNPACK4_CURRENT_ITEM_HALF(src_ptr, src_iter);
+ VSTORE2_PACK4_CURRENT_ITEM_HALF(dst_ptr, dst_iter, DIV_OP(data, sum_val));
+}
+#else // DATA_TYPE_FP32
+#error Data type not supported
+#endif // DATA_TYPE_FP32
+#endif // SOFTMAX_LAYER_MAX
diff --git a/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
index d76ae8ff1c..c0f454d563 100644
--- a/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.cpp
@@ -80,11 +80,6 @@ void GCAbsoluteDifferenceKernel::configure(const IGCTensor *input1, const IGCTen
output_access.set_valid_region(win, valid_region);
- _kernel.clear_params();
-
- // set shader params binding point
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
index 42433cf076..b8672c662d 100644
--- a/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.cpp
@@ -99,10 +99,6 @@ void GCActivationLayerKernel::configure(IGCTensor *input, IGCTensor *output, Act
AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
}
- _kernel.clear_params();
-
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
index 9c24d2ef42..982143f0b2 100644
--- a/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.cpp
@@ -91,10 +91,6 @@ void GCBatchNormalizationLayerKernel::configure(const IGCTensor *input, IGCTenso
update_window_and_padding(win, input_access, output_access, mean_access, var_access, beta_access, gamma_access);
output_access.set_valid_region(win, input->info()->valid_region());
- _kernel.clear_params();
-
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
index 10716232c9..492f708a98 100644
--- a/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCCol2ImKernel.cpp
@@ -46,8 +46,6 @@ void GCCol2ImKernel::configure(const IGCTensor *input, IGCTensor *output,
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- _kernel.clear_params();
-
_input = input;
_output = output;
_convolved_dims = convolved_dims;
@@ -63,7 +61,7 @@ void GCCol2ImKernel::configure(const IGCTensor *input, IGCTensor *output,
// Set static kernel arguments
unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
- _kernel.set_params(idx++, _convolved_dims.first);
+ _kernel.set_argument(idx++, _convolved_dims.first);
// Configure window
Window win = calculate_max_window(*input->info(), Steps());
@@ -71,9 +69,6 @@ void GCCol2ImKernel::configure(const IGCTensor *input, IGCTensor *output,
// The GCCol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
- // set shader params binding point
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp
index 7f9f438a46..b90a8e7b89 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDepthConcatenateKernel.cpp
@@ -108,8 +108,6 @@ void GCDepthConcatenateKernel::configure(const IGCTensor *input, unsigned int de
update_window_and_padding(win, input_access, output_access);
output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
- _kernel.clear_params();
- _kernel.set_shader_params_binding_point(0);
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index 1fa2a71fff..5c7320aa8d 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -228,8 +228,6 @@ void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *inp
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name.str(), options));
- _kernel.clear_params();
-
unsigned int idx = (_bias == nullptr) ? 3 * num_arguments_per_3D_tensor() : (num_arguments_per_1D_tensor() + 3 * num_arguments_per_3D_tensor());
// Calculate output right and bottom border
@@ -290,11 +288,8 @@ void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *inp
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
- _kernel.set_params(idx++, _weights->info()->strides_in_bytes()[3]); // weights_stride_w
- _kernel.set_params(idx++, _weights->info()->dimension(2)); // weights_depth
-
- // set shader params binding point
- _kernel.set_shader_params_binding_point(0);
+ _kernel.set_argument(idx++, _weights->info()->strides_in_bytes()[3]); // weights_stride_w
+ _kernel.set_argument(idx++, _weights->info()->dimension(2)); // weights_depth
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCDropoutKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDropoutKernel.cpp
index 6244fbef80..cdd6a9d989 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDropoutKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDropoutKernel.cpp
@@ -53,7 +53,6 @@ void GCDropoutKernel::configure(const IGCTensor *input, IGCTensor *mask, IGCTens
_input = input;
_mask = mask;
_output = output;
- _kernel.clear_params();
std::set<std::string> build_opts;
std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
@@ -81,8 +80,6 @@ void GCDropoutKernel::configure(const IGCTensor *input, IGCTensor *mask, IGCTens
output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
- // set shader params binding point
- _kernel.set_shader_params_binding_point(0);
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
index 36742ef81e..b4efc0b9a0 100644
--- a/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCFillBorderKernel.cpp
@@ -54,7 +54,7 @@ void GCFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue
{
T value;
constant_border_value.get(value);
- _kernel.set_params(idx, static_cast<T>(value));
+ _kernel.set_argument(idx, static_cast<T>(value));
}
void GCFillBorderKernel::configure(const IGCTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
@@ -112,8 +112,6 @@ void GCFillBorderKernel::configure(const IGCTensor *tensor, BorderSize border_si
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel(kernel_name, build_opts));
_tensor = tensor;
- _kernel.clear_params();
-
// Create static kernel arguments
const unsigned int valid_width = tensor->info()->valid_region().shape[0];
const unsigned int valid_height = tensor->info()->valid_region().shape[1];
@@ -121,10 +119,10 @@ void GCFillBorderKernel::configure(const IGCTensor *tensor, BorderSize border_si
// Set static kernel arguments
unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters
- _kernel.set_params(idx++, valid_width);
- _kernel.set_params(idx++, valid_height);
- _kernel.set_params(idx++, tensor->info()->valid_region().anchor[0]);
- _kernel.set_params(idx++, tensor->info()->valid_region().anchor[1]);
+ _kernel.set_argument(idx++, valid_width);
+ _kernel.set_argument(idx++, valid_height);
+ _kernel.set_argument(idx++, tensor->info()->valid_region().anchor[0]);
+ _kernel.set_argument(idx++, tensor->info()->valid_region().anchor[1]);
if(BorderMode::CONSTANT == border_mode)
{
@@ -137,8 +135,6 @@ void GCFillBorderKernel::configure(const IGCTensor *tensor, BorderSize border_si
win.set(Window::DimY, Window::Dimension(0, 1, 1));
win.use_tensor_dimensions(tensor->info()->tensor_shape(), Window::DimZ);
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
index 5e3788af99..4bc6731064 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.cpp
@@ -84,11 +84,6 @@ void GCGEMMInterleave4x4Kernel::configure(const IGCTensor *input, IGCTensor *out
output_access.set_valid_region(win, input->info()->valid_region());
- _kernel.clear_params();
-
- // set shader params binding point
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
index 434070a46c..8625d371e5 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
@@ -80,10 +80,6 @@ void GCGEMMMatrixAccumulateBiasesKernel::configure(IGCTensor *accum, const IGCTe
update_window_and_padding(win, biases_access, accum_access);
- _kernel.clear_params();
- // set shader params binding point
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
index fa0415249a..cf5d37811f 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.cpp
@@ -74,10 +74,6 @@ void GCGEMMMatrixAdditionKernel::configure(const IGCTensor *input, IGCTensor *ou
output_access.set_valid_region(win, input->info()->valid_region());
- _kernel.clear_params();
- // set shader params binding point
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
index ea9b3874b2..a75ab6b609 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
@@ -155,8 +155,6 @@ void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTen
output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
}
- _kernel.clear_params();
- _kernel.set_shader_params_binding_point(0);
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
index a1270b4c3d..c361b60f84 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.cpp
@@ -92,10 +92,6 @@ void GCGEMMTranspose1xWKernel::configure(const IGCTensor *input, IGCTensor *outp
output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
- _kernel.clear_params();
- // set shader params binding point
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
index 935d8420ff..97c4dc48a1 100644
--- a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
@@ -52,7 +52,6 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, std::p
_input = input;
_output = output;
- _kernel.clear_params();
std::set<std::string> build_opts;
std::string dt_name = (input->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
@@ -141,8 +140,6 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, std::p
win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
}
- // set shader params binding point
- _kernel.set_shader_params_binding_point(0);
IGCKernel::configure(win);
}
@@ -189,9 +186,9 @@ void GCIm2ColKernel::run_generic(const Window &window)
add_3D_tensor_argument(idx, _input, 1, slice_in);
add_2D_tensor_argument(idx, _output, 2, slice_out);
- _kernel.set_params(idx++, static_cast<unsigned int>(_input->info()->dimension(2)));
- _kernel.set_params(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
- _kernel.set_params(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
+ _kernel.set_argument(idx++, static_cast<unsigned int>(_input->info()->dimension(2)));
+ _kernel.set_argument(idx++, static_cast<unsigned int>(_input->info()->strides_in_bytes()[3]));
+ _kernel.set_argument(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[3]));
_kernel.update_shader_params();
enqueue(*this, slice);
@@ -220,8 +217,8 @@ void GCIm2ColKernel::run_reduced(const Window &window)
add_3D_tensor_argument(idx, _input, 1, in_slice);
add_1D_tensor_argument(idx, _output, 2, out_slice);
- _kernel.set_params(idx++, _input->info()->dimension(0));
- _kernel.set_params(idx++, _input->info()->dimension(1));
+ _kernel.set_argument(idx++, _input->info()->dimension(0));
+ _kernel.set_argument(idx++, _input->info()->dimension(1));
_kernel.update_shader_params();
enqueue(*this, in_slice);
diff --git a/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
index 65e54f538c..c0c2445c6f 100644
--- a/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.cpp
@@ -92,10 +92,6 @@ void GCNormalizationLayerKernel::configure(const IGCTensor *input, const IGCTens
output_access.set_valid_region(win, input->info()->valid_region());
- _kernel.clear_params();
-
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
index 2b5cee455c..21e967a67a 100644
--- a/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.cpp
@@ -80,8 +80,6 @@ void GCPixelWiseMultiplicationKernel::configure(const IGCTensor *input1, const I
// Create kernel
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("pixelwise_mul_float", build_opts));
- _kernel.clear_params();
-
// Configure kernel window
constexpr unsigned int num_elems_processed_per_iteration = 1;
@@ -97,9 +95,6 @@ void GCPixelWiseMultiplicationKernel::configure(const IGCTensor *input1, const I
input2->info()->valid_region());
output_access.set_valid_region(win, valid_region);
- // set shader params binding point
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
index 073c3961f2..0b6ba583a3 100644
--- a/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.cpp
@@ -219,9 +219,6 @@ void GCPoolingLayerKernel::configure(const IGCTensor *input, IGCTensor *output,
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
}
- _kernel.clear_params();
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
diff --git a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
index 09a0f79ab2..29a1385f87 100644
--- a/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.cpp
@@ -75,11 +75,9 @@ void GCLogits1DMaxKernel::configure(const IGCTensor *input, IGCTensor *output)
// Create kernel
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("softmax_layer_max", build_opts));
- _kernel.clear_params();
-
// Set fixed arguments
unsigned int idx = 2 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
- _kernel.set_params(idx++, input->info()->dimension(0));
+ _kernel.set_argument(idx++, input->info()->dimension(0));
// Configure kernel window
// The kernel loops over all elements in steps of 4
@@ -98,46 +96,9 @@ void GCLogits1DMaxKernel::configure(const IGCTensor *input, IGCTensor *output)
output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
- // set shader params binding point
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
-void GCLogits1DMaxKernel::run(const Window &window)
-{
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- Window slice = window.first_slice_window_3D();
-
- _kernel.use();
-
- do
- {
- unsigned int idx1 = 0;
- switch(_input->info()->data_type())
- {
- case DataType::F16:
- add_3D_tensor_argument(idx1, _input, BufferParam(1, 2), slice);
- add_3D_tensor_argument(idx1, _output, BufferParam(2, 2), slice);
- break;
-
- case DataType::F32:
- add_3D_tensor_argument(idx1, _input, BufferParam(1, 2), slice);
- add_3D_tensor_argument(idx1, _output, BufferParam(2, 2), slice);
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is mot supported");
- break;
- }
-
- _kernel.update_shader_params();
- enqueue(*this, slice);
- }
- while(window.slide_window_slice_3D(slice));
-}
-
GCLogits1DShiftExpSumKernel::GCLogits1DShiftExpSumKernel()
: _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr)
{
@@ -179,11 +140,9 @@ void GCLogits1DShiftExpSumKernel::configure(const IGCTensor *input, const IGCTen
// Create kernel
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
- _kernel.clear_params();
-
// Set fixed arguments
unsigned int idx = 4 * num_arguments_per_3D_tensor(); //Skip the input and output parameters
- _kernel.set_params(idx++, input->info()->dimension(0));
+ _kernel.set_argument(idx++, input->info()->dimension(0));
// Configure window
// The kernel loops over all elements in steps of 4
@@ -206,9 +165,6 @@ void GCLogits1DShiftExpSumKernel::configure(const IGCTensor *input, const IGCTen
output_access.set_valid_region(win, input->info()->valid_region());
sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
- // set shader params binding point
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
@@ -224,28 +180,13 @@ void GCLogits1DShiftExpSumKernel::run(const Window &window)
do
{
- unsigned int idx = 0;
- switch(_input->info()->data_type())
- {
- case DataType::F16:
- add_3D_tensor_argument(idx, _input, BufferParam(1, 2), slice);
- add_3D_tensor_argument(idx, _max, BufferParam(2, 2), slice);
- add_3D_tensor_argument(idx, _output, BufferParam(3, 2), slice);
- add_3D_tensor_argument(idx, _sum, BufferParam(4, 2), slice);
- break;
-
- case DataType::F32:
- add_3D_tensor_argument(idx, _input, BufferParam(1, 2), slice);
- add_3D_tensor_argument(idx, _max, BufferParam(2, 2), slice);
- add_3D_tensor_argument(idx, _output, BufferParam(3, 2), slice);
- add_3D_tensor_argument(idx, _sum, BufferParam(4, 2), slice);
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is mot supported");
- break;
- }
-
+ unsigned int idx = 0;
+ unsigned int binding = 1; // SSBO binding starts from 1.
+ // Set inputs
+ add_3D_tensor_argument(idx, _input, binding++, slice);
+ add_3D_tensor_argument(idx, _max, binding++, slice);
+ add_3D_tensor_argument(idx, _output, binding++, slice);
+ add_3D_tensor_argument(idx, _sum, binding++, slice);
_kernel.update_shader_params();
enqueue(*this, slice);
}
@@ -303,11 +244,6 @@ void GCLogits1DNormKernel::configure(const IGCTensor *input, const IGCTensor *su
output_access.set_valid_region(win, input->info()->valid_region());
- _kernel.clear_params();
-
- // set shader params binding point
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}
@@ -326,25 +262,12 @@ void GCLogits1DNormKernel::run(const Window &window)
Window sum_slice = slice;
sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1));
- unsigned int idx1 = 0;
- switch(_input->info()->data_type())
- {
- case DataType::F16:
- add_3D_tensor_argument(idx1, _input, BufferParam(1, 2), slice);
- add_3D_tensor_argument(idx1, _sum, BufferParam(2, 2), slice);
- add_3D_tensor_argument(idx1, _output, BufferParam(3, 2), slice);
- break;
-
- case DataType::F32:
- add_3D_tensor_argument(idx1, _input, BufferParam(1, 2), slice);
- add_3D_tensor_argument(idx1, _sum, BufferParam(2, 2), slice);
- add_3D_tensor_argument(idx1, _output, BufferParam(3, 2), slice);
- break;
-
- default:
- ARM_COMPUTE_ERROR("Current data type is mot supported");
- break;
- }
+ unsigned int idx = 0;
+ unsigned int binding = 1; // SSBO binding starts from 1.
+ // Set inputs
+ add_3D_tensor_argument(idx, _input, binding++, slice);
+ add_3D_tensor_argument(idx, _sum, binding++, slice);
+ add_3D_tensor_argument(idx, _output, binding++, slice);
_kernel.update_shader_params();
enqueue(*this, slice);
diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
index b891b42ef8..5bd34c2c85 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
@@ -67,8 +67,6 @@ void GCTransposeKernel::configure(const IGCTensor *input, IGCTensor *output)
// Create kernel
_kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("transpose", build_opts));
- _kernel.clear_params();
-
// Configure kernel window
const unsigned int num_elems_processed_per_iteration = 4;
@@ -80,9 +78,6 @@ void GCTransposeKernel::configure(const IGCTensor *input, IGCTensor *output)
output_access.set_valid_region(win, input->info()->valid_region());
- // set shader params binding point
- _kernel.set_shader_params_binding_point(0);
-
IGCKernel::configure(win);
}