author    Michele Di Giorgio <michele.digiorgio@arm.com>  2018-04-26 10:24:30 +0100
committer Anthony Barbier <anthony.barbier@arm.com>       2018-11-02 16:52:35 +0000
commit    f6f08dac6d57770c191d1bc77123f0ddd2363d3f (patch)
tree      3409794e82c069398fb6eaf74f5fbce645adc2c9
parent    a4244190b6c7dc7d30d6adc621ca9a8b84b677ee (diff)
download  ComputeLibrary-f6f08dac6d57770c191d1bc77123f0ddd2363d3f.tar.gz
COMPMID-1044: Optimizing GCGEMM - Support for not reshaped GEMM on GLES
Change-Id: I22fe80393ec70e4501a4f9f9cad14014029d035d
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/129134
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
-rw-r--r--  src/core/CL/cl_kernels/gemm.cl                                 |   4
-rw-r--r--  src/core/GLES_COMPUTE/cs_shaders/gemm.cs                       | 188
-rw-r--r--  src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp   |  25
3 files changed, 203 insertions(+), 14 deletions(-)
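For orientation before the diff itself: the new GEMM_MM_FLOATING_POINT_BIFROST shader added below computes the product without reshaping either input. Each invocation walks the columns of matrix A four at a time and multiplies each scalar of A against a 4-wide slice of the corresponding row of B, keeping up to four row accumulators. A minimal scalar C++ sketch of that accumulation pattern follows; it is an illustration only, assuming row-major storage, and the names (gemm_row_slice, b_stride, col0) are not from the library.

#include <array>
#include <cstddef>
#include <vector>

// Illustrative sketch of the non-reshaped GEMM accumulation used by the new
// Bifrost shader: one "thread" produces a 1x4 slice of an output row,
// unrolling the walk over the columns of A four at a time (mirrors the vec4 loads).
std::array<float, 4> gemm_row_slice(const std::vector<float> &a_row,   // one row of A (COLS_A values)
                                    const std::vector<float> &b,       // B stored row-major
                                    std::size_t               b_stride, // elements per row of B
                                    std::size_t               col0,     // first of the 4 output columns
                                    float                     alpha)
{
    std::array<float, 4> acc{ 0.f, 0.f, 0.f, 0.f };
    std::size_t          k = 0;

    // Main loop: consume 4 columns of A per iteration.
    for(; k + 4 <= a_row.size(); k += 4)
    {
        for(std::size_t u = 0; u < 4; ++u)
        {
            const float a_val = a_row[k + u];
            for(std::size_t x = 0; x < 4; ++x)
            {
                acc[x] += a_val * b[(k + u) * b_stride + col0 + x];
            }
        }
    }
    // Leftover columns when COLS_A is not a multiple of 4.
    for(; k < a_row.size(); ++k)
    {
        for(std::size_t x = 0; x < 4; ++x)
        {
            acc[x] += a_row[k] * b[k * b_stride + col0 + x];
        }
    }
    // Scale by alpha, as the shader does before the final store.
    for(auto &v : acc)
    {
        v *= alpha;
    }
    return acc;
}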
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 69bc09f023..9ed3af8adc 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -1129,7 +1129,7 @@ __kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),
#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)
#if defined(DATA_TYPE)
#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
*
* @note This OpenCL kernel works with floating point data types (F16/F32)
* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
@@ -1304,7 +1304,7 @@ __kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),
}
#endif // defined(DATA_TYPE)
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
*
* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
diff --git a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
index 580acc16a7..e51908b5e5 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/gemm.cs
@@ -132,7 +132,7 @@ void main(void)
/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_attrs The attributes of the source matrix
@@ -221,7 +221,9 @@ void main()
/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
* @param[in] src0_attrs The attributes of the source matrix
@@ -345,6 +347,184 @@ void main()
}
#endif /* GEMM_MM_FLOATING_POINT */
+#ifdef GEMM_MM_FLOATING_POINT_BIFROST
+/** This OpenGL ES kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)
+ * in case both matrices have not been reshaped
+ *
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_attrs The attributes of the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_attrs The attributes of the source matrix
+ * @param[out] dst_ptr    Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in] dst_attrs The attributes of the destination matrix
+ */
+SHADER_PARAMS_DECLARATION
+{
+ ImageAttributes src0_attrs;
+ ImageAttributes src1_attrs;
+ ImageAttributes dst_attrs;
+};
+TENSOR_DECLARATION(1, src0Buffer, float, src0_ptr, src0_shift, 2, readonly);
+TENSOR_DECLARATION(2, src1Buffer, float, src1_ptr, src1_shift, 2, readonly);
+TENSOR_DECLARATION(3, dstBuffer, float, dst_ptr, dst_shift, 2, writeonly);
+
+void main()
+{
+ ImageIterator src0_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src0_attrs, src0_shift);
+ ImageIterator src1_iter = CONVERT_TO_IMAGE_ITERATOR_NO_STEP(src1_attrs, src1_shift);
+ ImageIterator dst_iter = CONVERT_TO_IMAGE_ITERATOR(dst_attrs, dst_shift);
+
+ int idx = int(gl_GlobalInvocationID.x) * int(NUM_ELEMS_PROCESSED_PER_THREAD_X);
+ /* Compute the address for the vector A and matrix B */
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src0_iter, uint(gl_GlobalInvocationID.y) * (src0_attrs.stride_y) * uint(NUM_ELEMS_PROCESSED_PER_THREAD_Y));
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, idx * 4);
+
+ /* Reset accumulators */
+ vec4 acc0 = vec4(0.0f);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vec4 acc1 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vec4 acc2 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vec4 acc3 = vec4(0.0f);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // A and B src indices get incremented at the same time.
+ int i = 0;
+ for(; i <= (COLS_A - 4); i += 4)
+ {
+ // Load values from matrix A and matrix B
+ vec4 a0 = VLOAD4_CURRENT_ITEM(vec4, src0_ptr, src0_iter);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vec4 a1 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vec4 a2 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vec4 a3 = VLOAD4(vec4, src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+
+ // Multiply and accumulate
+ acc0 += b0 * vec4(a0.x);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * vec4(a1.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * vec4(a2.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * vec4(a3.x);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix B
+ b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+
+ // Multiply and accumulate
+ acc0 += b0 * vec4(a0.y);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * vec4(a1.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * vec4(a2.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * vec4(a3.y);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix B
+ b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+
+ // Multiply and accumulate
+ acc0 += b0 * vec4(a0.z);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * vec4(a1.z);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * vec4(a2.z);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * vec4(a3.z);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix B
+ b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+
+ // Multiply and accumulate
+ acc0 += b0 * vec4(a0.w);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * vec4(a1.w);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * vec4(a2.w);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * vec4(a3.w);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ TENSOR_ITERATOR_ADVANCE(src0_iter, 4);
+ }
+
+ for(; i < COLS_A; ++i)
+ {
+ // Load values from matrix A
+ float a0 = LOAD_CURRENT_ITEM(src0_ptr, src0_iter);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3 = LOAD(src0_ptr, IMAGE_OFFSET(src0_iter, 0, 3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vec4 b0 = VLOAD4_CURRENT_ITEM(vec4, src1_ptr, src1_iter);
+
+ // Multiply and accumulate
+ acc0 += b0 * vec4(a0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * vec4(a1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * vec4(a2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * vec4(a3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ TENSOR_ITERATOR_ADVANCE_IN_BYTES(src1_iter, src1_attrs.stride_y);
+ TENSOR_ITERATOR_ADVANCE(src0_iter, 1);
+ }
+
+ /* Multiply by the weight of vector-matrix product */
+ acc0 = acc0 * vec4(ALPHA);
+ VSTORE4_CURRENT_ITEM(dst_ptr, dst_iter, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = acc1 * vec4(ALPHA);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 1), acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = acc2 * vec4(ALPHA);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 2), acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = acc3 * vec4(ALPHA);
+ VSTORE4(dst_ptr, IMAGE_OFFSET(dst_iter, 0, 3), acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+#endif /* GEMM_MM_FLOATING_POINT_BIFROST */
+
#ifdef GEMM_MATRIXADDITION
/** This OpenGL ES kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
*
@@ -462,7 +642,7 @@ void main(void)
/** This OpenGL ES kernel computes the matrix multiplication between matrix A(src0) and matrix B(src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
*
* @param[in] src0_ptr Pointer to the source matrix.Supported data types: F16
* @param[in] src0_attrs The attributes of the source matrix
@@ -837,7 +1017,7 @@ void main(void)
/** This OpenGL ES kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication
*
- * @attention The width of matrix B and the alpha's value need to be passed at compile time using WIDTH_MATRIX_B and ALPHA
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
*
* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
* @param[in] src0_attrs The attributes of the source matrix
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
index 2bd769cac4..d576c30f80 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
@@ -52,12 +52,13 @@ inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *i
{
ARM_COMPUTE_UNUSED(reshape_info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
if(!is_interleaved_transposed)
{
- ARM_COMPUTE_ERROR_ON(input0->dimension(0) != input1->dimension(1));
+ ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));
if(output->total_size() != 0)
{
@@ -141,18 +142,17 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
}
else // The input tensors have not been reshaped
{
- // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor
+ // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor.
+ num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
switch(data_type)
{
case DataType::F16:
num_elems_processed_per_iteration_x = 4;
- num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
break;
case DataType::F32:
num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type);
- num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
break;
default:
@@ -207,7 +207,6 @@ void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTen
// Create build options
std::set<std::string> build_opts;
std::string kernel_name;
- Window win;
build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
@@ -248,15 +247,26 @@ void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTen
{
// Special case for 1xN, 2xN, 3xN and 4xN input0 tensor
+ GPUTarget arch_target = get_arch_from_target(gpu_target);
switch(input0->info()->data_type())
{
case DataType::F16:
build_opts.emplace("#define DATA_TYPE_FP16");
build_opts.emplace("#define MM_PROCESS_4X_OPTIMIZED");
+ build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
break;
case DataType::F32:
build_opts.emplace("#define DATA_TYPE_FP32");
+
+ if(arch_target == GPUTarget::BIFROST && input0->info()->num_dimensions() != 1)
+ {
+ build_opts.emplace("#define GEMM_MM_FLOATING_POINT_BIFROST");
+ }
+ else
+ {
+ build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
+ }
break;
default:
@@ -264,7 +274,6 @@ void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTen
break;
}
- build_opts.emplace("#define GEMM_MM_FLOATING_POINT");
build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elements_processed.x()));
build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elements_processed.y()));