-rw-r--r--  src/core/CL/CLKernelLibrary.cpp                              |   1
-rw-r--r--  src/core/CL/cl_kernels/gemm.cl                               | 177
-rw-r--r--  src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp           |  15
-rw-r--r--  tests/benchmark/fixtures/WinogradConvolutionLayerFixture.h   |   6
4 files changed, 193 insertions, 6 deletions
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 955844da3e..ff4803e79c 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -251,6 +251,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "gemm_mv", "gemv.cl" },
{ "gemm_mv_quantized", "gemv.cl" },
{ "gemm_mm_interleaved_transposed_f16", "gemm.cl" },
+ { "gemm_mm_interleaved_transposed_f16_acc32", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_f32", "gemm.cl" },
{ "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" },
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 5d5cab6578..7de15d018a 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -879,6 +879,183 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
#endif // defined(REINTERPRET_OUTPUT_AS_3D)
}
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) while accumulating the result in 32-bit floating point variables.
+ * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case matrix B has 3 dimensions and matrix A has more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ * This case can happen when GEMM is used to perform element-wise multiplication through a batched matrix multiplication (2D Winograd) and there are multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. the output of a convolution layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+ IMAGE_DECLARATION(dst),
+ uint src0_stride_z,
+ uint src1_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+ )
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ float8 c00 = 0.0f;
+ float8 c10 = 0.0f;
+ float8 c20 = 0.0f;
+ float8 c30 = 0.0f;
+
+ for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = convert_float4(vload4(0, src_addr_a));
+ float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+ c00 += (float8)a0.s0 * b0;
+ c10 += (float8)a0.s1 * b0;
+ c20 += (float8)a0.s2 * b0;
+ c30 += (float8)a0.s3 * b0;
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
+ b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));
+
+ c00 += (float8)a0.s0 * b0;
+ c10 += (float8)a0.s1 * b0;
+ c20 += (float8)a0.s2 * b0;
+ c30 += (float8)a0.s3 * b0;
+ }
+
+ for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = convert_float4(vload4(0, src_addr_a));
+ float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+ c00 += (float8)a0.s0 * b0;
+ c10 += (float8)a0.s1 * b0;
+ c20 += (float8)a0.s2 * b0;
+ c30 += (float8)a0.s3 * b0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+#if defined(ALPHA)
+ // Multiply by the weight of matrix product
+ c00 = c00 * (float8)ALPHA;
+ c10 = c10 * (float8)ALPHA;
+ c20 = c20 * (float8)ALPHA;
+ c30 = c30 * (float8)ALPHA;
+#endif // defined(ALPHA)
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+ // in order to take into account the presence of possible cross plane paddings
+ //
+ //  |                  |
+ //  |      plane0      |
+ //  |                  |
+ //  |__________________|
+ //  |******************|
+ //  |  cross_plane_pad |
+ //  |******************|
+ //  |                  |
+ //  |      plane1      |
+ //  |                  |
+ //  |__________________|
+
+ // The plane (zout) is calculated by dividing the row index (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+ // Store 4x8 block
+ vstore8(convert_half8(c00), 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore8(convert_half8(c10), 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore8(convert_half8(c20), 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore8(convert_half8(c30), 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+ // Store 4x8 block
+ vstore8(convert_half8(c00), 0, (__global half *)(dst_addr + 0 * dst_stride_y));
+ vstore8(convert_half8(c10), 0, (__global half *)(dst_addr + 1 * dst_stride_y));
+ vstore8(convert_half8(c20), 0, (__global half *)(dst_addr + 2 * dst_stride_y));
+ vstore8(convert_half8(c30), 0, (__global half *)(dst_addr + 3 * dst_stride_y));
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+}
+
/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A (src0) and matrix B (src1)
* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
*
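
When REINTERPRET_OUTPUT_AS_3D is defined, the trickiest part of the new kernel is the zout computation: each of the four output rows written by a work-item is mapped to a plane of the 3D output, and rows that fall on a later plane receive an extra byte offset that skips the accumulated cross-plane padding. The following host-side sketch mirrors that arithmetic; HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, dst_stride_y and the work-item id are all made-up example values chosen so that the 4-row tile straddles a plane boundary.

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        // Example values, assumed only for illustration.
        const unsigned int HEIGHT_GEMM3D   = 6;  // height of the reinterpreted 3D output
        const unsigned int DEPTH_GEMM3D    = 4;  // depth of the reinterpreted 3D output
        const unsigned int cross_plane_pad = 2;  // padding rows between consecutive planes
        const unsigned int dst_stride_y    = 64; // destination row stride, in bytes

        const unsigned int gid1 = 1; // example get_global_id(1): this work-item writes rows 4..7

        // Mirror of the kernel's zout computation: plane index per output row, clamped to the
        // last plane, then scaled into a byte offset that skips the cross-plane padding.
        for(unsigned int i = 0; i < 4; ++i)
        {
            const unsigned int row  = gid1 * 4 + i;
            const unsigned int zout = std::min(DEPTH_GEMM3D - 1, row / HEIGHT_GEMM3D);
            const unsigned int off  = zout * cross_plane_pad * dst_stride_y;
            std::printf("row %u -> plane %u, extra offset %u bytes\n", row, zout, off);
        }
        return 0;
    }

With these values, rows 4 and 5 stay in plane 0 with no extra offset, while rows 6 and 7 fall into plane 1 and are shifted by cross_plane_pad * dst_stride_y bytes, which is exactly what adding zout.s2 and zout.s3 to the vstore8 addresses achieves in the kernel.
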
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index b549638343..c9ed7763da 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -292,6 +292,11 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
else
{
kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
+ if(fp_mixed_precision && data_type == DataType::F16)
+ {
+ // Currently the wider accumulator is only supported for FP16 kernels.
+ kernel_name += "_acc32";
+ }
}
}
else // The input tensors have not been reshaped
@@ -307,6 +312,11 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
if(input0->info()->num_dimensions() != 1)
{
kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
+ if(fp_mixed_precision && data_type == DataType::F16)
+ {
+ // Currently the wider accumulator is only supported for FP16 kernels.
+ kernel_name += "_acc32";
+ }
}
else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
{
@@ -319,11 +329,6 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
// The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
// via exhaustive autotuning over a range of representative layer configurations.
set_lws_hint(cl::NDRange(4));
- if(fp_mixed_precision && data_type == DataType::F16)
- {
- // currently wider accumulator is only supported for fp16 kernels.
- kernel_name += "_acc32";
- }
}
else // (MIDGARD and F32) or (F16)
{
diff --git a/tests/benchmark/fixtures/WinogradConvolutionLayerFixture.h b/tests/benchmark/fixtures/WinogradConvolutionLayerFixture.h
index 5f44517817..86ea9c4dc1 100644
--- a/tests/benchmark/fixtures/WinogradConvolutionLayerFixture.h
+++ b/tests/benchmark/fixtures/WinogradConvolutionLayerFixture.h
@@ -60,7 +60,11 @@ public:
dst = create_tensor<TensorType>(dst_shape, data_type, 1);
// Create and configure function
- conv_layer.configure(&src, &weights, &biases, &dst, info, act_info);
+ if(data_type == DataType::F16) {
+ conv_layer.configure(&src, &weights, &biases, &dst, info, act_info, true);
+ } else {
+ conv_layer.configure(&src, &weights, &biases, &dst, info, act_info);
+ }
// Allocate tensors
src.allocator()->allocate();
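
The fixture change above passes an extra boolean to configure() only for F16, presumably to exercise the mixed-precision (FP32-accumulating) path added by this patch. The motivation for the wider accumulator is that summing a long dot product in half precision loses low-order bits once the running total grows large. The sketch below illustrates the effect by analogy, using float as the narrow accumulator and double as the wide one, since standard C++ has no portable half type; the reduction length and addend are arbitrary.

    #include <cstdio>

    int main()
    {
        // Accumulate K small terms, as a GEMM dot product over the K dimension would.
        const int   K    = 1 << 20;  // arbitrary reduction length
        const float term = 0.001f;   // arbitrary small addend

        float  acc_narrow = 0.0f;    // stands in for an FP16 accumulator
        double acc_wide   = 0.0;     // stands in for the FP32 accumulator of the _acc32 kernel

        for(int k = 0; k < K; ++k)
        {
            acc_narrow += term;
            acc_wide   += term;
        }

        // The narrow accumulator drifts away from the reference K * term, while the wide one
        // stays close; the _acc32 kernel applies the same idea by accumulating in float and
        // converting back to half only on the final store (convert_half8).
        std::printf("narrow: %f  wide: %f  reference: %f\n",
                    acc_narrow, acc_wide, (double)K * term);
        return 0;
    }
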