From 38d93bdda73f9b1024c6b4b834b382f7f25aae19 Mon Sep 17 00:00:00 2001
From: Vidhya Sudhan Loganathan
Date: Tue, 20 Nov 2018 15:38:13 +0000
Subject: COMPMID-1801 : (Nightly) CLWinogradConvolutionLayer FP16 mismatches

FP mixed precision support added to the GEMM kernel used for FP16 Winograd
convolution on Midgard GPUs.

Change-Id: I1619beb025fc484a1ac9d3e528d785edabbc7ee6
---
 src/core/CL/CLKernelLibrary.cpp                    |   1 +
 src/core/CL/cl_kernels/gemm.cl                     | 177 +++++++++++++++++++++
 src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp |  15 +-
 .../fixtures/WinogradConvolutionLayerFixture.h     |   6 +-
 4 files changed, 193 insertions(+), 6 deletions(-)

diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 955844da3e..ff4803e79c 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -251,6 +251,7 @@ const std::map CLKernelLibrary::_kernel_program_map =
     { "gemm_mv", "gemv.cl" },
     { "gemm_mv_quantized", "gemv.cl" },
     { "gemm_mm_interleaved_transposed_f16", "gemm.cl" },
+    { "gemm_mm_interleaved_transposed_f16_acc32", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_f16_bifrost", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_f32", "gemm.cl" },
     { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm.cl" },
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl
index 5d5cab6578..7de15d018a 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/gemm.cl
@@ -879,6 +879,183 @@ __kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
 #endif // defined(REINTERPRET_OUTPUT_AS_3D)
 }
 
+/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) while accumulating the result in a 32-bit floating point variable.
+ *  Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (i.e. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case matrix B has 3 dimensions and matrix A has more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (i.e. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (i.e. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  cross_plane_pad                    (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
+                                                       IMAGE_DECLARATION(src1),
+                                                       IMAGE_DECLARATION(dst),
+                                                       uint src0_stride_z,
+                                                       uint src1_stride_z,
+                                                       uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                       ,
+                                                       uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                       )
+{
+    int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+    int z = get_global_id(2);
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+    const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+    __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+    // Compute end row address for matrix B
+    __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
+
+    // Reset accumulators
+    float8 c00 = 0.0f;
+    float8 c10 = 0.0f;
+    float8 c20 = 0.0f;
+    float8 c30 = 0.0f;
+
+    for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = convert_float4(vload4(0, src_addr_a));
+        float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+        c00 += (float8)a0.s0 * b0;
+        c10 += (float8)a0.s1 * b0;
+        c20 += (float8)a0.s2 * b0;
+        c30 += (float8)a0.s3 * b0;
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
+        b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));
+
+        c00 += (float8)a0.s0 * b0;
+        c10 += (float8)a0.s1 * b0;
+        c20 += (float8)a0.s2 * b0;
+        c30 += (float8)a0.s3 * b0;
+    }
+
+    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = convert_float4(vload4(0, src_addr_a));
+        float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+        c00 += (float8)a0.s0 * b0;
+        c10 += (float8)a0.s1 * b0;
+        c20 += (float8)a0.s2 * b0;
+        c30 += (float8)a0.s3 * b0;
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+#if defined(ALPHA)
+    // Multiply by the weight of matrix product
+    c00 = c00 * (float8)ALPHA;
+    c10 = c10 * (float8)ALPHA;
+    c20 = c20 * (float8)ALPHA;
+    c30 = c30 * (float8)ALPHA;
+#endif // defined(ALPHA)
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+    uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+    // Store 4x8 block
+    vstore8(convert_half8(c00), 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+    vstore8(convert_half8(c10), 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+    vstore8(convert_half8(c20), 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+    vstore8(convert_half8(c30), 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+    // Store 4x8 block
+    vstore8(convert_half8(c00), 0, (__global half *)(dst_addr + 0 * dst_stride_y));
+    vstore8(convert_half8(c10), 0, (__global half *)(dst_addr + 1 * dst_stride_y));
+    vstore8(convert_half8(c20), 0, (__global half *)(dst_addr + 2 * dst_stride_y));
+    vstore8(convert_half8(c30), 0, (__global half *)(dst_addr + 3 * dst_stride_y));
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+}
+
 /** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A (src0) and matrix B (src1)
  * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication
  *
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index b549638343..c9ed7763da 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -292,6 +292,11 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
         else
         {
             kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
+            if(fp_mixed_precision && data_type == DataType::F16)
+            {
+                // Currently, the wider accumulator is only supported for FP16 kernels.
+                kernel_name += "_acc32";
+            }
         }
     }
     else // The input tensors have not been reshaped
@@ -307,6 +312,11 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
             if(input0->info()->num_dimensions() != 1)
             {
                 kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
+                if(fp_mixed_precision && data_type == DataType::F16)
+                {
+                    // Currently, the wider accumulator is only supported for FP16 kernels.
+                    kernel_name += "_acc32";
+                }
             }
             else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
             {
@@ -319,11 +329,6 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
             // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
             // via exhaustive autotuning over a range of representative layer configurations.
             set_lws_hint(cl::NDRange(4));
-            if(fp_mixed_precision && data_type == DataType::F16)
-            {
-                // currently wider accumulator is only supported for fp16 kernels.
-                kernel_name += "_acc32";
-            }
         }
         else // (MIDGARD and F32) or (F16)
         {
diff --git a/tests/benchmark/fixtures/WinogradConvolutionLayerFixture.h b/tests/benchmark/fixtures/WinogradConvolutionLayerFixture.h
index 5f44517817..86ea9c4dc1 100644
--- a/tests/benchmark/fixtures/WinogradConvolutionLayerFixture.h
+++ b/tests/benchmark/fixtures/WinogradConvolutionLayerFixture.h
@@ -60,7 +60,11 @@ public:
         dst = create_tensor(dst_shape, data_type, 1);
 
         // Create and configure function
-        conv_layer.configure(&src, &weights, &biases, &dst, info, act_info);
+        if(data_type == DataType::F16){
+            conv_layer.configure(&src, &weights, &biases, &dst, info, act_info, true);
+        }else{
+            conv_layer.configure(&src, &weights, &biases, &dst, info, act_info);
+        }
 
         // Allocate tensors
         src.allocator()->allocate();
-- 
cgit v1.2.1
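
Editor's note (not part of the patch): the sketch below is an illustrative OpenCL C kernel that distils the mixed-precision pattern gemm_mm_interleaved_transposed_f16_acc32 relies on, namely FP16 loads and stores with FP32 accumulation. The kernel name, its arguments and the assumption that n is a multiple of 8 are hypothetical and only serve the example; the real kernel additionally handles the interleaved/transposed layout, the ALPHA scaling and the 3D output reinterpretation shown above.

#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Each work-item computes one dot product of length n (assumed to be a
// multiple of 8): FP16 data is widened to FP32 with convert_float8 before
// the multiply-accumulate, and narrowed back to FP16 only for the store,
// mirroring the float8 accumulators used by the acc32 kernel.
__kernel void dot_f16_acc32_example(__global const half *a,
                                    __global const half *b,
                                    __global half       *out,
                                    int                  n)
{
    const int row = get_global_id(0);

    __global const half *row_a = a + row * n;

    // Accumulator kept in 32-bit floating point
    float acc = 0.0f;

    for(int i = 0; i < n; i += 8)
    {
        float8 va = convert_float8(vload8(0, row_a + i));
        float8 vb = convert_float8(vload8(0, b + i));

        acc += dot(va.lo, vb.lo) + dot(va.hi, vb.hi);
    }

    // Narrow back to FP16 only when writing the final result
    out[row] = convert_half(acc);
}

Keeping the accumulators in FP32 trades a few extra conversions for accuracy, which is what removes the FP16 Winograd mismatches on Midgard; as the CLGEMMMatrixMultiplyKernel.cpp hunks above show, the "_acc32" kernel variant is only selected when fp_mixed_precision is requested for F16 data.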