From 578ab61332aede4e11b7d5d92dbd72993d807a08 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 23 Jun 2017 09:34:33 +0100 Subject: COMPMID-414 - Port CLConvolutionLayer to support 8 bit fixed point - CLGEMMMatrixAccumulateBiasesKernel Change-Id: Idba13b578dc564b8003ce2fa3392eea2af3ce806 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/78664 Tested-by: Kaizen Reviewed-by: Georgios Pinitas --- .../kernels/CLGEMMMatrixAccumulateBiasesKernel.h | 2 +- src/core/CL/CLKernelLibrary.cpp | 3 +- src/core/CL/cl_kernels/gemm.cl | 44 ++++++---------------- .../kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp | 12 +++--- 4 files changed, 20 insertions(+), 41 deletions(-) diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h index ea1db9f831..74a7a0e4a6 100644 --- a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h +++ b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h @@ -46,7 +46,7 @@ public: CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default; /** Set the accumulate buffer and the biases of the kernel. * - * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32 + * @param[in, out] accum The accumulate tensor to convert. Data types supported: QS8/F16/F32 * @param[in] biases The shared biases tensor to append. It must be 1D tensor. 
Data types supported: Same as @p input */ void configure(ICLTensor *accum, const ICLTensor *biases); diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 45a247db1a..6c64265785 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -151,8 +151,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "finalize", "optical_flow_pyramid_lk.cl" }, { "gaussian1x5_sub_x", "gaussian_pyramid.cl" }, { "gaussian5x1_sub_y", "gaussian_pyramid.cl" }, - { "gemm_accumulate_biases_f16", "gemm.cl" }, - { "gemm_accumulate_biases_f32", "gemm.cl" }, + { "gemm_accumulate_biases", "gemm.cl" }, { "gemm_interleave4x4_8bit", "gemm.cl" }, { "gemm_interleave4x4_16bit", "gemm.cl" }, { "gemm_interleave4x4_32bit", "gemm.cl" }, diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl index d80b5262a7..9bec8d5d92 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/gemm.cl @@ -247,6 +247,8 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src), } /** This kernel accumulates each row with the biases vector + * + * @note The data type must be passed at compile time -DDATA_TYPE=type. e.g. -DDATA_TYPE=short * * @param[in, out] accum_ptr Pointer to the accumulate tensor. 
Supported data type: F32 * @param[in] accum_stride_x Stride of the accmulate tensor in X dimension (in bytes) @@ -259,48 +261,24 @@ __kernel void gemm_interleave4x4_8bit(IMAGE_DECLARATION(src), * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor */ -__kernel void gemm_accumulate_biases_f32( - IMAGE_DECLARATION(accum), - VECTOR_DECLARATION(biases)) -{ - Image accum = CONVERT_TO_IMAGE_STRUCT(accum); - Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); - - float4 accum_value = vload4(0, (__global float *)accum.ptr); - float4 biases_value = vload4(0, (__global float *)biases.ptr); - accum_value = biases_value + accum_value; - - // Store result in the accummulate buffer - vstore4(accum_value, 0, (__global float *)accum.ptr); -} - -/** This kernel accumulates each row with the biases vector - * - * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported data type: F16 - * @param[in] accum_stride_x Stride of the accumulate tensor in X dimension (in bytes) - * @param[in] accum_step_x accum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] accum_stride_y Stride of the accumlulate tensor in Y dimension (in bytes) - * @param[in] accum_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the accumulate tensor - * @param[in] biases_ptr Pointer to the biases vector. Same as input. 
- * @param[in] biases_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] biases_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void gemm_accumulate_biases_f16( +#if(defined DATA_TYPE) +__kernel void gemm_accumulate_biases( IMAGE_DECLARATION(accum), VECTOR_DECLARATION(biases)) { Image accum = CONVERT_TO_IMAGE_STRUCT(accum); Vector biases = CONVERT_TO_VECTOR_STRUCT(biases); - half8 accum_value = vload8(0, (__global half *)accum.ptr); - half8 biases_value = vload8(0, (__global half *)biases.ptr); - accum_value = biases_value + accum_value; + VEC_DATA_TYPE(DATA_TYPE, 16) + accum_value = vload16(0, (__global DATA_TYPE *)accum.ptr); + VEC_DATA_TYPE(DATA_TYPE, 16) + biases_value = vload16(0, (__global DATA_TYPE *)biases.ptr); + accum_value = biases_value + accum_value; // Store result in the accummulate buffer - vstore8(accum_value, 0, (__global half *)accum.ptr); + vstore16(accum_value, 0, (__global DATA_TYPE *)accum.ptr); } +#endif // defined DATA_TYPE #if(defined WIDTH_MATRIX_B) /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp index 289873c23f..75c1a6e629 100644 --- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp @@ -43,20 +43,22 @@ CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel() void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32); + 
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(biases, accum); ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1); _biases = biases; _accum = accum; + std::set<std::string> build_opts; + build_opts.insert(("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type()))); + // Create kernel - std::string data_type_name = lower_string(string_from_data_type(accum->info()->data_type())); - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases_" + data_type_name)); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts)); // Configure kernel window - const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(accum->info()->data_type()); + const unsigned int num_elems_processed_per_iteration = 16; Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration)); -- cgit v1.2.1