From daa38559bb01eb3c2985f503e7b2b4dc850a34d3 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 28 Aug 2018 17:43:18 +0100 Subject: COMPMID-1433: Use Arm macro to check whether we support dot product instructions Change-Id: I70c0ee5adfac81dccae26b6756f424f4200ba584 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/145990 Tested-by: Jenkins Reviewed-by: Giorgio Arena --- .../cl_kernels/depthwise_convolution_quantized.cl | 24 +++++++++++----------- src/core/CL/cl_kernels/gemmlowp.cl | 18 ++++++++-------- src/core/CL/cl_kernels/helpers.h | 18 ++++++++-------- 3 files changed, 29 insertions(+), 31 deletions(-) (limited to 'src/core/CL/cl_kernels') diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl index fe902ed981..71889830c5 100644 --- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl +++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl @@ -37,13 +37,13 @@ #define ACTIVATION_FUNC(x) (x) #endif /* defined(FUSED_ACTIVATION) */ -#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) -#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) #define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val = arm_dot_acc((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3), val); -#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) +#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) #define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val += arm_dot((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3)); -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) @@ -51,7 +51,7 @@ #error "Stride X not supported" #endif /* CONV_STRIDE_X > 3 */ -#if !defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) +#if !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)) #if CONV_STRIDE_X == 1 #define GET_VALUES(first_value, left, middle, right) \ @@ -260,7 +260,7 @@ __kernel void depthwise_convolution_3x3_quantized_nchw( #endif /* CONV_STRIDE_Y == 1 */ } -#else // !defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) +#else // !(defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)) #if CONV_STRIDE_X == 1 #define GET_VALUES(first_value, left, middle, right) \ @@ -499,7 +499,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw( #endif /* CONV_STRIDE_Y == 1 */ } -#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) */ @@ -523,7 +523,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw( #define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc) #endif /* WEIGHTS_OFFSET != 0 */ -#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) \ ({ \ ARM_DOT(val0.s0, val1.s0, val2.s0, val3.s0, w0.s0, w1.s0, w2.s0, w3.s0, acc.s0); \ @@ -553,7 +553,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nchw( #define DOT_PRODUCT_ACCUMULATE(acc, sum, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1, w2, w3, w4, w5, w6, w7, w8) #endif /* WEIGHTS_OFFSET != 0 */ -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) /** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1. @@ -954,7 +954,7 @@ __kernel void depthwise_convolution_3x3_quantized_nhwc_stride1( } } -#if ARM_COMPUTE_OPENCL_DOT8_ENABLED +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) /** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product * * @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2) @@ -1159,7 +1159,7 @@ __kernel void depthwise_convolution_3x3_quantized_dot8_nhwc_stride1( (res3, 0, dst_addr + 1 * dst_stride_y + 1 * dst_stride_z); } } -#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl index cd8b269ae2..12ac811cc7 100644 --- a/src/core/CL/cl_kernels/gemmlowp.cl +++ b/src/core/CL/cl_kernels/gemmlowp.cl @@ -24,13 +24,13 @@ #include "helpers.h" #include "helpers_asymm.h" -#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) -#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) #define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val = arm_dot_acc((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3), val); -#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) +#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) #define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val += arm_dot((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3)); -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #if defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP) /** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) @@ -423,7 +423,7 @@ __kernel void gemmlowp_mm_interleaved_transposed_bifrost(IMAGE_DECLARATION(src0) vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(offset(&dst, 0, 3))); } -#if ARM_COMPUTE_OPENCL_DOT8_ENABLED +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) /** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1) * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication * @@ -587,7 +587,7 @@ __kernel void gemmlowp_mm_interleaved_transposed_bifrost_dot8(IMAGE_DECLARATION( vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(offset(&dst, 0, 2))); vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(offset(&dst, 0, 3))); } -#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP) @@ -1094,7 +1094,7 @@ __kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0), #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 } -#if ARM_COMPUTE_OPENCL_DOT8_ENABLED +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) /** OpenCL kernel optimized to use dot product that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped * * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A @@ -1339,7 +1339,7 @@ __kernel void gemmlowp_mm_bifrost_dot8(IMAGE_DECLARATION(src0), vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4))); #endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4 } -#endif // ARM_COMPUTE_OPENCL_DOT8_ENABLED +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A) diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h index 3f7a2a504b..7ff9d8eaec 100644 --- a/src/core/CL/cl_kernels/helpers.h +++ b/src/core/CL/cl_kernels/helpers.h @@ -24,23 +24,21 @@ #ifndef ARM_COMPUTE_HELPER_H #define ARM_COMPUTE_HELPER_H -#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) #pragma OPENCL EXTENSION cl_khr_fp16 : enable -#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16) -#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) #pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) -#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) +#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) #pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) -#if defined(ARM_COMPUTE_DEBUG_ENABLED) -#if defined(cl_arm_printf) +#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) #pragma OPENCL EXTENSION cl_arm_printf : enable -#endif // defined(cl_arm_printf) -#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) +#endif defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf) #define EXPAND(x) x -- cgit v1.2.1