From 79acd77b9e737971f653cde640759670b27c673f Mon Sep 17 00:00:00 2001
From: Giorgio Arena
Date: Thu, 22 Oct 2020 14:29:50 +0100
Subject: COMPMID-3713 Remove OpenCL padding: CLDepthwiseConvolutionLayerNativeKernel

Signed-off-by: Giorgio Arena
Change-Id: Ic43aba8a6a0a106fc4c1f665ff5cc3ccb31f403d
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4235
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
---
 arm_compute/core/Utils.h                           |  2 +-
 src/core/CL/cl_kernels/depthwise_convolution.cl    | 32 ++++----
 .../cl_kernels/depthwise_convolution_quantized.cl  | 27 +++----
 .../CLDepthwiseConvolutionLayerNativeKernel.cpp    | 87 ++++------------------
 4 files changed, 46 insertions(+), 102 deletions(-)

diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 590bdf93c0..681a1a708e 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -1351,7 +1351,7 @@ inline unsigned int adjust_vec_size(unsigned int vec_size, size_t dim0)
 {
     ARM_COMPUTE_ERROR_ON(vec_size > 16);
 
-    if(dim0 == 3)
+    if((vec_size >= dim0) && (dim0 == 3))
     {
         return dim0;
     }
diff --git a/src/core/CL/cl_kernels/depthwise_convolution.cl b/src/core/CL/cl_kernels/depthwise_convolution.cl
index da22faabed..5aba2061b4 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution.cl
@@ -1311,7 +1311,7 @@ __kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
     //3x3 Convolution of elements starting in 0th row
     pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);
     //3x3 Convolution of elements starting in 2nd row
-    pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
+    pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);
 #endif /* DILATION_X==1 && DILATION_Y==1 */
 
 #ifdef HAS_BIAS
@@ -1324,7 +1324,7 @@
 }
 #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)
 
-#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP)
+#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER)
 /** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
@@ -1338,6 +1338,7 @@ __kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
  * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
  * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
  * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
  * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
  *
@@ -1384,23 +1385,25 @@ __kernel void dwc_MxN_native_fp_nhwc(
 #endif // defined(HAS_BIAS)
 )
 {
+    int x_offs = max((int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0) * sizeof(DATA_TYPE);
+
     int x = get_global_id(0); // channels
     int y = get_global_id(1); // spatial coordinate x
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
 #else // defined(DST_DEPTH)
-    int z = get_global_id(2); // spatial coordinate y
+    int z = get_global_id(2); // spatial coordinate y
 #endif // defined(DST_DEPTH)
 
-    __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0;
+    __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs;
 
-    __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0 + y * dst_stride_y + z * dst_stride_z;
+    __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z;
 
-    __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0;
+    __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER;
 
 #if defined(HAS_BIAS)
-    __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0;
+    __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER;
 #endif // defined(HAS_BIAS)
 
 #if defined(DST_DEPTH)
@@ -1412,7 +1415,7 @@ __kernel void dwc_MxN_native_fp_nhwc(
     {
         // Each work-item computes N0x1x1 elements
         VEC_DATA_TYPE(DATA_TYPE, N0)
-        res = 0;
+        res0 = 0;
 
         int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
         int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
@@ -1437,9 +1440,9 @@ __kernel void dwc_MxN_native_fp_nhwc(
                 w = VLOAD(N0)(0, (__global DATA_TYPE *)(w_addr + w_offset));
 #if GPU_ARCH == GPU_ARCH_MIDGARD
-                res += i * w;
+                res0 += i * w;
 #else // GPU_ARCH == GPU_ARCH_MIDGARD
-                res = fma(i, w, res);
+                res0 = fma(i, w, res0);
 #endif // GPU_ARCH == GPU_ARCH_MIDGARD
             }
             x_coord_tmp += DILATION_X;
         }
@@ -1449,13 +1452,12 @@ __kernel void dwc_MxN_native_fp_nhwc(
         }
 
 #if defined(HAS_BIAS)
-        res += VLOAD(N0)(0, (__global DATA_TYPE *)(b_addr));
+        res0 += VLOAD(N0)(0, (__global DATA_TYPE *)(b_addr));
 #endif // defined(HAS_BIAS)
 
-        res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
+        res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, res0, A_VAL, B_VAL);
 
-        VSTORE(N0)
-        (res, 0, (__global DATA_TYPE *)(d_addr));
+        STORE_VECTOR_SELECT(res, DATA_TYPE, d_addr, N0, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
 
         w_addr += sizeof(DATA_TYPE);
         d_addr += sizeof(DATA_TYPE);
@@ -1464,7 +1466,7 @@
 #endif // defined(HAS_BIAS)
     }
 }
-#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP)
+#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER)
 
 
 #if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index d4bea4b2e8..95cd44eb78 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -1616,7 +1616,7 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(
 #endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
 
 
-#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER)
+#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) && defined(VEC_SIZE_LEFTOVER)
 /** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
  *
  * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
@@ -1629,6 +1629,7 @@ __kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(
  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
  * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
  * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
  * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
  * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
  *
@@ -1685,8 +1686,8 @@ __kernel void dwc_MxN_native_quantized8_nhwc(
 #endif // defined(HAS_BIAS)
 )
 {
-    int x = get_global_id(0); // channels
-    int y = get_global_id(1); // spatial coordinate x
+    int x_offs = max((int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0);
+    int y = get_global_id(1); // spatial coordinate x
 #if defined(DST_DEPTH)
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
@@ -1694,19 +1695,19 @@ __kernel void dwc_MxN_native_quantized8_nhwc(
     int z = get_global_id(2); // spatial coordinate y
 #endif // defined(DST_DEPTH)
 
-    __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0;
+    __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE);
 
-    __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0 + y * dst_stride_y + z * dst_stride_z;
+    __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z;
 
-    __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x * sizeof(WEIGHTS_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0;
+    __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs * sizeof(WEIGHTS_TYPE) * (int)DEPTH_MULTIPLIER;
 
 #if defined(HAS_BIAS)
-    __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;
+    __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;
 #endif // defined(HAS_BIAS)
 
 #if defined(PER_CHANNEL_QUANTIZATION)
-    __global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;
-    __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;
+    __global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;
+    __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;
 #endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #if defined(DST_DEPTH)
@@ -1772,10 +1773,10 @@ __kernel void dwc_MxN_native_quantized8_nhwc(
     res += (VEC_INT)OUTPUT_OFFSET;
 
     VEC_TYPE(VEC_SIZE)
-    res1 = CONVERT_SAT(res, VEC_TYPE(VEC_SIZE));
+    res0 = CONVERT_SAT(res, VEC_TYPE(VEC_SIZE));
+    res0 = ACTIVATION_FUNC(res0);
 
-    VSTORE(N0)
-    (ACTIVATION_FUNC(res1), 0, (__global DATA_TYPE *)(d_addr));
+    STORE_VECTOR_SELECT(res, DATA_TYPE, d_addr, N0, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
 
 #if DEPTH_MULTIPLIER > 1
     w_addr += sizeof(WEIGHTS_TYPE);
@@ -1790,5 +1791,5 @@ __kernel void dwc_MxN_native_quantized8_nhwc(
     }
 #endif // DEPTH_MULTIPLIER > 1
 }
-#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER)
+#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) && defined(VEC_SIZE_LEFTOVER)
 #endif // defined(DATA_TYPE) && defined(WEIGHTS_TYPE)
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 5a3a0ec435..4580968d38 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -123,60 +123,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
 
     return Status{};
 }
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
-                                                        const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation,
-                                                        ITensorInfo *output_multipliers, ITensorInfo *output_shifts)
-{
-    ARM_COMPUTE_UNUSED(dwc_info);
-
-    // Get convolved dimensions
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
-
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
-
-    const unsigned int n0 = dwc_weights_info.n0;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output, Steps(n0));
-
-    // The following access windows are only valid in case of NHWC and because n0 must unit in case depth_multiplier > 1
-    AccessWindowHorizontal input_access(input, 0, n0);
-    AccessWindowHorizontal weights_access(weights, 0, n0);
-    AccessWindowHorizontal output_access(output, 0, n0);
-
-    bool window_changed = false;
-
-    if(bias != nullptr)
-    {
-        AccessWindowHorizontal bias_access(bias, 0, n0);
-        window_changed = update_window_and_padding(win, input_access, weights_access, bias_access, output_access);
-    }
-    else
-    {
-        window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
-    }
-
-    if(is_data_type_quantized(input->data_type()))
-    {
-        if((output_multipliers != nullptr) && (output_shifts != nullptr))
-        {
-            AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, n0);
-            AccessWindowHorizontal output_shifts_access(output_shifts, 0, n0);
-            window_changed = window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access);
-        }
-        else
-        {
-            Status err = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "output_multipliers and output_shifts must be non-nullptr for quantized input");
-            return std::make_pair(err, win);
-        }
-    }
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
 } // namespace
 
 CLDepthwiseConvolutionLayerNativeKernel::CLDepthwiseConvolutionLayerNativeKernel()
@@ -208,10 +154,10 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
                                                   dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
                                                   (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr));
 
-    auto win_config = validate_and_configure_window(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
-                                                    dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
-                                                    (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    auto padding_info = get_padding_info({ input, output });
+
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info, depth_multiplier, dilation);
+    auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
 
     _input = input;
     _output = output;
@@ -222,10 +168,7 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
     _output_shifts = output_shifts;
     _is_quantized = is_data_type_quantized(input->info()->data_type());
 
-    const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
-    const size_t weights_width = weights->info()->dimension(idx_w);
-    const size_t weights_height = weights->info()->dimension(idx_h);
+    const unsigned int n0 = adjust_vec_size(dwc_weights_info.n0, input->info()->dimension(0));
 
     CLBuildOptions build_opts;
     build_opts.add_option_if(_biases != nullptr, "-DHAS_BIAS");
@@ -233,17 +176,18 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
     build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(dwc_info.activation_info.activation())));
     build_opts.add_option("-DDEPTH_MULTIPLIER=" + support::cpp11::to_string(depth_multiplier));
-    build_opts.add_option("-DN0=" + support::cpp11::to_string(dwc_weights_info.n0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
     build_opts.add_option("-DSRC_DIM1=" + support::cpp11::to_string(_input->info()->dimension(1)));
     build_opts.add_option("-DSRC_DIM2=" + support::cpp11::to_string(_input->info()->dimension(2)));
-    build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(weights_width));
-    build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(weights_height));
+    build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(weights->info()->dimension(1)));
+    build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(weights->info()->dimension(2)));
     build_opts.add_option("-DCONV_PAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
     build_opts.add_option("-DCONV_PAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
     build_opts.add_option("-DCONV_STRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
     build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
     build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
     build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(_input->info()->dimension(0) % n0));
 
     std::string kernel_name = (_is_quantized) ? "dwc_MxN_native_quantized8_nhwc" : "dwc_MxN_native_fp_nhwc";
 
@@ -290,9 +234,13 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
         build_opts.add_option_if(dwc_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(dwc_info.activation_info.b()));
     }
 
-    ICLKernel::configure_internal(win_config.second);
+    Window win = calculate_max_window(*(output->info()), Steps(n0));
+    ICLKernel::configure_internal(win);
+
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+
     // Set config_id for enabling LWS tuning
     _config_id = kernel_name;
     _config_id += "_";
@@ -316,13 +264,6 @@ Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *inpu
                                                          unsigned int depth_multiplier, const Size2D &dilation, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation, output_multipliers, output_shifts));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(),
-                                                              biases != nullptr ? biases->clone().get() : nullptr,
-                                                              output->clone().get(), dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
-                                                              output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
-                                                              output_shifts != nullptr ? output_shifts->clone().get() : nullptr)
-                                .first);
-
     return Status{};
 }
 
--
cgit v1.2.1