From 0b76f7dd12240dc7a546c202ee80a7942d9898cd Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Thu, 8 Apr 2021 17:20:00 +0100 Subject: Add support for cl_image in CLDirectConvolutionLayer - The cl_image object can be used for the weights - cl_image can only work for f32/f16 - Fix the implicit padding on the first dimension X Resolves COMPMID-4341 Change-Id: I04e0901c69e7765c42afceca38c4a840645b9123 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5393 Reviewed-by: Giorgio Arena Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins --- src/core/CL/cl_kernels/direct_convolution.cl | 62 ++++++-------- src/core/CL/cl_kernels/tile_helpers.h | 71 ++++++++++++---- .../CL/cl_kernels/winograd_output_transform.cl | 8 +- .../gpu/cl/kernels/ClDirectConvolutionKernel.cpp | 94 +++++++++++++++++++--- src/runtime/CL/functions/CLConvolutionLayer.cpp | 2 +- 5 files changed, 169 insertions(+), 68 deletions(-) diff --git a/src/core/CL/cl_kernels/direct_convolution.cl b/src/core/CL/cl_kernels/direct_convolution.cl index 96196bda8d..220179effb 100644 --- a/src/core/CL/cl_kernels/direct_convolution.cl +++ b/src/core/CL/cl_kernels/direct_convolution.cl @@ -122,6 +122,7 @@ __kernel void direct_convolution_nhwc( #define _IDST_WIDTH DST_WIDTH #define _IDST_HEIGHT DST_HEIGHT #define _IDST_CHANNELS DST_CHANNELS +#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) // If quantized, the output tile has to be quantized first before being stored to global memory #if defined(IS_QUANTIZED) @@ -136,8 +137,8 @@ __kernel void direct_convolution_nhwc( // .v = access the whole vector (OpenCL vector) // .s[x] = access the vector element at position x (scalar access) - TILE(int, M0, 1, xi) = {{ { 0 } }}; - TILE(int, M0, 1, yi) = {{ { 0 } }}; + TILE(int, M0, 1, xi); + TILE(int, M0, 1, yi); // Convert the linear index to coordinate LOOP_UNROLLING(int, i, 0, M0, 1) @@ -148,29 +149,14 @@ __kernel void direct_convolution_nhwc( yi[i].v -= PAD_TOP; } - uint wei_x = 0; - // Initialize the accumulators - TILE(ACC_DATA_TYPE, M0, N0, c) = {{ { 0 } }}; + TILE(ACC_DATA_TYPE, M0, N0, c) = { { { 0 } } }; for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) { - uint src_x = 0; - int xk = i % _IWEI_WIDTH; - int yk = i / _IWEI_WIDTH; - - TILE(int, M0, 1, src_indirect_y) = {{ { 0 } }}; - TILE(int, M0, 1, src_indirect_mask) = {{ { 0 } }}; - - // Calculate the source indirect Y and the source indirect mask - // Since the indirect Y is clamped when out-of-bound, the mask is used to - // force to zero the out-of-bound values - LOOP_UNROLLING(int, i, 0, M0, 1) - { - src_indirect_y[i].v = (CLAMP(xi[i].v + xk, 0, (int)_ISRC_WIDTH - 1) + CLAMP(yi[i].v + yk, 0, (int)_ISRC_HEIGHT - 1) * _ISRC_WIDTH); - src_indirect_y[i].v += bout * (int)_ISRC_WIDTH * (int)_ISRC_HEIGHT; - src_indirect_mask[i].v = ((xi[i].v + xk) >= 0 && (xi[i].v + xk) < (int)_ISRC_WIDTH && (yi[i].v + yk) >= 0 && (yi[i].v + yk) < (int)_ISRC_HEIGHT); - } + int ck = 0; + int xk = i % _IWEI_WIDTH; + int yk = i / _IWEI_WIDTH; int k = 0; for(; k <= (_ISRC_CHANNELS - K0); k += K0) @@ -178,14 +164,16 @@ __kernel void direct_convolution_nhwc( TILE(SRC_DATA_TYPE, M0, K0, a); TILE(WEI_DATA_TYPE, N0, K0, b); + LOOP_UNROLLING(int, i, 0, M0, 1) + { + a[i].v = ZERO_VALUE; + } + // Load tile from the src tensor - T_LOAD_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, src_x, src_stride_y, src_indirect_y, a); + T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, 1, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); 
// Load tile from the weights tensor - T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, wei_x, cout, wei_stride_w, b); - - // Fill with zero the out-of-bound rows - T_ROWSET_MASK(SRC_DATA_TYPE, M0, K0, ZERO_VALUE, a, src_indirect_mask); + T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); // Compute the matrix multiplication between two tiles T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c); @@ -194,8 +182,7 @@ __kernel void direct_convolution_nhwc( // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c); - src_x += K0; - wei_x += K0; + ck += K0; } // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS @@ -207,14 +194,17 @@ __kernel void direct_convolution_nhwc( TILE(SRC_DATA_TYPE, M0, 1, a); TILE(WEI_DATA_TYPE, N0, 1, b); + LOOP_UNROLLING(int, i, 0, M0, 1) + { + a[i].v = ZERO_VALUE; + } + // Load tile from the src tensor - T_LOAD_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, src_x, src_stride_y, src_indirect_y, a); + T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, 1, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); // Load tile from the weights tensor - T_LOAD(WEI_DATA_TYPE, N0, 1, WEI_TENSOR_TYPE, wei, wei_x, cout, wei_stride_w, b); - - // Fill with zero the out-of-bound rows - T_ROWSET_MASK(SRC_DATA_TYPE, M0, 1, ZERO_VALUE, a, src_indirect_mask); + // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration + T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); // Compute the matrix multiplication between two tiles T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c); @@ -223,8 +213,7 @@ __kernel void direct_convolution_nhwc( // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c); - ++src_x; - ++wei_x; + ++ck; } #endif // ((SRC_CHANNELS % K0) != 0) } @@ -236,7 +225,7 @@ __kernel void direct_convolution_nhwc( #if defined(HAS_BIAS) TILE(BIA_DATA_TYPE, 1, N0, bias0); - T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 0, bias0); + T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0); // c = c + bias[broadcasted] T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); @@ -274,4 +263,5 @@ __kernel void direct_convolution_nhwc( #undef _IDST_WIDTH #undef _IDST_HEIGHT #undef _IDST_CHANNELS +#undef _IY_MULTIPLIER } \ No newline at end of file diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h index b963f8b5e3..496f2dd664 100644 --- a/src/core/CL/cl_kernels/tile_helpers.h +++ b/src/core/CL/cl_kernels/tile_helpers.h @@ -165,23 +165,26 @@ /** Load a tile from global memory (tensor) * - * @param[in] DATA_TYPE Data type - * @param[in] HEIGHT Number of dst rows - * @param[in] WIDTH Number of dst columns - * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). 
- * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16) - * @param[in] TENSOR Tensor basename - * @param[in] X Starting X position - * @param[in] Y Starting Y position - * @param[in] STRIDE_Y Stride Y (in bytes) - * @param[out] dst Output tile + * @param[in] DATA_TYPE Data type + * @param[in] HEIGHT Number of dst rows + * @param[in] WIDTH Number of dst columns + * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). + * In case of cl_image, only WIDTH multiples of 4 are supported (4, 8, 16) + * @param[in] TENSOR Tensor basename + * @param[in] X Starting X position + * @param[in] Y Starting Y position + * @param[in] YI_MULTIPLIER Parameter used to multiply the internal row increment (_i). + * In common cases should be 1 but it becomes useful when we want to load rows which are multiple of STRIDE_Y. (e.g. loading the weights of convolution layer). + * In this case the address calculation is performed as: (Y + _i * Y_MULTIPLIER) * STRIDE_Y + * @param[in] STRIDE_Y Stride Y (in bytes) used to load each row. + * @param[out] dst Output tile */ -#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y, dst) \ +#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst) \ ({ \ LOOP_UNROLLING(int, _i, 0, HEIGHT, 1) \ { \ - dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i), STRIDE_Y); \ - } \ + dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \ + } \ }) /** Load a tile from global memory (tensor) using an indirect Y index tile @@ -223,7 +226,7 @@ * @param[in] STRIDE_Y Stride Y (in bytes) * @param[out] dst Output tile */ -#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \ +#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \ ({ \ LOOP_UNROLLING(int, _yk, 0, (TILE_HEIGHT), 1) \ { \ @@ -235,9 +238,43 @@ if(_src_valid_y != 0) \ { \ dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \ - } \ - } \ - } \ + } \ + } \ + } \ + }) + +/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout using indirect X and Y coordinates + * + * @param[in] DATA_TYPE Data type + * @param[in] TILE_HEIGHT Number of elements to load from Y (height) dimension + * @param[in] TILE_WIDTH Number of elements to load from X (width) dimension + * @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension + * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). 
Currently BUFFER only is supported + * In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16) + * @param[in] TENSOR Tensor basename + * @param[in] B Starting batch index + * @param[in] Y Starting Y index + * @param[in] X Starting X index + * @param[in] C Starting C index + * @param[in] TENSOR_HEIGHT Number of elements to load from Y (height) dimension + * @param[in] TENSOR_WIDTH Number of elements to load from X (width) dimension + * @param[in] STRIDE_Y Stride Y (in bytes) + * @param[out] xi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect X coordinate + * @param[out] yi A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate + * @param[out] dst Output tile + */ +#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst) \ + ({ \ + LOOP_UNROLLING(int, _i, 0, (TILE_WIDTH * TILE_HEIGHT), 1) \ + { \ + int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH); \ + _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \ + int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \ + if(_src_valid_y != 0) \ + { \ + dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \ + } \ + } \ }) /** Store a tile to global memory (tensor) using an indirect Y index tile and conditionally use a different length for the store diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl index 674a138d48..6bd90604e5 100644 --- a/src/core/CL/cl_kernels/winograd_output_transform.cl +++ b/src/core/CL/cl_kernels/winograd_output_transform.cl @@ -637,7 +637,7 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc( #if defined(HAS_BIAS) TILE(DATA_TYPE, 1, N0, b); - T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 0, b); + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); // c = c + bias[broadcasted] T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); @@ -718,7 +718,7 @@ __kernel void winograd_output_transform_4x4_3x3_nhwc( #if defined(HAS_BIAS) TILE(DATA_TYPE, 1, N0, b); - T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 0, b); + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); // c = c + bias[broadcasted] T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out); @@ -1070,7 +1070,7 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( #if defined(HAS_BIAS) TILE(DATA_TYPE, 1, N0, b); - T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 0, b); + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); // c = c + bias[broadcasted] T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); @@ -1162,7 +1162,7 @@ __kernel void winograd_output_transform_4x4_5x5_nhwc( #if defined(HAS_BIAS) TILE(DATA_TYPE, 1, N0, b); - T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 0, b); + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); // c = c + bias[broadcasted] T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out); diff --git a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp b/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp index c6ca084386..bf26477895 100644 --- a/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp +++ b/src/core/gpu/cl/kernels/ClDirectConvolutionKernel.cpp @@ -33,12 +33,13 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "src/core/AccessWindowStatic.h" 
+#include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" +#include "src/core/CL/gemm/CLGEMMHelpers.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" - namespace arm_compute { namespace opencl @@ -276,7 +277,11 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso if(data_layout == DataLayout::NHWC) { const unsigned int vec_size = std::min(static_cast(dst->tensor_shape()[0]), 4u); - const unsigned int num_rows = dst->tensor_shape()[0] > 16 ? 2u : 1U; + unsigned int num_rows = 1U; + if(dst->tensor_shape()[0] > 16) + { + num_rows = src->data_type() == DataType::F32 ? 2U : 4U; + } // Create window and update padding Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); @@ -318,6 +323,50 @@ std::pair validate_and_configure_window(ITensorInfo *src, ITenso ARM_COMPUTE_ERROR("Not supported"); } } + +bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout) +{ + if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC)) + { + return false; + } + + // If not floating point + if(!is_data_type_float(tensor->data_type())) + { + return false; + } + + if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) + { + return false; + } + + // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform + if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + { + return false; + } + + // Check cl image pitch alignment + if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) + { + return false; + } + + const size_t image_w = tensor->tensor_shape()[0] / 4; + const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3]; + const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo(); + const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo(); + + if(image_w > max_image_w || image_h > max_image_h) + { + return false; + } + + return true; +} + } // namespace BorderSize ClDirectConvolutionKernel::border_size() const @@ -365,13 +414,19 @@ void ClDirectConvolutionKernel::configure(const CLCompileContext &compile_contex kernel_name << "direct_convolution_nhwc"; - const unsigned int n0 = win_config.second.x().step(); - const unsigned int m0 = win_config.second.y().step(); + const unsigned int n0 = win_config.second.x().step(); + const unsigned int m0 = win_config.second.y().step(); + const unsigned int k0 = adjust_vec_size(8u, src->dimension(channel_idx)); + const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0; + const unsigned int pad_left = conv_info.pad_left(); + const unsigned int pad_top = conv_info.pad_top(); + const bool export_to_cl_image = export_to_cl_image_support(weights, gpu_target, _data_layout); - const unsigned int k0 = adjust_vec_size(8u, src->dimension(channel_idx)); - const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0; - const unsigned int pad_left = conv_info.pad_left(); - const unsigned int pad_top = conv_info.pad_top(); + // Update the padding for the weights tensor if we can export to cl_image + if(export_to_cl_image) + { + arm_compute::cl_gemm::update_padding_for_cl_image(weights); + } if(biases != nullptr) { @@ -390,7 +445,7 @@ void ClDirectConvolutionKernel::configure(const CLCompileContext &compile_contex build_options.add_option("-DDST_HEIGHT=" + 
support::cpp11::to_string(dst->dimension(height_idx))); build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(channel_idx))); build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_options.add_option("-DWEI_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type())); @@ -533,13 +588,32 @@ void ClDirectConvolutionKernel::run_op(ITensorPack &tensors, const Window &windo if(_data_layout == DataLayout::NHWC) { - const size_t dim_y_collapsed = ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.y().step()); + cl::Image2D weights_cl_image; + + const size_t dim_y_collapsed = ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.y().step()); + const bool export_to_cl_image = export_to_cl_image_support(weights->info(), get_target(), _data_layout); + slice.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, slice.y().step())); slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(3), 1)); + if(export_to_cl_image) + { + const size_t image_w = weights->info()->dimension(0) / 4; + const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); + const TensorShape shape2d(image_w, image_h); + const size_t image_row_pitch = weights->info()->strides_in_bytes()[1]; + + // Export cl_buffer to cl_image + weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch); + } + unsigned int idx = 0; add_4D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice); + if(export_to_cl_image) + { + _kernel.setArg(idx++, weights_cl_image); + } add_4D_tensor_argument(idx, weights, slice); if(biases != nullptr) { diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index b1351f6747..ac18b966af 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -224,7 +224,7 @@ ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo * { return ConvolutionMethod::DIRECT; } - if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) > output->dimension(idx_c)) && (CLDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) + if((weights->dimension(idx_h) > 7) && (input->dimension(idx_c) >= output->dimension(idx_c)) && (CLDirectConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info))) { return ConvolutionMethod::DIRECT; } -- cgit v1.2.1
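
Note on the cl_image path introduced by this change (illustration only): run_op() re-exports the NHWC weights buffer as a 2D cl_image in which each texel packs four consecutive input channels and each image row holds the weights of one (output channel, kernel position) pair. That is why export_to_cl_image_support() requires the first weights dimension to be a multiple of 4, and why T_LOAD gained a YI_MULTIPLIER argument (kernel_w * kernel_h in this kernel) to step between rows that share the same kernel position. The small C++ sketch below only illustrates that mapping with an assumed example shape; the variable names are illustrative and not taken from the library.

// Illustrative sketch: how an NHWC weights tensor [IFM, kernel_w, kernel_h, OFM]
// is viewed as the 2D image used by the cl_image path. Shape values are assumed.
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t ifm = 64, kernel_w = 3, kernel_h = 3, ofm = 128;

    // Four consecutive input channels are packed into one RGBA texel, which is
    // why the export path rejects tensors whose first dimension is not a
    // multiple of 4.
    const std::size_t image_w = ifm / 4;               // texels per image row
    const std::size_t image_h = kernel_w * kernel_h * ofm; // one row per (kernel pos, cout)

    // Inside the kernel, the weights row for output channel `cout` at kernel
    // position `i` is cout * (kernel_w * kernel_h) + i; YI_MULTIPLIER
    // (= kernel_w * kernel_h) is the per-row stride between consecutive output
    // channels at the same kernel position.
    const std::size_t cout = 5, i = 4;
    const std::size_t row  = cout * (kernel_w * kernel_h) + i;

    std::cout << "cl_image size: " << image_w << " x " << image_h
              << ", row for (cout=5, i=4): " << row << "\n";
    return 0;
}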