COMPMID-1017: Implement dilated convolution in NEON, OpenCL, and GC

Change-Id: If4626ec9e215e14dffe22e80812da5bac84a52e2 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/125734 Reviewed-by: Anthony Barbier <anthony.barbier@arm.com> Tested-by: Jenkins <bsgcomp@arm.com>
author: Alex Gilday <alexander.gilday@arm.com> 2018-03-23 14:16:00 +0000
committer: Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:49:16 +0000
commit: 7da29b6b12ff319ed2b6e2c46588dfa1991556fb (patch)
tree: 24e766d916ae8da32deb5cd4fac4d82207cbe6ea /src/core
parent: f92cb23f06572fe73ec5ab9da0ec5713724c2dde (diff)
download: ComputeLibrary-7da29b6b12ff319ed2b6e2c46588dfa1991556fb.tar.gz
6 files changed, 119 insertions, 97 deletions
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index 75d99bda85..1e85e1b303 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -680,6 +680,7 @@ __kernel void im2col_generic_padx0_pady0_dchw(
  * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
  * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
@@ -722,10 +723,12 @@ __kernel void im2col_generic_dchw(
     __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
 
     // Linearize convolution elements
-    for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
+    for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
     {
-        for(int x = xi, x_e = xi + KERNEL_WIDTH; x < x_e; ++x, ++output_ptr)
+        int y = yi + yk * DILATION_Y;
+        for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
         {
+            int x = xi + xk * DILATION_X;
 #if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
             *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
 #else  // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 9bc4787384..cc19d3c263 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -41,11 +41,12 @@ using namespace arm_compute;
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, bool has_bias)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, bool has_bias, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
@@ -63,12 +64,12 @@ CLIm2ColKernel::CLIm2ColKernel()
 {
 }
 
-void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), has_bias));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), has_bias, dilation));
 
     _input       = input;
     _output      = output;
@@ -107,7 +108,7 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
 
         _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
                                             kernel_dims.width, kernel_dims.height,
-                                            conv_info);
+                                            conv_info, dilation);
 
         build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
         build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
@@ -122,77 +123,82 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
         build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
         build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
         build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+        build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+        build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
         build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->info()->quantization_info().offset), "-DPAD_VALUE=0");
 
         const bool squared_im2col = kernel_dims.width == kernel_dims.height;
 
-        if(squared_im2col && !is_data_type_fixed_point(data_type))
+        if(dilation == Size2D(1U, 1U))
         {
-            // Check if we can run an optimized im2col
-            switch(kernel_dims.width)
+            if(squared_im2col && !is_data_type_fixed_point(data_type))
             {
-                case 1:
-                    // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
-                    if(conv_info.stride().first == 1 && !conv_info.has_padding())
-                    {
-                        // Set hint for LWS
+                // Check if we can run an optimized im2col
+                switch(kernel_dims.width)
+                {
+                    case 1:
+                        // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
+                        if(conv_info.stride().first == 1 && !conv_info.has_padding())
+                        {
+                            // Set hint for LWS
+                            _lws_hint                          = cl::NDRange(1, 1, 8);
+                            _num_elems_processed_per_iteration = 4;
+                            is_optimized_path                  = true;
+                            kernel_name                        = "im2col1x1_stridex1_dchw";
+                        }
+                        break;
+                    case 3:
                         _lws_hint                          = cl::NDRange(1, 1, 8);
-                        _num_elems_processed_per_iteration = 4;
+                        _num_elems_processed_per_iteration = 1;
                         is_optimized_path                  = true;
-                        kernel_name                        = "im2col1x1_stridex1_dchw";
-                    }
-                    break;
-                case 3:
-                    _lws_hint                          = cl::NDRange(1, 1, 8);
-                    _num_elems_processed_per_iteration = 1;
-                    is_optimized_path                  = true;
-                    kernel_name                        = "im2col3x3_dchw";
-                    break;
-                case 5:
-                    _num_elems_processed_per_iteration = 1;
-                    is_optimized_path                  = true;
-                    kernel_name                        = "im2col5x5_dchw";
-                    break;
-                case 11:
-                    // Optimized im2col11x11 if pad_x = pad_y = 0
-                    if(!conv_info.has_padding())
-                    {
+                        kernel_name                        = "im2col3x3_dchw";
+                        break;
+                    case 5:
                         _num_elems_processed_per_iteration = 1;
                         is_optimized_path                  = true;
-                        kernel_name                        = "im2col11x11_padx0_pady0_dchw";
-                    }
-                    break;
-                default:
-                    is_optimized_path = false;
-                    break;
-            }
-        }
-        else if(kernel_dims.width > 1 && !conv_info.has_padding())
-        {
-            _num_elems_processed_per_iteration = 1;
-            kernel_name                        = "im2col_generic_padx0_pady0_dchw";
-
-            // Optimized im2col is performed using one or more vector operations with the specified vector size
-            // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
-            // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
-            // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
-            // Using the vector size of 8, however, may be faster.
-            size_t vector_size = 4;
-            // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
-            // is used instead.)
-            if(kernel_dims.width < vector_size)
-            {
-                vector_size = kernel_dims.width;
+                        kernel_name                        = "im2col5x5_dchw";
+                        break;
+                    case 11:
+                        // Optimized im2col11x11 if pad_x = pad_y = 0
+                        if(!conv_info.has_padding())
+                        {
+                            _num_elems_processed_per_iteration = 1;
+                            is_optimized_path                  = true;
+                            kernel_name                        = "im2col11x11_padx0_pady0_dchw";
+                        }
+                        break;
+                    default:
+                        is_optimized_path = false;
+                        break;
+                }
             }
-            // Local work size and vector size optimized for the 11x11 AlexNet convolution on Bifrost.
-            if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && kernel_dims.width == 11)
+            else if(kernel_dims.width > 1 && !conv_info.has_padding())
             {
-                _lws_hint   = cl::NDRange(1, 1, 1);
-                vector_size = 8;
+                _num_elems_processed_per_iteration = 1;
+                kernel_name                        = "im2col_generic_padx0_pady0_dchw";
+
+                // Optimized im2col is performed using one or more vector operations with the specified vector size
+                // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
+                // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
+                // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
+                // Using the vector size of 8, however, may be faster.
+                size_t vector_size = 4;
+                // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
+                // is used instead.)
+                if(kernel_dims.width < vector_size)
+                {
+                    vector_size = kernel_dims.width;
+                }
+                // Local work size and vector size optimized for the 11x11 AlexNet convolution on Bifrost.
+                if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && kernel_dims.width == 11)
+                {
+                    _lws_hint   = cl::NDRange(1, 1, 1);
+                    vector_size = 8;
+                }
+                const size_t width_mod_vector_size = kernel_dims.width % vector_size;
+                build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+                build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
             }
-            const size_t width_mod_vector_size = kernel_dims.width % vector_size;
-            build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
-            build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
         }
         _run_func = &CLIm2ColKernel::run_generic;
     }
@@ -206,7 +212,7 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
-    // Configure  kernel window
+    // Configure kernel window
     Window win;
     if(is_optimized_path)
     {
@@ -250,12 +256,12 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
-Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
 {
     ARM_COMPUTE_UNUSED(kernel_dims);
     ARM_COMPUTE_UNUSED(conv_info);
     ARM_COMPUTE_UNUSED(has_bias);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, has_bias));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, has_bias, dilation));
     return Status{};
 }
 
diff --git a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
index 2701f5b262..ad3f14d442 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/convolution_layer.cs
@@ -164,6 +164,7 @@ void main()
  * @note STRIDE_X/STRIDE_Y must be passed for stride info, e.g. "#define STRIDE_X xxx"
  * @note CONVOLVED_WIDTH/CONVOLVED_HEIGHT must be passed for convolved dimension, e.g. "#define CONVOLVED_WIDTH xxx"
  * @note SRC_WIDTH/SRC_HEIGHT must be passed for input dimension, e.g. "#define SRC_WIDTH xxx"
+ * @note DILATION_X/DILATION_Y must be passed for dilation sizes, e.g. "#define DILATION_X xxx"
  * @note In case biases will be added to the convolution "#define HAS_BIAS" has to be passed to append the final matrix with 1 in each row.
  *
  * @param[in]  src_ptr      Pointer to the source tensor. Supported data types: F16/F32
@@ -210,9 +211,9 @@ void main(void)
     uint src_pos = 0u;
 
     // Linearize convolution elements
-    for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT); y < y_e; ++y)
+    for(uint y = yi, y_e = yi + uint(KERNEL_HEIGHT) * uint(DILATION_Y); y < y_e; y += uint(DILATION_Y))
     {
-        for(uint x = xi, x_e = xi + uint(KERNEL_WIDTH); x < x_e; ++x, TENSOR_OFFSET_ADVANCE(dst_iter, 1u))
+        for(uint x = xi, x_e = xi + uint(KERNEL_WIDTH) * uint(DILATION_X); x < x_e; x += uint(DILATION_X), TENSOR_OFFSET_ADVANCE(dst_iter, 1u))
         {
 #if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
             src_pos = TENSOR_OFFSET_ADVANCE_IN_BYTES(src_iter, x * src_attrs.stride_x + y * src_attrs.stride_y);
diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
index 47bfebcc09..eb790471fb 100644
--- a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
@@ -65,7 +65,7 @@ GCIm2ColKernel::GCIm2ColKernel()
 {
 }
 
-void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -98,7 +98,8 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const
                                      && (std::equal(input->info()->tensor_shape().cbegin() + 3,
                                                     input->info()->tensor_shape().cend(),
                                                     output->info()->tensor_shape().cbegin() + 1))
-                                     && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding());
+                                     && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding())
+                                     && (dilation == Size2D(1U, 1U));
 
     std::string kernel_name = "im2col_generic";
     if(!run_img2col_reduced)
@@ -111,7 +112,7 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const
         build_opts.emplace("#define IM2COL_GENERIC");
         _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
                                             kernel_dims.width, kernel_dims.height,
-                                            conv_info);
+                                            conv_info, dilation);
         _num_elems_processed_per_iteration = 2;
 
         build_opts.emplace("#define KERNEL_WIDTH " + support::cpp11::to_string(kernel_dims.width));
@@ -127,6 +128,8 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const
         build_opts.emplace("#define PAD_BOTTOM " + support::cpp11::to_string(conv_info.pad_bottom()));
         build_opts.emplace("#define SRC_WIDTH " + support::cpp11::to_string(input->info()->dimension(0)));
         build_opts.emplace("#define SRC_HEIGHT " + support::cpp11::to_string(input->info()->dimension(1)));
+        build_opts.emplace("#define DILATION_X " + support::cpp11::to_string(dilation.x()));
+        build_opts.emplace("#define DILATION_Y " + support::cpp11::to_string(dilation.y()));
 
         _run_func = &GCIm2ColKernel::run_generic;
     }
@@ -206,11 +209,12 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, const
     IGCKernel::configure(win);
 }
 
-Status GCIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+Status GCIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
 {
     ARM_COMPUTE_UNUSED(kernel_dims);
     ARM_COMPUTE_UNUSED(conv_info);
     ARM_COMPUTE_UNUSED(has_bias);
+    ARM_COMPUTE_UNUSED(dilation);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
     return Status{};
 }
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index dee1608c43..348722c55d 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -45,12 +45,13 @@ using namespace arm_compute;
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                          bool has_bias, bool is_fully_connected, bool is_flatten)
+                          bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
 
     if(is_flatten) /* Called by FlattenLayer */
     {
@@ -59,7 +60,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     }
     else if(!is_fully_connected) /* Called by ConvolutionLayer */
     {
-        std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_dims.width, kernel_dims.height, conv_info);
+        std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_dims.width, kernel_dims.height, conv_info, dilation);
         ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (input->dimension(2) * kernel_dims.area() + (has_bias ? 1 : 0)));
         ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != (out_dims.first * out_dims.second));
         ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(2) != 1);
@@ -91,11 +92,13 @@ inline void linearize_volume(const uint8_t *const in_ptr,
                              int                  input_stride_y,
                              int                  input_stride_z,
                              int                  fixed_point_position,
-                             int                  pad_value)
+                             int                  pad_value,
+                             int                  dilation_x,
+                             int                  dilation_y)
 {
     const int kernel_size2 = kernel_width * kernel_height;
-    const int x_e          = top_left_x + kernel_width;
-    const int y_e          = top_left_y + kernel_height;
+    const int x_e          = top_left_x + kernel_width * dilation_x;
+    const int y_e          = top_left_y + kernel_height * dilation_y;
 
     // Linearize volume
     int d = 0;
@@ -104,12 +107,12 @@ inline void linearize_volume(const uint8_t *const in_ptr,
     // 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs
     for(; d <= (kernel_depth - 3); d += 3)
     {
-        for(int y = top_left_y; y < y_e; ++y)
+        for(int y = top_left_y; y < y_e; y += dilation_y)
         {
             if((y < 0 || y >= input_h) && has_pads)
             {
                 // All the values will be the offset (will be zeros when not quantized)
-                for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                 {
                     *(out_ptr + 0 * kernel_size2) = pad_value;
                     *(out_ptr + 1 * kernel_size2) = pad_value;
@@ -118,7 +121,7 @@ inline void linearize_volume(const uint8_t *const in_ptr,
             }
             else
             {
-                for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                 {
                     if((x < 0 || x >= input_w) && has_pads)
                     {
@@ -141,7 +144,7 @@ inline void linearize_volume(const uint8_t *const in_ptr,
     // Left over
     for(; d < kernel_depth; d++)
     {
-        for(int y = top_left_y; y < y_e; ++y)
+        for(int y = top_left_y; y < y_e; y += dilation_y)
         {
             if((y < 0 || y >= input_h) && has_pads)
             {
@@ -151,7 +154,7 @@ inline void linearize_volume(const uint8_t *const in_ptr,
             }
             else
             {
-                for(int x = top_left_x; x < x_e; ++x, ++out_ptr)
+                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                 {
                     if((x < 0 || x >= input_w) && has_pads)
                     {
@@ -251,7 +254,9 @@ void NEIm2ColKernel::run_generic(const Window &window)
                                       input_stride_y,
                                       input_stride_z,
                                       _input->info()->fixed_point_position(),
-                                      offset);
+                                      offset,
+                                      _dilation.x(),
+                                      _dilation.y());
     },
     in, out);
 }
@@ -309,27 +314,28 @@ void NEIm2ColKernel::run_reduced(const Window &window)
 }
 
 NEIm2ColKernel::NEIm2ColKernel()
-    : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false)
+    : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U)
 {
 }
 
 void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                               bool has_bias, bool is_fully_connected, bool is_flatten)
+                               bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Perform validation step
     ARM_COMPUTE_UNUSED(is_fully_connected, is_flatten);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten, dilation));
 
     _input          = input;
     _output         = output;
     _conv_info      = conv_info;
     _kernel_width   = kernel_dims.width;
-    _kernel_height  = kernel_dims.height,
+    _kernel_height  = kernel_dims.height;
+    _dilation       = dilation;
     _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
                                         _kernel_width, _kernel_height,
-                                        _conv_info);
+                                        _conv_info, _dilation);
     _has_bias = has_bias;
 
     unsigned int stride_x = 0;
@@ -340,7 +346,8 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
                                && (std::equal(input->info()->tensor_shape().cbegin() + 3,
                                               input->info()->tensor_shape().cend(),
                                               output->info()->tensor_shape().cbegin() + 1))
-                               && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding());
+                               && ((stride_x == 1) && (stride_y == 1) && !conv_info.has_padding())
+                               && ((dilation.x() == 1) && (dilation.y() == 1));
 
     Window window = calculate_max_window(*input->info(), Steps());
 
@@ -407,9 +414,9 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
 }
 
 Status NEIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                                bool has_bias, bool is_fully_connected, bool is_flatten)
+                                bool has_bias, bool is_fully_connected, bool is_flatten, const Size2D &dilation)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten, dilation));
     return Status{};
 }
 
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index f4b45532cf..4a237f9daa 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -292,7 +292,8 @@ const std::pair<unsigned int, unsigned int> arm_compute::deconvolution_output_di
 
 const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsigned int width, unsigned int height,
                                                                            unsigned int kernel_width, unsigned int kernel_height,
-                                                                           const PadStrideInfo &pad_stride_info)
+                                                                           const PadStrideInfo &pad_stride_info,
+                                                                           const Size2D        &dilation)
 {
     const unsigned int pad_left   = pad_stride_info.pad_left();
     const unsigned int pad_top    = pad_stride_info.pad_top();
@@ -305,12 +306,12 @@ const std::pair<unsigned int, unsigned int> arm_compute::scaled_dimensions(unsig
     switch(pad_stride_info.round())
     {
         case DimensionRoundingType::FLOOR:
-            w = static_cast<unsigned int>(std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
-            h = static_cast<unsigned int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+            w = static_cast<unsigned int>(std::floor((static_cast<float>(width + pad_left + pad_right - (dilation.x() * (kernel_width - 1) + 1)) / stride_x) + 1));
+            h = static_cast<unsigned int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - (dilation.y() * (kernel_height - 1) + 1)) / stride_y) + 1));
             break;
         case DimensionRoundingType::CEIL:
-            w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
-            h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+            w = static_cast<unsigned int>(std::ceil((static_cast<float>(width + pad_left + pad_right - (dilation.x() * (kernel_width - 1) + 1)) / stride_x) + 1));
+            h = static_cast<unsigned int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - (dilation.y() * (kernel_height - 1) + 1)) / stride_y) + 1));
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported rounding type");
author	Alex Gilday <alexander.gilday@arm.com>	2018-03-23 14:16:00 +0000
committer	Anthony Barbier <anthony.barbier@arm.com>	2018-11-02 16:49:16 +0000
commit	7da29b6b12ff319ed2b6e2c46588dfa1991556fb (patch)
tree	24e766d916ae8da32deb5cd4fac4d82207cbe6ea /src/core
parent	f92cb23f06572fe73ec5ab9da0ec5713724c2dde (diff)
download	ComputeLibrary-7da29b6b12ff319ed2b6e2c46588dfa1991556fb.tar.gz