From 9a7182e8b53ab77d26b56752cc95b80f4e1774f2 Mon Sep 17 00:00:00 2001 From: Anthony Barbier Date: Tue, 11 Jul 2017 18:36:40 +0100 Subject: COMPMID-443 Use 3D tensor for pixel multiply (Needed for Normalization Layer) Change-Id: I117688f12334e6afc705c863acdf71b0bb1fc6e8 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/80352 Tested-by: Kaizen Reviewed-by: Georgios Pinitas --- src/core/CL/cl_kernels/pixelwise_mul_float.cl | 18 ++++++++++++------ src/core/CL/cl_kernels/pixelwise_mul_int.cl | 18 ++++++++++++------ .../CL/kernels/CLPixelWiseMultiplicationKernel.cpp | 12 ++++++------ 3 files changed, 30 insertions(+), 18 deletions(-) (limited to 'src') diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl index 89367dc0ce..98127e0311 100644 --- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl +++ b/src/core/CL/cl_kernels/pixelwise_mul_float.cl @@ -43,31 +43,37 @@ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32 * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16, F16, F32 * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image * @param[in] scale Float scaling factor. Supported data types: F32 */ __kernel void pixelwise_mul_float( - IMAGE_DECLARATION(in1), - IMAGE_DECLARATION(in2), - IMAGE_DECLARATION(out), + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out), const float scale) { // Get pixels pointer - Image in1 = CONVERT_TO_IMAGE_STRUCT(in1); - Image in2 = CONVERT_TO_IMAGE_STRUCT(in2); - Image out = CONVERT_TO_IMAGE_STRUCT(out); + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); // Load data VEC_DATA_TYPE(DATA_TYPE_RES, 16) diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl index a407a3264e..b5734a39ed 100644 --- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl +++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl @@ -59,31 +59,37 @@ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image * @param[in] scale Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1). */ __kernel void pixelwise_mul_int( - IMAGE_DECLARATION(in1), - IMAGE_DECLARATION(in2), - IMAGE_DECLARATION(out), + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out), const uint scale) { // Get pixels pointer - Image in1 = CONVERT_TO_IMAGE_STRUCT(in1); - Image in2 = CONVERT_TO_IMAGE_STRUCT(in2); - Image out = CONVERT_TO_IMAGE_STRUCT(out); + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); // Load data VEC_DATA_TYPE(DATA_TYPE_RES, 16) diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp index b95e8fac73..33c8b81c1d 100644 --- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp +++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp @@ -149,7 +149,7 @@ void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const I _kernel = static_cast(CLKernelLibrary::get().create_kernel(kernel_name, build_opts)); // Set scale argument - unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the inputs and output parameters + unsigned int idx = 3 * num_arguments_per_3D_tensor(); //Skip the inputs and output parameters if(scale_int >= 0) { @@ -183,15 +183,15 @@ void CLPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - Window slice = window.first_slice_window_2D(); + Window slice = window.first_slice_window_3D(); do { unsigned int idx = 0; - add_2D_tensor_argument(idx, _input1, slice); - add_2D_tensor_argument(idx, _input2, slice); - add_2D_tensor_argument(idx, _output, slice); + add_3D_tensor_argument(idx, _input1, slice); + add_3D_tensor_argument(idx, _input2, slice); + add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice); } - while(window.slide_window_slice_2D(slice)); + while(window.slide_window_slice_3D(slice)); } -- cgit v1.2.1