aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels/tile_helpers.h
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2021-04-01 16:17:16 +0100
committerGian Marco Iodice <gianmarco.iodice@arm.com>2021-04-08 10:00:11 +0000
commit534b889482967a4b4e7d6443bad4e4bdcb4999d4 (patch)
tree173890ba83eb6ce24266304c983a347b4d3fccc2 /src/core/CL/cl_kernels/tile_helpers.h
parent68508897deafe26b5d50566a6ca3ba70c728dd12 (diff)
downloadComputeLibrary-534b889482967a4b4e7d6443bad4e4bdcb4999d4.tar.gz
Rework the OpenCL Winograd Input Transformations NHWC
- Rework Winograd Input Transform 3x3 NHWC using the new macros
- Rework Winograd Input Transform 5x5 NHWC using the new macros
- Rework Winograd Input Transform 7x7 NHWC using the new macros
- The new implementation is also faster than before
- Winograd Input Transform 5x5/7x7 3x faster

Resolves COMPMID-4139

Change-Id: Ia9c8af23a2d47d2db60ec4c44650a63a34ffa0d5
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5358
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels/tile_helpers.h')
-rw-r--r--src/core/CL/cl_kernels/tile_helpers.h35
1 files changed, 35 insertions, 0 deletions
diff --git a/src/core/CL/cl_kernels/tile_helpers.h b/src/core/CL/cl_kernels/tile_helpers.h
index b72430c026..b963f8b5e3 100644
--- a/src/core/CL/cl_kernels/tile_helpers.h
+++ b/src/core/CL/cl_kernels/tile_helpers.h
@@ -205,6 +205,41 @@
} \
})
+/** Load a TILE_HEIGHT x TILE_WIDTH tile from global memory (tensor) when the tensor is stored using a NHWC layout, skipping the positions that fall outside the tensor
+ *
+ * @param[in] DATA_TYPE Data type
+ * @param[in] TILE_HEIGHT Number of elements to load from Y (height) dimension
+ * @param[in] TILE_WIDTH Number of elements to load from X (width) dimension
+ * @param[in] TILE_CHANNELS Number of elements to load from C (channel) dimension
+ * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently only BUFFER is supported
+ * In case of cl_image, only TILE_CHANNELS multiples of 4 are supported (4, 8, 16)
+ * @param[in] TENSOR Tensor basename
+ * @param[in] B Batch index. A single batch is read: B only offsets the flattened row index by B * TENSOR_WIDTH * TENSOR_HEIGHT
+ * @param[in] Y Starting Y index. May be negative or reach past the tensor: rows outside [0, TENSOR_HEIGHT) are not loaded
+ * @param[in] X Starting X index. May be negative or reach past the tensor: columns outside [0, TENSOR_WIDTH) are not loaded
+ * @param[in] C Starting C index
+ * @param[in] TENSOR_WIDTH Size of the tensor along the X (width) dimension. Used as the row pitch of the flattened index and as the exclusive upper bound for X
+ * @param[in] TENSOR_HEIGHT Size of the tensor along the Y (height) dimension. Used as the batch pitch (together with TENSOR_WIDTH) and as the exclusive upper bound for Y
+ * @param[in] STRIDE_Y Stride Y (in bytes). Forwarded to V_LOAD together with the flattened index X + Y * TENSOR_WIDTH + B * TENSOR_WIDTH * TENSOR_HEIGHT
+ * @param[out] dst Output tile, written as dst[xk + yk * TILE_WIDTH].v. Elements whose source position is outside the tensor are left unmodified, so the caller has to initialize dst beforehand if those elements are read
+ */
+#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \
+ ({ \
+ LOOP_UNROLLING(int, _yk, 0, (TILE_HEIGHT), 1) \
+ { \
+ LOOP_UNROLLING(int, _xk, 0, (TILE_WIDTH), 1) \
+ { \
+ int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH); \
+ _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
+ int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
+ if(_src_valid_y != 0) \
+ { \
+ dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
+ } \
+ } \
+ } \
+ })
+
/** Store a tile to global memory (tensor) using an indirect Y index tile and conditionally use a different length for the store
*
* @note If WIDTH1_CONDITION is true, the store will use the WIDTH1 length for the store