aboutsummaryrefslogtreecommitdiff
path: root/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
diff options
context:
space:
mode:
author Joel Liang <joel.liang@arm.com> 2018-01-02 14:05:06 +0800
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:42:33 +0000
commit 6387543b0ad0fbfa77d988437b4ad159803de7c3 (patch)
tree c13935859f16d6f2e3e9855937cb1be38e8742ea /src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
parent 19e9142604edce6efbf117181578f8f408008134 (diff)
download ComputeLibrary-6387543b0ad0fbfa77d988437b4ad159803de7c3.tar.gz
APPBROWSER-371: Rewrite the direct_convolution3x3.cs with the new common code
Change-Id: I82a3ec133193433ba9ed3efcb49c51a2b95b16c0
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114962
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Zhenglin Li <zhenglin.li@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp')
-rw-r--r-- src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp 58
1 files changed, 29 insertions, 29 deletions
diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index ab78fb994b..06f9bce498 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -119,44 +119,44 @@ void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *inp
{
switch(input->info()->data_type())
{
+ case DataType::F16:
// TODO(APPBROWSER-299): Choose the most optimal path and remove others.
-#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16
+#define PROCESS_4X_3Y_1Z
- case DataType::F16:
-#if defined(PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16)
- options.emplace("#define PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16");
+#if defined(PROCESS_8X_3Y_1Z)
+ options.emplace("#define PROCESS_8X_3Y_1Z");
num_elems_read_per_iteration_x = 16;
num_elems_read_per_iteration_y = 5;
num_elems_written_per_iteration_x = 8;
num_elems_written_per_iteration_y = 3;
-#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16)
- options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16");
+#elif defined(PROCESS_4X_3Y_1Z)
+ options.emplace("#define PROCESS_4X_3Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_read_per_iteration_y = 5;
num_elems_written_per_iteration_x = 4;
num_elems_written_per_iteration_y = 3;
-#elif defined(PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16)
- options.emplace("#define PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16");
+#elif defined(PROCESS_4X_4Y_1Z)
+ options.emplace("#define PROCESS_4X_4Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_read_per_iteration_y = 6;
num_elems_written_per_iteration_x = 4;
num_elems_written_per_iteration_y = 4;
-#elif defined(PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16)
- options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16");
+#elif defined(PROCESS_4X_3Y_2Z)
+ options.emplace("#define PROCESS_4X_3Y_2Z");
num_elems_read_per_iteration_x = 8;
num_elems_read_per_iteration_y = 5;
num_elems_written_per_iteration_x = 4;
num_elems_written_per_iteration_y = 3;
num_elems_written_per_iteration_z = 2;
-#endif /* PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16 */
-#undef PROCESS_X_8ELEMENTS_Y_3ELEMENTS_FP16
-#undef PROCESS_X_4ELEMENTS_Y_3ELEMENTS_FP16
-#undef PROCESS_X_4ELEMENTS_Y_4ELEMENTS_FP16
-#undef PROCESS_X_4ELEMENTS_Y_3ELEMENTS_Z_2ELEMENTS_FP16
+#endif /* PROCESS_nX_nY_nZ */
+#undef PROCESS_8X_3Y_1Z
+#undef PROCESS_4X_3Y_1Z
+#undef PROCESS_4X_4Y_1Z
+#undef PROCESS_4X_3Y_2Z
break;
case DataType::F32:
- options.emplace("#define PROCESS_X_4ELEMENTS_Y_3ELEMENTS");
+ options.emplace("#define PROCESS_4X_3Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_read_per_iteration_y = 5;
num_elems_written_per_iteration_x = 4;
@@ -174,33 +174,33 @@ void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *inp
switch(input->info()->data_type())
{
case DataType::F16:
- options.emplace("#define PROCESS_X_4ELEMENTS_FP16");
+ options.emplace("#define PROCESS_4X_1Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_written_per_iteration_x = 4;
break;
case DataType::F32:
// TODO(APPBROWSER-299): Choose the most optimal path and remove others.
-#define PROCESS_4_ELEMENT
+#define PROCESS_4X_1Y_1Z
-#if defined(PROCESS_1_ELEMENT)
- options.emplace("#define PROCESS_1_ELEMENT");
+#if defined(PROCESS_1X_1Y_1Z)
+ options.emplace("#define PROCESS_1X_1Y_1Z");
num_elems_read_per_iteration_x = 3;
num_elems_written_per_iteration_x = 1;
-#elif defined(PROCESS_4_ELEMENT)
- options.emplace("#define PROCESS_4_ELEMENT");
+#elif defined(PROCESS_4X_1Y_1Z)
+ options.emplace("#define PROCESS_4X_1Y_1Z");
num_elems_read_per_iteration_x = 8;
num_elems_written_per_iteration_x = 4;
-#elif defined(PROCESS_8_ELEMENT)
- options.emplace("#define PROCESS_8_ELEMENT");
+#elif defined(PROCESS_8X_1Y_1Z)
+ options.emplace("#define PROCESS_8X_1Y_1Z");
num_elems_read_per_iteration_x = 12;
num_elems_written_per_iteration_x = 8;
-#else /* PROCESS_1_ELEMENT */
+#else /* PROCESS_nX_nY_nZ */
#error Have to declare how many elements to process in one thread.
-#endif /* PROCESS_1_ELEMENT */
-#undef PROCESS_1_ELEMENT
-#undef PROCESS_4_ELEMENT
-#undef PROCESS_8_ELEMENT
+#endif /* PROCESS_nX_nY_nZ */
+#undef PROCESS_1X_1Y_1Z
+#undef PROCESS_4X_1Y_1Z
+#undef PROCESS_8X_1Y_1Z
break;
default: