aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/kernels
diff options
context:
space:
mode:
authorAnthony Barbier <anthony.barbier@arm.com>2017-09-04 18:44:23 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-09-17 13:03:09 +0100
commit6ff3b19ee6120edf015fad8caab2991faa3070af (patch)
treea7a6dcd16dfd56d79fa1b56a313caeebcc939b68 /src/core/CL/kernels
downloadComputeLibrary-6ff3b19ee6120edf015fad8caab2991faa3070af.tar.gz
COMPMID-344 Updated doxygen
Change-Id: I32f7b84daa560e460b77216add529c8fa8b327ae
Diffstat (limited to 'src/core/CL/kernels')
-rw-r--r--src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp102
-rw-r--r--src/core/CL/kernels/CLAccumulateKernel.cpp83
-rw-r--r--src/core/CL/kernels/CLActivationLayerKernel.cpp64
-rw-r--r--src/core/CL/kernels/CLArithmeticAdditionKernel.cpp111
-rw-r--r--src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp111
-rw-r--r--src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp115
-rw-r--r--src/core/CL/kernels/CLBitwiseAndKernel.cpp88
-rw-r--r--src/core/CL/kernels/CLBitwiseNotKernel.cpp48
-rw-r--r--src/core/CL/kernels/CLBitwiseOrKernel.cpp89
-rw-r--r--src/core/CL/kernels/CLBitwiseXorKernel.cpp89
-rw-r--r--src/core/CL/kernels/CLBox3x3Kernel.cpp77
-rw-r--r--src/core/CL/kernels/CLCannyEdgeKernel.cpp255
-rw-r--r--src/core/CL/kernels/CLChannelCombineKernel.cpp244
-rw-r--r--src/core/CL/kernels/CLChannelExtractKernel.cpp148
-rw-r--r--src/core/CL/kernels/CLCol2ImKernel.cpp85
-rw-r--r--src/core/CL/kernels/CLColorConvertKernel.cpp476
-rw-r--r--src/core/CL/kernels/CLConvolutionKernel.cpp330
-rw-r--r--src/core/CL/kernels/CLDepthConcatenateKernel.cpp113
-rw-r--r--src/core/CL/kernels/CLDepthConvertKernel.cpp99
-rw-r--r--src/core/CL/kernels/CLDerivativeKernel.cpp145
-rw-r--r--src/core/CL/kernels/CLDilateKernel.cpp65
-rw-r--r--src/core/CL/kernels/CLErodeKernel.cpp65
-rw-r--r--src/core/CL/kernels/CLFastCornersKernel.cpp172
-rw-r--r--src/core/CL/kernels/CLFillBorderKernel.cpp175
-rw-r--r--src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp106
-rw-r--r--src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp122
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp92
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp92
-rw-r--r--src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp168
-rw-r--r--src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp129
-rw-r--r--src/core/CL/kernels/CLGaussian3x3Kernel.cpp76
-rw-r--r--src/core/CL/kernels/CLGaussian5x5Kernel.cpp45
-rw-r--r--src/core/CL/kernels/CLGaussianPyramidKernel.cpp218
-rw-r--r--src/core/CL/kernels/CLHOGDescriptorKernel.cpp200
-rw-r--r--src/core/CL/kernels/CLHOGDetectorKernel.cpp130
-rw-r--r--src/core/CL/kernels/CLHarrisCornersKernel.cpp126
-rw-r--r--src/core/CL/kernels/CLHistogramKernel.cpp224
-rw-r--r--src/core/CL/kernels/CLIm2ColKernel.cpp202
-rw-r--r--src/core/CL/kernels/CLIntegralImageKernel.cpp112
-rw-r--r--src/core/CL/kernels/CLLKTrackerKernel.cpp285
-rw-r--r--src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp116
-rw-r--r--src/core/CL/kernels/CLMagnitudePhaseKernel.cpp168
-rw-r--r--src/core/CL/kernels/CLMeanStdDevKernel.cpp134
-rw-r--r--src/core/CL/kernels/CLMedian3x3Kernel.cpp66
-rw-r--r--src/core/CL/kernels/CLMinMaxLocationKernel.cpp169
-rw-r--r--src/core/CL/kernels/CLNonLinearFilterKernel.cpp98
-rw-r--r--src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp72
-rw-r--r--src/core/CL/kernels/CLNormalizationLayerKernel.cpp111
-rw-r--r--src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp154
-rw-r--r--src/core/CL/kernels/CLPoolingLayerKernel.cpp180
-rw-r--r--src/core/CL/kernels/CLRemapKernel.cpp108
-rw-r--r--src/core/CL/kernels/CLScaleKernel.cpp99
-rw-r--r--src/core/CL/kernels/CLScharr3x3Kernel.cpp132
-rw-r--r--src/core/CL/kernels/CLSobel3x3Kernel.cpp133
-rw-r--r--src/core/CL/kernels/CLSobel5x5Kernel.cpp234
-rw-r--r--src/core/CL/kernels/CLSobel7x7Kernel.cpp238
-rw-r--r--src/core/CL/kernels/CLSoftmaxLayerKernel.cpp216
-rw-r--r--src/core/CL/kernels/CLTableLookupKernel.cpp63
-rw-r--r--src/core/CL/kernels/CLThresholdKernel.cpp76
-rw-r--r--src/core/CL/kernels/CLTransposeKernel.cpp82
-rw-r--r--src/core/CL/kernels/CLWarpAffineKernel.cpp99
-rw-r--r--src/core/CL/kernels/CLWarpPerspectiveKernel.cpp99
-rw-r--r--src/core/CL/kernels/CLWeightsReshapeKernel.cpp163
63 files changed, 8686 insertions, 0 deletions
diff --git a/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
new file mode 100644
index 0000000000..685b8e234e
--- /dev/null
+++ b/src/core/CL/kernels/CLAbsoluteDifferenceKernel.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+// Creates an unconfigured kernel; no tensor is attached until configure() is called.
+CLAbsoluteDifferenceKernel::CLAbsoluteDifferenceKernel()
+    : _input1{ nullptr }, _input2{ nullptr }, _output{ nullptr }
+{}
+
+// Validates the tensors, builds the "absdiff" OpenCL kernel and sets up its execution window.
+void CLAbsoluteDifferenceKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+
+    const DataType in1_type = input1->info()->data_type();
+    const DataType in2_type = input2->info()->data_type();
+    const DataType out_type = output->info()->data_type();
+    ARM_COMPUTE_ERROR_ON_MSG(out_type == DataType::U8 && !(in1_type == DataType::U8 && in2_type == DataType::U8),
+                             "The output image can only be U8 if both input images are U8");
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    // The program is specialised at build time on the element type of each tensor
+    std::set<std::string> build_opts = { "-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(in1_type),
+                                         "-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(in2_type),
+                                         "-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(out_type) };
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("absdiff", build_opts));
+
+    // Configure kernel window: 16 elements are processed per iteration
+    constexpr unsigned int elems_per_iteration = 16;
+    Window                 win                 = calculate_max_window(*input1->info(), Steps(elems_per_iteration));
+
+    AccessWindowHorizontal input1_access(input1->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal input2_access(input2->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, elems_per_iteration);
+    update_window_and_padding(win, input1_access, input2_access, output_access);
+
+    // The output valid region is the intersection of the two input valid regions
+    output_access.set_valid_region(win, intersect_valid_regions(input1->info()->valid_region(),
+                                                                input2->info()->valid_region()));
+
+    ICLKernel::configure(win);
+}
+
+// Enqueues the kernel once per 2D slice of the given window; the first slice always runs.
+void CLAbsoluteDifferenceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(plane))
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input1, plane);
+        add_2D_tensor_argument(arg_idx, _input2, plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        enqueue(queue, *this, plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLAccumulateKernel.cpp b/src/core/CL/kernels/CLAccumulateKernel.cpp
new file mode 100644
index 0000000000..6333f04e71
--- /dev/null
+++ b/src/core/CL/kernels/CLAccumulateKernel.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+// Builds the "accumulate" OpenCL kernel for a single-channel U8 input and S16 accumulator.
+void CLAccumulateKernel::configure(const ICLTensor *input, ICLTensor *accum)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
+
+    // _kernel has to be initialized before the parent's configure is called
+    auto &library = CLKernelLibrary::get();
+    _kernel       = static_cast<cl::Kernel>(library.create_kernel("accumulate"));
+    constexpr unsigned int elems_per_iteration = 16;
+    ICLSimple2DKernel::configure(input, accum, elems_per_iteration);
+}
+
+// Builds the "accumulate_weighted" OpenCL kernel for U8 input/accumulator tensors and the scalar alpha.
+void CLAccumulateWeightedKernel::configure(const ICLTensor *input, float alpha, ICLTensor *accum)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8);
+    // Negated in-range test: rejects values outside [0, 1] and also NaN, which passes "alpha < 0 || alpha > 1"
+    ARM_COMPUTE_ERROR_ON(!(alpha >= 0.0f && alpha <= 1.0f));
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("accumulate_weighted"));
+
+    // alpha is a static argument located after the slots of the two 2D tensors (input and accum)
+    const unsigned int alpha_idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg(alpha_idx, alpha);
+
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    ICLSimple2DKernel::configure(input, accum, num_elems_processed_per_iteration);
+}
+
+// Builds the "accumulate_squared" OpenCL kernel for a U8 input, an S16 accumulator and a shift of at most 15.
+void CLAccumulateSquaredKernel::configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
+    ARM_COMPUTE_ERROR_ON(shift >= 16);
+
+    auto &library = CLKernelLibrary::get();
+    _kernel       = static_cast<cl::Kernel>(library.create_kernel("accumulate_squared"));
+
+    // shift is a static argument located after the slots of the two 2D tensors (input and accum)
+    const unsigned int shift_idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg(shift_idx, shift);
+
+    constexpr unsigned int elems_per_iteration = 16;
+    ICLSimple2DKernel::configure(input, accum, elems_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
new file mode 100644
index 0000000000..83bbe6a3be
--- /dev/null
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+// Checks the tensors, auto-initializes an empty output and builds the "activation_layer" OpenCL kernel.
+void CLActivationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+
+    const DataType data_type = input->info()->data_type();
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, data_type, input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+    // Build options: activation function, float/int type class, element type and the A/B parameters
+    const std::string type_class = is_data_type_float(data_type) ? "TYPE_FP" : "TYPE_INT";
+    std::set<std::string> build_opts;
+    build_opts.emplace("-D" + string_from_activation_func(act_info.activation()));
+    build_opts.emplace("-D" + type_class);
+    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.emplace("-DA=" + val_to_string(act_info.a()));
+    build_opts.emplace("-DB=" + val_to_string(act_info.b()));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("activation_layer", build_opts));
+    // _kernel has to be initialized before the parent's configure is called
+    constexpr unsigned int elems_per_iteration = 16;
+    ICLSimple3DKernel::configure(input, output, elems_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
new file mode 100644
index 0000000000..aaa62d0268
--- /dev/null
+++ b/src/core/CL/kernels/CLArithmeticAdditionKernel.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLArithmeticAdditionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+// Creates an unconfigured kernel; the tensor pointers stay null until configure() is called.
+CLArithmeticAdditionKernel::CLArithmeticAdditionKernel()
+    : _input1{ nullptr }, _input2{ nullptr }, _output{ nullptr }
+{}
+
+// Validates the tensors, builds the "arithmetic_add" OpenCL kernel and sets up its execution window.
+void CLArithmeticAdditionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    const DataType in1_type = input1->info()->data_type();
+    const DataType in2_type = input2->info()->data_type();
+    const DataType out_type = output->info()->data_type();
+
+    // A U8 output is only accepted when both inputs are U8 as well
+    const bool both_inputs_u8 = (in1_type == DataType::U8) && (in2_type == DataType::U8);
+    if(out_type == DataType::U8 && !both_inputs_u8)
+    {
+        ARM_COMPUTE_ERROR("You called with the wrong data types.");
+    }
+
+    // A float output always selects the WRAP variant, whatever policy was requested
+    const bool use_wrap = is_data_type_float(out_type) || policy == ConvertPolicy::WRAP;
+    std::set<std::string> build_opts;
+    build_opts.emplace(use_wrap ? "-DWRAP" : "-DSATURATE");
+    build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(in1_type));
+    build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(in2_type));
+    build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(out_type));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_add", build_opts));
+
+    // Configure kernel window: 16 elements are processed per iteration
+    constexpr unsigned int elems_per_iteration = 16;
+    Window                 win                 = calculate_max_window(*input1->info(), Steps(elems_per_iteration));
+
+    AccessWindowHorizontal input1_access(input1->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal input2_access(input2->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, elems_per_iteration);
+    update_window_and_padding(win, input1_access, input2_access, output_access);
+
+    // The output valid region is the intersection of the two input valid regions
+    output_access.set_valid_region(win, intersect_valid_regions(input1->info()->valid_region(),
+                                                                input2->info()->valid_region()));
+
+    ICLKernel::configure(win);
+}
+
+// Enqueues the addition kernel for every 2D slice of the window, rebinding the three tensors each time.
+void CLArithmeticAdditionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(plane))
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input1, plane);
+        add_2D_tensor_argument(arg_idx, _input2, plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        enqueue(queue, *this, plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
new file mode 100644
index 0000000000..4c847276da
--- /dev/null
+++ b/src/core/CL/kernels/CLArithmeticSubtractionKernel.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLArithmeticSubtractionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+// Creates an unconfigured kernel; the tensor pointers stay null until configure() is called.
+CLArithmeticSubtractionKernel::CLArithmeticSubtractionKernel()
+    : _input1{ nullptr }, _input2{ nullptr }, _output{ nullptr }
+{}
+
+// Validates the tensors, builds the "arithmetic_sub" OpenCL kernel and sets up its execution window.
+void CLArithmeticSubtractionKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    const DataType out_type = output->info()->data_type();
+
+    // A U8 output restricts both inputs to U8; any other output accepts U8, S16, F16 or F32 inputs
+    if(out_type != DataType::U8)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+    }
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    // A float output always selects the WRAP variant, whatever policy was requested
+    const bool use_wrap = is_data_type_float(out_type) || policy == ConvertPolicy::WRAP;
+    std::set<std::string> build_opts;
+    build_opts.emplace(use_wrap ? "-DWRAP" : "-DSATURATE");
+    build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+    build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+    build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(out_type));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("arithmetic_sub", build_opts));
+
+    // Configure kernel window: 16 elements are processed per iteration
+    constexpr unsigned int elems_per_iteration = 16;
+    Window                 win                 = calculate_max_window(*input1->info(), Steps(elems_per_iteration));
+
+    AccessWindowHorizontal input1_access(input1->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal input2_access(input2->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, elems_per_iteration);
+    update_window_and_padding(win, input1_access, input2_access, output_access);
+
+    // The output valid region is the intersection of the two input valid regions
+    output_access.set_valid_region(win, intersect_valid_regions(input1->info()->valid_region(),
+                                                                input2->info()->valid_region()));
+
+    ICLKernel::configure(win);
+}
+
+// Enqueues the subtraction kernel for every 2D slice of the window; the first slice always runs.
+void CLArithmeticSubtractionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(plane))
+    {
+        // Argument indices restart at 0 for each slice: input1, input2, then output
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input1, plane);
+        add_2D_tensor_argument(arg_idx, _input2, plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        enqueue(queue, *this, plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
new file mode 100644
index 0000000000..309a153b7a
--- /dev/null
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+// Creates an unconfigured kernel: every tensor pointer is null and epsilon is 0 until configure() is called.
+CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
+    : _input{ nullptr }, _output{ nullptr }, _mean{ nullptr }, _var{ nullptr }, _beta{ nullptr }, _gamma{ nullptr }, _epsilon{ 0 }
+{}
+
+// Validates the F32 tensors, builds the "batchnormalization_layer" OpenCL kernel and sets up its window.
+void CLBatchNormalizationLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
+                                                float epsilon)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    // Dimension 2 of the input must match the length of the mean vector
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != mean->info()->dimension(0));
+
+    _input   = input;
+    _output  = output;
+    _mean    = mean;
+    _var     = var;
+    _beta    = beta;
+    _gamma   = gamma;
+    _epsilon = epsilon;
+
+    // Set build options and create kernel
+    std::set<std::string> build_opts = { "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()) };
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("batchnormalization_layer", build_opts));
+
+    // epsilon is a static argument located after the slots of the two 3D tensors
+    // (input, output) and of the four 1D tensors that run() binds after them
+    const unsigned int epsilon_idx = 2 * num_arguments_per_3D_tensor() + 4 * num_arguments_per_1D_tensor();
+    _kernel.setArg<cl_float>(epsilon_idx, _epsilon);
+
+    // Configure kernel window: 4 elements are processed per iteration
+    const unsigned int elems_per_iteration = 4;
+    Window             win                 = calculate_max_window(*input->info(), Steps(elems_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, elems_per_iteration);
+    update_window_and_padding(win, input_access, output_access);
+
+    // The output inherits the valid region of the input
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+// Binds the four 1D parameter tensors once, then enqueues the kernel for every 3D slice of the window.
+void CLBatchNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // The 1D tensors use a slice whose X dimension is reset to (0, 0, 0); their slots follow the two 3D tensors
+    Window vector_slice = window.first_slice_window_1D();
+    vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
+    unsigned int vector_idx = 2 * num_arguments_per_3D_tensor();
+    add_1D_tensor_argument(vector_idx, _mean, vector_slice);
+    add_1D_tensor_argument(vector_idx, _var, vector_slice);
+    add_1D_tensor_argument(vector_idx, _beta, vector_slice);
+    add_1D_tensor_argument(vector_idx, _gamma, vector_slice);
+
+    // Input and output are rebound from index 0 for each 3D slice; the first slice always runs
+    Window volume = window.first_slice_window_3D();
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_3D(volume))
+    {
+        unsigned int tensor_idx = 0;
+        add_3D_tensor_argument(tensor_idx, _input, volume);
+        add_3D_tensor_argument(tensor_idx, _output, volume);
+        enqueue(queue, *this, volume);
+    }
+}
diff --git a/src/core/CL/kernels/CLBitwiseAndKernel.cpp b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
new file mode 100644
index 0000000000..5ea4a86da5
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseAndKernel.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+/** Default constructor: all three tensor pointers start out as nullptr. */
+CLBitwiseAndKernel::CLBitwiseAndKernel()
+    : _input1(nullptr),
+      _input2(nullptr),
+      _output(nullptr)
+{
+}
+/** Store the tensors, create the "bitwise_and" kernel and compute its execution window.
+ *
+ * @param[in]  input1 First source tensor; must be single-channel U8.
+ * @param[in]  input2 Second source tensor; must be single-channel U8.
+ * @param[out] output Destination tensor; must be single-channel U8.
+ */
+void CLBitwiseAndKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_and"));
+
+    // Number of elements processed per iteration; used as the window step and the access width.
+    constexpr unsigned int elems_per_step = 16;
+
+    Window max_window = calculate_max_window(*input1->info(), Steps(elems_per_step));
+
+    AccessWindowHorizontal in1_access(input1->info(), 0, elems_per_step);
+    AccessWindowHorizontal in2_access(input2->info(), 0, elems_per_step);
+    AccessWindowHorizontal out_access(output->info(), 0, elems_per_step);
+
+    update_window_and_padding(max_window, in1_access, in2_access, out_access);
+
+    // The output valid region is the intersection of the two input valid regions.
+    out_access.set_valid_region(max_window,
+                                intersect_valid_regions(input1->info()->valid_region(),
+                                                        input2->info()->valid_region()));
+
+    ICLKernel::configure(max_window);
+}
+
+/** Enqueue the kernel for every 2D slice contained in @p window.
+ *
+ * @param[in]     window Execution window; validated as a sub-window of the configured kernel window.
+ * @param[in,out] queue  Command queue handed to enqueue().
+ */
+void CLBitwiseAndKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane           = window.first_slice_window_2D();
+    bool   has_more_slices = true;
+
+    while(has_more_slices)
+    {
+        // Arguments are re-bound from slot 0 for each slice: both inputs first, then the output.
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input1, plane);
+        add_2D_tensor_argument(arg_idx, _input2, plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        enqueue(queue, *this, plane);
+
+        has_more_slices = window.slide_window_slice_2D(plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLBitwiseNotKernel.cpp b/src/core/CL/kernels/CLBitwiseNotKernel.cpp
new file mode 100644
index 0000000000..0098e15ab6
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseNotKernel.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+/** Create the "bitwise_not" kernel and delegate window setup to ICLSimple2DKernel.
+ *
+ * @param[in]  input  Source tensor; must be single-channel U8.
+ * @param[out] output Destination tensor; must be single-channel U8.
+ */
+void CLBitwiseNotKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_not"));
+
+    _input  = input;
+    _output = output;
+
+    // Number of elements processed per iteration, forwarded to the base-class configure.
+    constexpr unsigned int elems_per_step = 16;
+    ICLSimple2DKernel::configure(input, output, elems_per_step);
+}
diff --git a/src/core/CL/kernels/CLBitwiseOrKernel.cpp b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
new file mode 100644
index 0000000000..2eeef0a993
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseOrKernel.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+/** Default constructor: all three tensor pointers start out as nullptr. */
+CLBitwiseOrKernel::CLBitwiseOrKernel()
+    : _input1(nullptr),
+      _input2(nullptr),
+      _output(nullptr)
+{
+}
+
+/** Store the tensors, create the "bitwise_or" kernel and compute its execution window.
+ *
+ * @param[in]  input1 First source tensor; must be single-channel U8.
+ * @param[in]  input2 Second source tensor; must be single-channel U8.
+ * @param[out] output Destination tensor; must be single-channel U8.
+ */
+void CLBitwiseOrKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_or"));
+
+    // Number of elements processed per iteration; used as the window step and the access width.
+    constexpr unsigned int step_x = 16;
+
+    Window exec_window = calculate_max_window(*input1->info(), Steps(step_x));
+
+    AccessWindowHorizontal lhs_access(input1->info(), 0, step_x);
+    AccessWindowHorizontal rhs_access(input2->info(), 0, step_x);
+    AccessWindowHorizontal dst_access(output->info(), 0, step_x);
+
+    update_window_and_padding(exec_window, lhs_access, rhs_access, dst_access);
+
+    // The output valid region is the intersection of the two input valid regions.
+    dst_access.set_valid_region(exec_window,
+                                intersect_valid_regions(input1->info()->valid_region(),
+                                                        input2->info()->valid_region()));
+
+    ICLKernel::configure(exec_window);
+}
+
+/** Enqueue the kernel for every 2D slice contained in @p window.
+ *
+ * @param[in]     window Execution window; validated as a sub-window of the configured kernel window.
+ * @param[in,out] queue  Command queue handed to enqueue().
+ */
+void CLBitwiseOrKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window current = window.first_slice_window_2D();
+
+    for(;;)
+    {
+        // Arguments are re-bound from slot 0 for each slice: both inputs first, then the output.
+        unsigned int slot = 0;
+        add_2D_tensor_argument(slot, _input1, current);
+        add_2D_tensor_argument(slot, _input2, current);
+        add_2D_tensor_argument(slot, _output, current);
+        enqueue(queue, *this, current);
+
+        // Stop once the window reports that no further slice is available.
+        if(!window.slide_window_slice_2D(current))
+        {
+            break;
+        }
+    }
+}
diff --git a/src/core/CL/kernels/CLBitwiseXorKernel.cpp b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
new file mode 100644
index 0000000000..c19a78e1c4
--- /dev/null
+++ b/src/core/CL/kernels/CLBitwiseXorKernel.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+/** Default constructor: all three tensor pointers start out as nullptr. */
+CLBitwiseXorKernel::CLBitwiseXorKernel()
+    : _input1(nullptr),
+      _input2(nullptr),
+      _output(nullptr)
+{
+}
+
+/** Store the tensors, create the "bitwise_xor" kernel and compute its execution window.
+ *
+ * @param[in]  input1 First source tensor; must be single-channel U8.
+ * @param[in]  input2 Second source tensor; must be single-channel U8.
+ * @param[out] output Destination tensor; must be single-channel U8.
+ */
+void CLBitwiseXorKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("bitwise_xor"));
+
+    // Number of elements processed per iteration; used as the window step and the access width.
+    constexpr unsigned int vector_width = 16;
+
+    Window kernel_window = calculate_max_window(*input1->info(), Steps(vector_width));
+
+    AccessWindowHorizontal first_access(input1->info(), 0, vector_width);
+    AccessWindowHorizontal second_access(input2->info(), 0, vector_width);
+    AccessWindowHorizontal result_access(output->info(), 0, vector_width);
+
+    update_window_and_padding(kernel_window, first_access, second_access, result_access);
+
+    // The output valid region is the intersection of the two input valid regions.
+    result_access.set_valid_region(kernel_window,
+                                   intersect_valid_regions(input1->info()->valid_region(),
+                                                           input2->info()->valid_region()));
+
+    ICLKernel::configure(kernel_window);
+}
+
+/** Enqueue the kernel for every 2D slice contained in @p window.
+ *
+ * @param[in]     window Execution window; validated as a sub-window of the configured kernel window.
+ * @param[in,out] queue  Command queue handed to enqueue().
+ */
+void CLBitwiseXorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window tile      = window.first_slice_window_2D();
+    bool   keep_going = true;
+
+    while(keep_going)
+    {
+        // Arguments are re-bound from slot 0 for each slice: both inputs first, then the output.
+        unsigned int next_arg = 0;
+        add_2D_tensor_argument(next_arg, _input1, tile);
+        add_2D_tensor_argument(next_arg, _input2, tile);
+        add_2D_tensor_argument(next_arg, _output, tile);
+        enqueue(queue, *this, tile);
+
+        keep_going = window.slide_window_slice_2D(tile);
+    }
+}
diff --git a/src/core/CL/kernels/CLBox3x3Kernel.cpp b/src/core/CL/kernels/CLBox3x3Kernel.cpp
new file mode 100644
index 0000000000..e113d30210
--- /dev/null
+++ b/src/core/CL/kernels/CLBox3x3Kernel.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Border required by the kernel.
+ *
+ * @return A BorderSize constructed from the single value 1.
+ */
+BorderSize CLBox3x3Kernel::border_size() const
+{
+    return BorderSize(1);
+}
+
+/** Build the "convolution3x3_static" kernel with all nine coefficients set to 1 and a scale of 9,
+ *  then compute the execution window.
+ *
+ * @param[in]  input            Source tensor; must be single-channel U8.
+ * @param[out] output           Destination tensor; must be single-channel U8.
+ * @param[in]  border_undefined Forwarded to the window and valid-region calculations.
+ */
+void CLBox3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    _input  = input;
+    _output = output;
+
+    // Compile-time definitions handed to the kernel library.
+    std::set<std::string> box_filter_opts;
+    for(const char *definition :
+        {
+            "-DMAT0=1", "-DMAT1=1", "-DMAT2=1",
+            "-DMAT3=1", "-DMAT4=1", "-DMAT5=1",
+            "-DMAT6=1", "-DMAT7=1", "-DMAT8=1",
+            "-DSCALE=9", "-DDATA_TYPE_OUT=uchar"
+        })
+    {
+        box_filter_opts.emplace(definition);
+    }
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution3x3_static", box_filter_opts));
+
+    // Per-iteration footprint: 8 elements processed and written, 16 elements over 3 rows read.
+    constexpr unsigned int elems_per_step     = 8;
+    constexpr unsigned int elems_read         = 16;
+    constexpr unsigned int elems_written      = 8;
+    constexpr unsigned int rows_read          = 3;
+
+    const BorderSize border = border_size();
+
+    Window exec_window = calculate_max_window(*input->info(), Steps(elems_per_step), border_undefined, border);
+
+    // The read rectangle starts one border-width to the left of and above the current position.
+    AccessWindowRectangle  src_access(input->info(), -border.left, -border.top, elems_read, rows_read);
+    AccessWindowHorizontal dst_access(output->info(), 0, elems_written);
+
+    update_window_and_padding(exec_window, src_access, dst_access);
+
+    dst_access.set_valid_region(exec_window, input->info()->valid_region(), border_undefined, border);
+
+    ICLKernel::configure(exec_window);
+}
diff --git a/src/core/CL/kernels/CLCannyEdgeKernel.cpp b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
new file mode 100644
index 0000000000..5d06d34631
--- /dev/null
+++ b/src/core/CL/kernels/CLCannyEdgeKernel.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+/** Default constructor: all four tensor pointers start out as nullptr. */
+CLGradientKernel::CLGradientKernel()
+    : _gx(nullptr),
+      _gy(nullptr),
+      _magnitude(nullptr),
+      _phase(nullptr)
+{
+}
+
+/** Build the gradient-combination kernel and compute its execution window.
+ *
+ * @param[in]  gx        Gradient tensor; must be single-channel S16 or S32.
+ * @param[in]  gy        Gradient tensor; must be single-channel S16 or S32, same element size as @p gx.
+ * @param[out] magnitude Magnitude tensor; must be single-channel U16 or U32, same element size as @p gx.
+ * @param[out] phase     Phase tensor; must be single-channel U8.
+ * @param[in]  norm_type 1 selects the "combine_gradients_L1" kernel; any other value selects "combine_gradients_L2".
+ */
+void CLGradientKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(gy->info()->data_type()),
+                             "Gx and Gy must have the same pixel size");
+    ARM_COMPUTE_ERROR_ON_MSG(data_size_from_type(gx->info()->data_type()) != data_size_from_type(magnitude->info()->data_type()),
+                             "Mag must have the same pixel size as Gx and Gy");
+
+    _gx        = gx;
+    _gy        = gy;
+    _magnitude = magnitude;
+    _phase     = phase;
+
+    // Create build opts.
+    // DATA_TYPE_OUT must describe the magnitude tensor (unsigned U16/U32), not the signed gradient
+    // input: deriving it from gx would build the kernel with a signed output type.
+    std::set<std::string> built_opts;
+    built_opts.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(gx->info()->data_type()));
+    built_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(magnitude->info()->data_type()));
+
+    // Create kernel
+    const std::string kernel_name = (norm_type == 1) ? std::string("combine_gradients_L1") : std::string("combine_gradients_L2");
+    _kernel                       = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, built_opts));
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+    Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access);
+
+    // Both outputs take their valid region from gx.
+    mag_access.set_valid_region(win, _gx->info()->valid_region());
+    phase_access.set_valid_region(win, _gx->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the kernel for every 2D slice contained in @p window.
+ *
+ * @param[in]     window Execution window; validated as a sub-window of the kernel's own window.
+ * @param[in,out] queue  Command queue handed to enqueue().
+ */
+void CLGradientKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window plane     = window.first_slice_window_2D();
+    bool   has_slice = true;
+
+    while(has_slice)
+    {
+        // Arguments are re-bound from slot 0 for each slice: gx, gy, magnitude, phase.
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _gx, plane);
+        add_2D_tensor_argument(arg_idx, _gy, plane);
+        add_2D_tensor_argument(arg_idx, _magnitude, plane);
+        add_2D_tensor_argument(arg_idx, _phase, plane);
+        enqueue(queue, *this, plane);
+
+        has_slice = window.slide_window_slice_2D(plane);
+    }
+}
+
+/** Default constructor: all three tensor pointers start out as nullptr. */
+CLEdgeNonMaxSuppressionKernel::CLEdgeNonMaxSuppressionKernel()
+    : _magnitude(nullptr),
+      _phase(nullptr),
+      _output(nullptr)
+{
+}
+
+/** Border required by the kernel.
+ *
+ * @return A BorderSize constructed from the single value 1.
+ */
+BorderSize CLEdgeNonMaxSuppressionKernel::border_size() const
+{
+    const BorderSize one_element_border(1);
+    return one_element_border;
+}
+
+/** Build the "suppress_non_maximum" kernel, bind the threshold and compute the execution window.
+ *
+ * @param[in]  magnitude        Magnitude tensor; must be single-channel U16 or U32.
+ * @param[in]  phase            Phase tensor; must be single-channel U8.
+ * @param[out] output           Destination tensor; must be single-channel U16 or U32.
+ * @param[in]  lower_thr        Threshold bound as the kernel argument that follows the three tensors.
+ * @param[in]  border_undefined Forwarded to the window and valid-region calculations.
+ */
+void CLEdgeNonMaxSuppressionKernel::configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::U32);
+
+    _magnitude = magnitude;
+    _phase     = phase;
+    _output    = output;
+
+    // Input/output element types are passed to the kernel as compile-time definitions.
+    std::set<std::string> type_defines;
+    type_defines.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(magnitude->info()->data_type()));
+    type_defines.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("suppress_non_maximum", type_defines));
+
+    // The threshold occupies the first slot after the three 2D tensors.
+    const unsigned int threshold_slot = 3 * num_arguments_per_2D_tensor();
+    _kernel.setArg(threshold_slot, lower_thr);
+
+    // One element is processed per iteration; magnitude is accessed through a 3x3 rectangle.
+    constexpr unsigned int elems_per_step = 1;
+    constexpr unsigned int rect_extent    = 3;
+
+    const BorderSize border = border_size();
+
+    Window exec_window = calculate_max_window(*_magnitude->info(), Steps(elems_per_step), border_undefined, border);
+
+    AccessWindowRectangle  magnitude_access(_magnitude->info(), -border.left, -border.top, rect_extent, rect_extent);
+    AccessWindowHorizontal phase_access(_phase->info(), 0, elems_per_step);
+    AccessWindowHorizontal dst_access(_output->info(), 0, elems_per_step);
+
+    update_window_and_padding(exec_window, magnitude_access, phase_access, dst_access);
+
+    dst_access.set_valid_region(exec_window, _magnitude->info()->valid_region(), border_undefined, border);
+
+    ICLKernel::configure(exec_window);
+}
+
+/** Enqueue the kernel for every 2D slice contained in @p window.
+ *
+ * @param[in]     window Execution window; validated as a sub-window of the kernel's own window.
+ * @param[in,out] queue  Command queue handed to enqueue().
+ */
+void CLEdgeNonMaxSuppressionKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window current = window.first_slice_window_2D();
+
+    for(;;)
+    {
+        // Arguments are re-bound from slot 0 for each slice: magnitude, phase, output.
+        unsigned int slot = 0;
+        add_2D_tensor_argument(slot, _magnitude, current);
+        add_2D_tensor_argument(slot, _phase, current);
+        add_2D_tensor_argument(slot, _output, current);
+        enqueue(queue, *this, current);
+
+        // Stop once the window reports that no further slice is available.
+        if(!window.slide_window_slice_2D(current))
+        {
+            break;
+        }
+    }
+}
+
+/** Default constructor: tensor pointers start out as nullptr and both thresholds as 0. */
+CLEdgeTraceKernel::CLEdgeTraceKernel()
+    : _input(nullptr),
+      _output(nullptr),
+      _lower_thr(0),
+      _upper_thr(0),
+      _visited(nullptr),
+      _recorded(nullptr),
+      _l1_stack(nullptr),
+      _l1_stack_counter(nullptr)
+{
+}
+
+/** Build the "hysteresis" kernel, bind its scalar arguments and compute the execution window.
+ *
+ * Note that @p upper_thr precedes @p lower_thr in the parameter list, while the kernel receives
+ * the lower threshold first.
+ *
+ * @param[in]     input            Source tensor; must be single-channel U16 or U32.
+ * @param[out]    output           Destination tensor; must be single-channel U8.
+ * @param[in]     upper_thr        Upper threshold, bound as the second scalar argument.
+ * @param[in]     lower_thr        Lower threshold, bound as the first scalar argument.
+ * @param[in,out] visited          Auxiliary tensor; must be single-channel U32.
+ * @param[in,out] recorded         Auxiliary tensor; must be single-channel U32.
+ * @param[in,out] l1_stack         Auxiliary tensor; must be single-channel S32.
+ * @param[in,out] l1_stack_counter Auxiliary tensor; must be single-channel U8.
+ */
+void CLEdgeTraceKernel::configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
+                                  ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::U32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(visited, 1, DataType::U32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(recorded, 1, DataType::U32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack, 1, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(l1_stack_counter, 1, DataType::U8);
+
+    _input            = input;
+    _output           = output;
+    _visited          = visited;
+    _recorded         = recorded;
+    _l1_stack         = l1_stack;
+    _l1_stack_counter = l1_stack_counter;
+    _upper_thr        = upper_thr;
+    _lower_thr        = lower_thr;
+
+    // Input/output element types are passed to the kernel as compile-time definitions.
+    std::set<std::string> type_defines;
+    type_defines.emplace("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
+    type_defines.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hysteresis", type_defines));
+
+    // Scalar arguments follow the six 2D tensors: lower threshold, upper threshold, width, height.
+    const unsigned int image_width      = _input->info()->dimension(0);
+    const unsigned int image_height     = _input->info()->dimension(1);
+    const unsigned int first_scalar_idx = 6 * num_arguments_per_2D_tensor();
+    _kernel.setArg(first_scalar_idx, static_cast<cl_uint>(_lower_thr));
+    _kernel.setArg(first_scalar_idx + 1, static_cast<cl_uint>(_upper_thr));
+    _kernel.setArg(first_scalar_idx + 2, static_cast<cl_uint>(image_width));
+    _kernel.setArg(first_scalar_idx + 3, static_cast<cl_uint>(image_height));
+
+    // One element is processed per iteration.
+    constexpr unsigned int elems_per_step = 1;
+    Window                 exec_window    = calculate_max_window(*_input->info(), Steps(elems_per_step));
+
+    AccessWindowHorizontal dst_access(_output->info(), 0, elems_per_step);
+    AccessWindowHorizontal visited_access(_visited->info(), 0, elems_per_step);
+    AccessWindowHorizontal recorded_access(_recorded->info(), 0, elems_per_step);
+    AccessWindowHorizontal stack_access(_l1_stack->info(), 0, elems_per_step);
+    AccessWindowHorizontal stack_counter_access(_l1_stack_counter->info(), 0, elems_per_step);
+
+    update_window_and_padding(exec_window,
+                              AccessWindowHorizontal(_input->info(), 0, elems_per_step),
+                              dst_access,
+                              visited_access,
+                              recorded_access,
+                              stack_access,
+                              stack_counter_access);
+
+    // Every non-input tensor takes its valid region from the input.
+    dst_access.set_valid_region(exec_window, _input->info()->valid_region());
+    visited_access.set_valid_region(exec_window, _input->info()->valid_region());
+    recorded_access.set_valid_region(exec_window, _input->info()->valid_region());
+    stack_access.set_valid_region(exec_window, _input->info()->valid_region());
+    stack_counter_access.set_valid_region(exec_window, _input->info()->valid_region());
+
+    ICLKernel::configure(exec_window);
+}
+
+/** Enqueue the kernel for every 2D slice contained in @p window.
+ *
+ * @param[in]     window Execution window; validated as a sub-window of the kernel's own window.
+ * @param[in,out] queue  Command queue handed to enqueue().
+ */
+void CLEdgeTraceKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window plane          = window.first_slice_window_2D();
+    bool   slices_pending = true;
+
+    while(slices_pending)
+    {
+        // Arguments are re-bound from slot 0 for each slice, in the order the tensors are listed below.
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        add_2D_tensor_argument(arg_idx, _visited, plane);
+        add_2D_tensor_argument(arg_idx, _recorded, plane);
+        add_2D_tensor_argument(arg_idx, _l1_stack, plane);
+        add_2D_tensor_argument(arg_idx, _l1_stack_counter, plane);
+
+        enqueue(queue, *this, plane);
+
+        slices_pending = window.slide_window_slice_2D(plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLChannelCombineKernel.cpp b/src/core/CL/kernels/CLChannelCombineKernel.cpp
new file mode 100644
index 0000000000..d729ebcfb3
--- /dev/null
+++ b/src/core/CL/kernels/CLChannelCombineKernel.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLMultiImage.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+ // Default constructor: no planes or outputs attached, sub-sampling factor 1 for every plane.
+ CLChannelCombineKernel::CLChannelCombineKernel()
+     : _planes{ { nullptr } },
+       _output(nullptr),
+       _output_multi(nullptr),
+       _x_subsampling{ { 1, 1, 1 } },
+       _y_subsampling{ { 1, 1, 1 } }
+ {
+ }
+
+ // Configures the kernel to combine separate U8 planes into one single-plane output tensor.
+ //
+ // plane0..plane2: input planes, asserted to be U8.
+ // plane3:         fourth plane; only format-checked and stored when the output is RGBA8888.
+ //                 When non-null it still takes part in the padding and valid-region computation.
+ // output:         destination tensor, asserted to be RGB888, RGBA8888, YUYV422 or UYVY422.
+ void CLChannelCombineKernel::configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output)
+ {
+     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
+     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
+     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
+     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
+
+     const Format out_format = output->info()->format();
+     const bool   has_alpha  = (Format::RGBA8888 == out_format);
+
+     _planes[0] = plane0;
+     _planes[1] = plane1;
+     _planes[2] = plane2;
+     if(has_alpha)
+     {
+         ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8);
+     }
+     _planes[3]    = has_alpha ? plane3 : nullptr;
+     _output       = output;
+     _output_multi = nullptr;
+
+     // Planes 1 and 2 get a sub-sampling factor of 2 for the two 422 output formats, 1 otherwise
+     if(Format::YUYV422 != out_format && Format::UYVY422 != out_format)
+     {
+         _x_subsampling = { { 1, 1, 1 } };
+         _y_subsampling = { { 1, 1, 1 } };
+     }
+     else
+     {
+         _x_subsampling = { { 1, 2, 2 } };
+         _y_subsampling = { { 1, 2, 2 } };
+     }
+
+     // The OpenCL kernel is selected by the output format name
+     const std::string program_name = "channel_combine_" + string_from_format(out_format);
+     _kernel                        = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(program_name));
+
+     // Execution window: derived from the output, 16 elements per iteration
+     constexpr unsigned int elems_per_iteration = 16;
+
+     Window win = calculate_max_window(*output->info(), Steps(elems_per_iteration));
+
+     AccessWindowHorizontal plane0_access(plane0->info(), 0, elems_per_iteration);
+     AccessWindowRectangle  plane1_access(plane1->info(), 0, 0, elems_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+     AccessWindowRectangle  plane2_access(plane2->info(), 0, 0, elems_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+     AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, elems_per_iteration);
+     AccessWindowHorizontal output_access(output->info(), 0, elems_per_iteration);
+
+     update_window_and_padding(win, plane0_access, plane1_access, plane2_access, plane3_access, output_access);
+
+     // Only the anchor of the intersected input regions is used; the extent is the full output shape
+     ValidRegion inputs_region = intersect_valid_regions(plane0->info()->valid_region(),
+                                                         plane1->info()->valid_region(),
+                                                         plane2->info()->valid_region());
+     if(plane3 != nullptr)
+     {
+         inputs_region = intersect_valid_regions(plane3->info()->valid_region(), inputs_region);
+     }
+     output_access.set_valid_region(win, ValidRegion(inputs_region.anchor, output->info()->tensor_shape()));
+
+     ICLKernel::configure(win);
+ }
+
+ // Configures the kernel to combine three U8 images into a multi-planar image.
+ //
+ // plane0..plane2: 2D input planes, asserted to be U8.
+ // output:         destination multi-image, asserted to be NV12, NV21, IYUV or YUV444.
+ void CLChannelCombineKernel::configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output)
+ {
+     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
+     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
+     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
+     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
+     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
+     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
+     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
+
+     _planes[0]    = plane0;
+     _planes[1]    = plane1;
+     _planes[2]    = plane2;
+     _planes[3]    = nullptr;
+     _output       = nullptr;
+     _output_multi = output;
+
+     const Format out_format      = output->info()->format();
+     const bool   is_nv12         = (Format::NV12 == out_format);
+     const bool   is_iyuv         = (Format::IYUV == out_format);
+     const bool   has_two_planars = is_nv12 || (Format::NV21 == out_format);
+
+     // NV12, NV21 and IYUV use a factor of 2 for planes 1 and 2; the remaining format uses 1
+     if(has_two_planars || is_iyuv)
+     {
+         _x_subsampling = { { 1, 2, 2 } };
+         _y_subsampling = { { 1, 2, 2 } };
+     }
+     else
+     {
+         _x_subsampling = { { 1, 1, 1 } };
+         _y_subsampling = { { 1, 1, 1 } };
+     }
+
+     // Two-plane outputs use the NV kernel, three-plane outputs the plane-copy kernel;
+     // the exact format is passed as a build-time define.
+     std::string           program_name;
+     std::set<std::string> defines;
+     if(has_two_planars)
+     {
+         program_name = "channel_combine_NV";
+         defines.emplace(is_nv12 ? "-DNV12" : "-DNV21");
+     }
+     else
+     {
+         program_name = "copy_planes_3p";
+         defines.emplace(is_iyuv ? "-DIYUV" : "-DYUV444");
+     }
+
+     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(program_name, defines));
+
+     // Execution window: derived from plane 0, 16 elements per iteration
+     constexpr unsigned int elems_per_iteration = 16;
+
+     Window win = calculate_max_window(*plane0->info(), Steps(elems_per_iteration));
+
+     AccessWindowHorizontal in0_access(plane0->info(), 0, elems_per_iteration);
+     AccessWindowRectangle  in1_access(plane1->info(), 0, 0, elems_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+     AccessWindowRectangle  in2_access(plane2->info(), 0, 0, elems_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+     AccessWindowRectangle  out0_access(output->plane(0)->info(), 0, 0, elems_per_iteration, 1, 1.f, 1.f / _y_subsampling[1]);
+     AccessWindowRectangle  out1_access(output->plane(1)->info(), 0, 0, elems_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
+     // No access window is attached to output plane 2 for the two-plane formats
+     AccessWindowRectangle out2_access(has_two_planars ? nullptr : output->plane(2)->info(), 0, 0, elems_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
+
+     update_window_and_padding(win,
+                               in0_access, in1_access, in2_access,
+                               out0_access, out1_access, out2_access);
+
+     // Only the anchors of the input regions are propagated; extents are the output plane shapes
+     ValidRegion in0_region  = plane0->info()->valid_region();
+     ValidRegion out1_region = has_two_planars ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region();
+     out0_access.set_valid_region(win, ValidRegion(in0_region.anchor, output->plane(0)->info()->tensor_shape()));
+     out1_access.set_valid_region(win, ValidRegion(out1_region.anchor, output->plane(1)->info()->tensor_shape()));
+     out2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape()));
+
+     ICLKernel::configure(win);
+ }
+
+ // Enqueues the channel-combine kernel for every 2D slice of the given window.
+ //
+ // window: execution window; asserted below to be a valid sub-window of the configured one.
+ // queue:  command queue the kernel is enqueued on.
+ void CLChannelCombineKernel::run(const Window &window, cl::CommandQueue &queue)
+ {
+     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+     // Returns a copy of 'base' whose X range (start, end, step) and Y range (start, end) are
+     // divided by the sub-sampling factors stored for 'plane'; the Y step is forced to 1.
+     const auto subsampled_window = [this](const Window &base, unsigned int plane)
+     {
+         Window sub(base);
+         sub.set(Window::DimX, Window::Dimension(base.x().start() / _x_subsampling[plane], base.x().end() / _x_subsampling[plane], base.x().step() / _x_subsampling[plane]));
+         sub.set(Window::DimY, Window::Dimension(base.y().start() / _y_subsampling[plane], base.y().end() / _y_subsampling[plane], 1));
+         return sub;
+     };
+
+     Window slice = window.first_slice_window_2D();
+
+     // The body always runs for the first slice; slide_window_slice_2D() decides about the next one
+     for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(slice))
+     {
+         // Windows used for planes 1 and 2, computed from the slice before it is narrowed below
+         const Window plane1_window = subsampled_window(slice, 1);
+         const Window plane2_window = subsampled_window(slice, 2);
+
+         unsigned int arg_idx = 0;
+
+         // Input planes
+         add_2D_tensor_argument(arg_idx, _planes[0], slice);
+         add_2D_tensor_argument(arg_idx, _planes[1], plane1_window);
+         add_2D_tensor_argument(arg_idx, _planes[2], plane2_window);
+
+         if(_planes[3] != nullptr)
+         {
+             add_2D_tensor_argument(arg_idx, _planes[3], slice);
+         }
+
+         // Outputs
+         if(_output == nullptr) // Multi-planar output
+         {
+             // Reduce slice in case of subsampling to avoid out-of bounds access.
+             // The narrowed slice is also the one that gets enqueued and slid afterwards.
+             slice.set(Window::DimY, Window::Dimension(slice.y().start() / _y_subsampling[1], slice.y().end() / _y_subsampling[1], 1));
+
+             add_2D_tensor_argument(arg_idx, _output_multi->cl_plane(0), slice);
+             add_2D_tensor_argument(arg_idx, _output_multi->cl_plane(1), plane1_window);
+
+             if(3 == num_planes_from_format(_output_multi->info()->format()))
+             {
+                 add_2D_tensor_argument(arg_idx, _output_multi->cl_plane(2), plane2_window);
+             }
+
+             // Last argument: end of the (narrowed) Y range
+             _kernel.setArg(arg_idx++, slice.y().end());
+         }
+         else // Single planar output
+         {
+             add_2D_tensor_argument(arg_idx, _output, slice);
+         }
+
+         enqueue(queue, *this, slice);
+     }
+ }
diff --git a/src/core/CL/kernels/CLChannelExtractKernel.cpp b/src/core/CL/kernels/CLChannelExtractKernel.cpp
new file mode 100644
index 0000000000..541153316a
--- /dev/null
+++ b/src/core/CL/kernels/CLChannelExtractKernel.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLMultiImage.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+ // Default constructor: no tensors attached, 8 elements per iteration, sub-sampling factor 1.
+ CLChannelExtractKernel::CLChannelExtractKernel()
+     : _input(nullptr),
+       _output(nullptr),
+       _num_elems_processed_per_iteration(8),
+       _subsampling(1)
+ {
+ }
+
+ // Configures the kernel to extract one channel of a single-plane tensor into a U8 tensor.
+ //
+ // input:   source tensor, asserted to be RGB888, RGBA8888, YUYV422 or UYVY422.
+ // channel: channel to extract; asserted to exist in the input format.
+ // output:  destination tensor, asserted to be U8 and a different object than input.
+ void CLChannelExtractKernel::configure(const ICLTensor *input, Channel channel, ICLTensor *output)
+ {
+     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::YUYV422, Format::UYVY422);
+     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+     ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output));
+
+     _input  = input;
+     _output = output;
+
+     // Check format
+     const Format format = input->info()->format();
+     ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
+
+     // Create kernel: the program is selected by the input format, the channel by a build define
+     std::string           kernel_name = "channel_extract_" + string_from_format(format);
+     std::set<std::string> build_opts  = { ("-DCHANNEL_" + string_from_channel(channel)) };
+     _kernel                           = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+     // Half the processed elements for U,V channels due to sub-sampling of 2
+     _subsampling = ((Format::YUYV422 == format || Format::UYVY422 == format) && Channel::Y != channel) ? 2 : 1;
+
+     // Configure window
+     Window                 win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration));
+     AccessWindowHorizontal input_access(input->info(), 0, _num_elems_processed_per_iteration);
+     // The output access window must describe the OUTPUT tensor: it is used both to request the
+     // output's padding and to publish the output's valid region. It was previously built on
+     // input->info(), which left the output untouched and overwrote the input's valid region.
+     AccessWindowRectangle output_access(output->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _subsampling, 1.f / _subsampling);
+
+     update_window_and_padding(win, input_access, output_access);
+
+     // Keep the input's anchor but use the full output shape as extent
+     ValidRegion input_valid_region = input->info()->valid_region();
+     output_access.set_valid_region(win, ValidRegion(std::move(input_valid_region.anchor), output->info()->tensor_shape()));
+
+     ICLKernel::configure(win);
+ }
+
+ // Configures the kernel to extract one channel of a multi-planar image into a U8 image.
+ //
+ // input:   source multi-image, asserted to be NV12, NV21, IYUV or YUV444.
+ // channel: channel to extract; selects which input plane is read.
+ // output:  2D destination image, asserted to be U8 and a different object than input.
+ void CLChannelExtractKernel::configure(const ICLMultiImage *input, Channel channel, ICLImage *output)
+ {
+     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
+     ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
+     ARM_COMPUTE_ERROR_ON(static_cast<const void *>(input) == static_cast<void *>(output));
+
+     // Get format
+     const Format fmt = input->info()->format();
+
+     // Get input plane
+     const ICLImage *input_plane = input->cl_plane(plane_idx_from_channel(fmt, channel));
+     ARM_COMPUTE_ERROR_ON(nullptr == input_plane);
+
+     _output      = output;
+     _input       = input_plane;
+     _subsampling = 1;
+
+     // Create kernel: a plain plane copy when the channel has a plane of its own (Y of any
+     // format, or any channel of IYUV/YUV444); otherwise a format-specific extraction kernel.
+     std::string           kernel_name;
+     std::set<std::string> build_opts;
+     if(Channel::Y == channel || Format::IYUV == fmt || Format::YUV444 == fmt)
+     {
+         kernel_name = "copy_plane";
+     }
+     else
+     {
+         kernel_name = "channel_extract_" + string_from_format(fmt);
+         build_opts.insert(("-DCHANNEL_" + string_from_channel(channel)));
+     }
+     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+     // Configure window
+     Window win = calculate_max_window(*input_plane->info(), Steps(_num_elems_processed_per_iteration));
+     // The output access window must describe the OUTPUT image so that its padding is requested
+     // and its valid region is set. It was previously built on input_plane->info(), so the
+     // output was never updated.
+     AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
+
+     update_window_and_padding(win,
+                               AccessWindowHorizontal(input_plane->info(), 0, _num_elems_processed_per_iteration),
+                               output_access);
+
+     output_access.set_valid_region(win, input_plane->info()->valid_region());
+
+     ICLKernel::configure(win);
+ }
+
+ // Enqueues the channel-extract kernel for every 2D slice of the given window.
+ //
+ // window: execution window; asserted below to be a valid sub-window of the configured one.
+ // queue:  command queue the kernel is enqueued on.
+ void CLChannelExtractKernel::run(const Window &window, cl::CommandQueue &queue)
+ {
+     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+     Window in_slice = window.first_slice_window_2D();
+
+     // The body always runs for the first slice; slide_window_slice_2D() decides about the next one
+     for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(in_slice))
+     {
+         // Output window: the input slice with its X and Y ranges divided by the sub-sampling
+         // factor (the Y step is forced to 1).
+         const Window::Dimension out_x(in_slice.x().start() / _subsampling, in_slice.x().end() / _subsampling, in_slice.x().step() / _subsampling);
+         const Window::Dimension out_y(in_slice.y().start() / _subsampling, in_slice.y().end() / _subsampling, 1);
+
+         Window out_slice(in_slice);
+         out_slice.set(Window::DimX, out_x);
+         out_slice.set(Window::DimY, out_y);
+
+         unsigned int arg_idx = 0;
+         add_2D_tensor_argument(arg_idx, _input, in_slice);
+         add_2D_tensor_argument(arg_idx, _output, out_slice);
+         enqueue(queue, *this, in_slice);
+     }
+ }
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
new file mode 100644
index 0000000000..ad66c39483
--- /dev/null
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+ // Default constructor: no tensors attached, value-initialised convolved dimensions.
+ CLCol2ImKernel::CLCol2ImKernel()
+     : _input(nullptr),
+       _output(nullptr),
+       _convolved_dims()
+ {
+ }
+
+ // Configures the col2im kernel.
+ //
+ // input:          source tensor, asserted to be single-channel F32.
+ // output:         destination tensor, asserted to be single-channel F32 with the input's data type.
+ // convolved_dims: pair of convolved dimensions; only the first element is passed to the
+ //                 kernel here, the whole pair is stored in the object.
+ void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims)
+ {
+     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+     _input          = input;
+     _output         = output;
+     _convolved_dims = convolved_dims;
+
+     // Build the kernel with the element type passed as a compile-time define
+     std::set<std::string> defines;
+     defines.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", defines));
+
+     // The static argument is placed right after the slots of one 2D and one 3D tensor,
+     // which run() fills in for the input and the output.
+     const unsigned int static_arg_idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
+     _kernel.setArg<cl_uint>(static_arg_idx, _convolved_dims.first);
+
+     // Configure window
+     Window win = calculate_max_window(*input->info(), Steps());
+     // The CLCol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
+     output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+     ICLKernel::configure(win);
+ }
+
+ // Enqueues the col2im kernel, walking the input as 2D slices and the output as 3D slices.
+ //
+ // window: execution window; asserted below to match the configured window.
+ // queue:  command queue the kernel is enqueued on.
+ void CLCol2ImKernel::run(const Window &window, cl::CommandQueue &queue)
+ {
+     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+     Window in_slice  = window.first_slice_window_2D();
+     Window out_slice = window.first_slice_window_3D();
+
+     // The body always runs once. Afterwards the 2D slice is advanced first and the 3D slice
+     // is only advanced when that succeeded; the loop stops as soon as either reports false.
+     for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(in_slice) && window.slide_window_slice_3D(out_slice))
+     {
+         // Set inputs
+         unsigned int arg_idx = 0;
+         add_2D_tensor_argument(arg_idx, _input, in_slice);
+         add_3D_tensor_argument(arg_idx, _output, out_slice);
+         enqueue(queue, *this, in_slice);
+     }
+ }
diff --git a/src/core/CL/kernels/CLColorConvertKernel.cpp b/src/core/CL/kernels/CLColorConvertKernel.cpp
new file mode 100644
index 0000000000..ead2b8f092
--- /dev/null
+++ b/src/core/CL/kernels/CLColorConvertKernel.cpp
@@ -0,0 +1,476 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLMultiImage.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <sstream>
+
+using namespace arm_compute;
+
+ // Default constructor: no single-plane or multi-planar input/output attached.
+ CLColorConvertKernel::CLColorConvertKernel()
+     : _input(nullptr),
+       _output(nullptr),
+       _multi_input(nullptr),
+       _multi_output(nullptr)
+ {
+ }
+
+ // Configures a colour conversion between two single-plane tensors.
+ //
+ // Supported conversions (any other pair triggers the error below):
+ //   RGBA8888 -> RGB888                       (16 elements per iteration)
+ //   RGB888   -> RGBA8888                     (16 elements per iteration)
+ //   UYVY422 / YUYV422 -> RGB888 / RGBA8888   (8 elements per iteration)
+ //
+ // input:  source tensor, must not be null.
+ // output: destination tensor, must not be null.
+ void CLColorConvertKernel::configure(const ICLTensor *input, ICLTensor *output)
+ {
+     ARM_COMPUTE_ERROR_ON(input == nullptr);
+     ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+     // Stays 0 when the format pair is not supported
+     unsigned int num_elems_processed_per_iteration = 0;
+     switch(input->info()->format())
+     {
+         case Format::RGBA8888:
+         {
+             switch(output->info()->format())
+             {
+                 case Format::RGB888:
+                     num_elems_processed_per_iteration = 16;
+                     break;
+                 default:
+                     break;
+             }
+             break;
+         }
+         case Format::UYVY422:
+         case Format::YUYV422:
+         {
+             switch(output->info()->format())
+             {
+                 case Format::RGB888:
+                 case Format::RGBA8888:
+                     num_elems_processed_per_iteration = 8;
+                     break;
+                 default:
+                     break;
+             }
+             break;
+         }
+         case Format::RGB888:
+         {
+             switch(output->info()->format())
+             {
+                 case Format::RGBA8888:
+                     num_elems_processed_per_iteration = 16;
+                     break;
+                 default:
+                     break;
+             }
+             break;
+         }
+         default:
+             break;
+     }
+     ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+                              string_from_format(input->info()->format()).c_str(),
+                              string_from_format(output->info()->format()).c_str());
+
+     // Kernel name is "<input format>_to_<output format>_bt709"
+     std::stringstream kernel_name;
+
+     kernel_name << string_from_format(input->info()->format());
+     kernel_name << "_to_";
+     kernel_name << string_from_format(output->info()->format());
+     kernel_name << "_bt709";
+
+     _input  = input;
+     _output = output;
+     // run() selects its code path from which of the four pointers are non-null, so clear the
+     // pair this overload does not use; otherwise an earlier configure() call through another
+     // overload would leave stale pointers behind and run() would take the wrong path.
+     _multi_input  = nullptr;
+     _multi_output = nullptr;
+
+     // Create kernel
+     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str()));
+
+     // Configure kernel window
+     Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+     AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+     update_window_and_padding(win, input_access, output_access);
+
+     output_access.set_valid_region(win, input->info()->valid_region());
+
+     ICLKernel::configure(win);
+ }
+
+ // Configures a colour conversion from a multi-planar image to a single-plane image.
+ //
+ // Supported conversions (any other pair triggers the error below):
+ //   NV12 / NV21 / IYUV -> RGB888 / RGBA8888   (4 elements per iteration)
+ //
+ // input:  source multi-image, must not be null.
+ // output: 2D destination image, must not be null.
+ void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLImage *output)
+ {
+     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+     ARM_COMPUTE_ERROR_ON(input == nullptr);
+     ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+     // Stays 0 when the format pair is not supported
+     unsigned int num_elems_processed_per_iteration = 0;
+
+     switch(input->info()->format())
+     {
+         case Format::NV12:
+         case Format::NV21:
+         case Format::IYUV:
+         {
+             switch(output->info()->format())
+             {
+                 case Format::RGB888:
+                 case Format::RGBA8888:
+                     num_elems_processed_per_iteration = 4;
+                     break;
+                 default:
+                     break;
+             }
+             break;
+         }
+         default:
+             break;
+     }
+     ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+                              string_from_format(input->info()->format()).c_str(),
+                              string_from_format(output->info()->format()).c_str());
+
+     // Kernel name is "<input format>_to_<output format>_bt709"
+     std::stringstream kernel_name;
+
+     kernel_name << string_from_format(input->info()->format());
+     kernel_name << "_to_";
+     kernel_name << string_from_format(output->info()->format());
+     kernel_name << "_bt709";
+
+     _multi_input = input;
+     _output      = output;
+     // run() selects its code path from which of the four pointers are non-null, so clear the
+     // pair this overload does not use; otherwise an earlier configure() call through another
+     // overload would leave stale pointers behind and run() would take the wrong path.
+     _input        = nullptr;
+     _multi_output = nullptr;
+
+     // Create kernel
+     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str()));
+
+     // Configure kernel window
+     const bool  has_two_planes = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21);
+     const float sub_sampling   = (has_two_planes || (input->info()->format() == Format::IYUV)) ? 0.5f : 1;
+
+     // The window is derived from the output and advances two rows at a time
+     Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+     win.set_dimension_step(Window::DimY, 2);
+
+     AccessWindowHorizontal plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
+     AccessWindowRectangle  plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1,
+                                          sub_sampling, sub_sampling);
+     // No access window is attached to input plane 2 for the two-plane formats
+     AccessWindowRectangle plane2_access(has_two_planes ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1,
+                                         sub_sampling, sub_sampling);
+     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+     update_window_and_padding(win,
+                               plane0_access, plane1_access, plane2_access,
+                               output_access);
+
+     // Only the anchor of the intersected input regions is used; the extent is the output shape
+     ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(),
+                                                            input->plane(2)->info()->valid_region());
+     output_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->info()->tensor_shape()));
+
+     ICLKernel::configure(win);
+ }
+
+ // Configures a colour conversion from a single-plane image to a multi-planar image.
+ //
+ // Supported conversions (any other pair triggers the error below):
+ //   RGB888 / RGBA8888 -> NV12 / IYUV    (2 elements per iteration)
+ //   RGB888 / RGBA8888 -> YUV444         (4 elements per iteration)
+ //   UYVY422 / YUYV422 -> NV12 / IYUV    (8 elements per iteration)
+ //
+ // input:  2D source image, must not be null.
+ // output: destination multi-image, must not be null.
+ void CLColorConvertKernel::configure(const ICLImage *input, ICLMultiImage *output)
+ {
+     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+     ARM_COMPUTE_ERROR_ON(input == nullptr);
+     ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+     // Stays 0 when the format pair is not supported
+     unsigned int num_elems_processed_per_iteration = 0;
+
+     const bool  has_two_planes = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21);
+     const float sub_sampling   = (has_two_planes || (output->info()->format() == Format::IYUV)) ? 0.5f : 1;
+
+     switch(input->info()->format())
+     {
+         case Format::RGB888:
+         case Format::RGBA8888:
+         {
+             switch(output->info()->format())
+             {
+                 case Format::NV12:
+                 case Format::IYUV:
+                     num_elems_processed_per_iteration = 2;
+                     break;
+                 case Format::YUV444:
+                     num_elems_processed_per_iteration = 4;
+                     break;
+                 default:
+                     break;
+             }
+             break;
+         }
+         case Format::UYVY422:
+         case Format::YUYV422:
+         {
+             switch(output->info()->format())
+             {
+                 case Format::NV12:
+                 case Format::IYUV:
+                     num_elems_processed_per_iteration = 8;
+                     break;
+                 default:
+                     break;
+             }
+             break;
+         }
+         default:
+             break;
+     }
+     ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+                              string_from_format(input->info()->format()).c_str(),
+                              string_from_format(output->info()->format()).c_str());
+
+     // Kernel name is "<input format>_to_<output format>_bt709"
+     std::stringstream kernel_name;
+
+     kernel_name << string_from_format(input->info()->format());
+     kernel_name << "_to_";
+     kernel_name << string_from_format(output->info()->format());
+     kernel_name << "_bt709";
+
+     _input        = input;
+     _multi_output = output;
+     // run() selects its code path from which of the four pointers are non-null, so clear the
+     // pair this overload does not use; otherwise an earlier configure() call through another
+     // overload would leave stale pointers behind and run() would take the wrong path.
+     _output      = nullptr;
+     _multi_input = nullptr;
+
+     // Create kernel
+     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str()));
+
+     // Configure kernel window: two rows per step, except for RGB888/RGBA8888 -> YUV444
+     Window     win                 = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+     const bool is_rgb_input        = (input->info()->format() == Format::RGB888) || (input->info()->format() == Format::RGBA8888);
+     const bool is_rgb_to_yuv444    = is_rgb_input && (output->info()->format() == Format::YUV444);
+     if(!is_rgb_to_yuv444)
+     {
+         win.set_dimension_step(Window::DimY, 2);
+     }
+
+     AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
+     AccessWindowRectangle  output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
+     // No access window is attached to output plane 2 for the two-plane formats
+     AccessWindowRectangle output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0,
+                                                num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
+
+     update_window_and_padding(win,
+                               AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+                               output_plane0_access,
+                               output_plane1_access,
+                               output_plane2_access);
+
+     // Only the input's anchor is propagated; extents are the output plane shapes
+     ValidRegion input_region = input->info()->valid_region();
+
+     output_plane0_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(0)->info()->tensor_shape()));
+     output_plane1_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(1)->info()->tensor_shape()));
+     output_plane2_access.set_valid_region(win, ValidRegion(input_region.anchor, output->plane(2)->info()->tensor_shape()));
+
+     ICLKernel::configure(win);
+ }
+
+ // Configures a colour conversion between two multi-planar images.
+ //
+ // Supported conversions (any other pair triggers the error below), 16 elements per iteration:
+ //   NV12 / NV21 -> IYUV / YUV444
+ //   IYUV        -> YUV444 / NV12
+ //
+ // input:  source multi-image, must not be null.
+ // output: destination multi-image, must not be null.
+ void CLColorConvertKernel::configure(const ICLMultiImage *input, ICLMultiImage *output)
+ {
+     ARM_COMPUTE_ERROR_ON(input == nullptr);
+     ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+     // Stays 0 when the format pair is not supported
+     unsigned int num_elems_processed_per_iteration = 0;
+     switch(input->info()->format())
+     {
+         case Format::NV12:
+         case Format::NV21:
+         {
+             switch(output->info()->format())
+             {
+                 case Format::IYUV:
+                 case Format::YUV444:
+                     num_elems_processed_per_iteration = 16;
+                     break;
+                 default:
+                     break;
+             }
+             break;
+         }
+         case Format::IYUV:
+         {
+             switch(output->info()->format())
+             {
+                 case Format::YUV444:
+                 case Format::NV12:
+                     num_elems_processed_per_iteration = 16;
+                     break;
+                 default:
+                     break;
+             }
+             break;
+         }
+         default:
+             break;
+     }
+     ARM_COMPUTE_ERROR_ON_MSG(num_elems_processed_per_iteration == 0, "Conversion from %s to %s not supported",
+                              string_from_format(input->info()->format()).c_str(),
+                              string_from_format(output->info()->format()).c_str());
+
+     // Kernel name is "<input format>_to_<output format>_bt709"
+     std::stringstream kernel_name;
+
+     kernel_name << string_from_format(input->info()->format());
+     kernel_name << "_to_";
+     kernel_name << string_from_format(output->info()->format());
+     kernel_name << "_bt709";
+
+     _multi_input  = input;
+     _multi_output = output;
+     // run() selects its code path from which of the four pointers are non-null, so clear the
+     // pair this overload does not use; otherwise an earlier configure() call through another
+     // overload would leave stale pointers behind and run() would take the wrong path.
+     _input  = nullptr;
+     _output = nullptr;
+
+     // Create kernel
+     const bool has_two_input_planars  = (input->info()->format() == Format::NV12) || (input->info()->format() == Format::NV21);
+     const bool has_two_output_planars = (output->info()->format() == Format::NV12) || (output->info()->format() == Format::NV21);
+
+     const float sub_sampling_input  = (has_two_input_planars || (input->info()->format() == Format::IYUV)) ? 0.5f : 1;
+     const float sub_sampling_output = (has_two_output_planars || (output->info()->format() == Format::IYUV)) ? 0.5f : 1;
+
+     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name.str()));
+
+     // The window is derived from input plane 0 and advances two rows at a time
+     Window win = calculate_max_window(*input->cl_plane(0)->info(), Steps(num_elems_processed_per_iteration));
+     win.set_dimension_step(Window::DimY, 2);
+
+     AccessWindowHorizontal input_plane0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
+     AccessWindowRectangle  input_plane1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1,
+                                                sub_sampling_input, sub_sampling_input);
+     // No access window is attached to plane 2 of a two-plane input or output
+     AccessWindowRectangle input_plane2_access(has_two_input_planars ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1,
+                                               sub_sampling_input, sub_sampling_input);
+     AccessWindowHorizontal output_plane0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
+     AccessWindowRectangle  output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output);
+     AccessWindowRectangle  output_plane2_access(has_two_output_planars ? nullptr : output->plane(2)->info(), 0, 0,
+                                                 num_elems_processed_per_iteration, 1, sub_sampling_output, sub_sampling_output);
+
+     update_window_and_padding(win,
+                               input_plane0_access, input_plane1_access, input_plane2_access,
+                               output_plane0_access, output_plane1_access, output_plane2_access);
+
+     // Only the anchor of the intersected input regions is used; extents are the output plane shapes
+     ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), input->plane(1)->info()->valid_region(),
+                                                            input->plane(2)->info()->valid_region());
+     output_plane0_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(0)->info()->tensor_shape()));
+     output_plane1_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(1)->info()->tensor_shape()));
+     output_plane2_access.set_valid_region(win, ValidRegion(intersect_region.anchor, output->plane(2)->info()->tensor_shape()));
+
+     ICLKernel::configure(win);
+ }
+
+/** Bind the configured images to the kernel and enqueue it once per 2D slice of @p window.
+ *
+ * The first matching combination of (_input | _multi_input) and (_output | _multi_output) decides
+ * which tensors are bound; if no combination matches, "Not supported" is raised.
+ *
+ * @param[in]     window Region to process; validated against the window stored on the kernel.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLColorConvertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Window used for the planes after the first one of a multi-planar image.
+    // For NV12, NV21 and IYUV the X start/end/step and the Y start/end are halved and the Y step is set to 1;
+    // for every other format the window is an unmodified copy of the full window.
+    const auto secondary_plane_window = [](const Window &full, Format format) -> Window
+    {
+        Window     result(full);
+        const bool is_subsampled = (Format::NV12 == format) || (Format::NV21 == format) || (Format::IYUV == format);
+        if(is_subsampled)
+        {
+            result.set(Window::DimX, Window::Dimension(full.x().start() / 2, full.x().end() / 2, full.x().step() / 2));
+            result.set(Window::DimY, Window::Dimension(full.y().start() / 2, full.y().end() / 2, 1));
+        }
+        return result;
+    };
+
+    Window slice = window.first_slice_window_2D();
+
+    if(nullptr != _input && nullptr != _output)
+    {
+        // Single tensor -> single tensor
+        do
+        {
+            unsigned int arg_idx = 0;
+            add_2D_tensor_argument(arg_idx, _input, slice);
+            add_2D_tensor_argument(arg_idx, _output, slice);
+            enqueue(queue, *this, slice);
+        }
+        while(window.slide_window_slice_2D(slice));
+    }
+    else if(nullptr != _input && nullptr != _multi_output)
+    {
+        // Single tensor -> multi-planar image
+        const Format out_format = _multi_output->info()->format();
+        do
+        {
+            const Window out_uv_window = secondary_plane_window(slice, out_format);
+
+            unsigned int arg_idx = 0;
+            add_2D_tensor_argument(arg_idx, _input, slice);
+            add_2D_tensor_argument(arg_idx, _multi_output->cl_plane(0), slice);
+            // Planes 1 and 2 are bound only while their info reports a non-zero number of dimensions
+            for(int plane = 1; plane < 3; ++plane)
+            {
+                if(0 == _multi_output->cl_plane(plane)->info()->num_dimensions())
+                {
+                    break;
+                }
+                add_2D_tensor_argument(arg_idx, _multi_output->cl_plane(plane), out_uv_window);
+            }
+            enqueue(queue, *this, slice);
+        }
+        while(window.slide_window_slice_2D(slice));
+    }
+    else if(nullptr != _multi_input && nullptr != _output)
+    {
+        // Multi-planar image -> single tensor
+        const Format in_format = _multi_input->info()->format();
+        do
+        {
+            const Window in_uv_window = secondary_plane_window(slice, in_format);
+
+            unsigned int arg_idx = 0;
+            add_2D_tensor_argument(arg_idx, _multi_input->cl_plane(0), slice);
+            for(int plane = 1; plane < 3; ++plane)
+            {
+                if(0 == _multi_input->cl_plane(plane)->info()->num_dimensions())
+                {
+                    break;
+                }
+                add_2D_tensor_argument(arg_idx, _multi_input->cl_plane(plane), in_uv_window);
+            }
+            add_2D_tensor_argument(arg_idx, _output, slice);
+            enqueue(queue, *this, slice);
+        }
+        while(window.slide_window_slice_2D(slice));
+    }
+    else if(nullptr != _multi_input && nullptr != _multi_output)
+    {
+        // Multi-planar image -> multi-planar image
+        const Format in_format  = _multi_input->info()->format();
+        const Format out_format = _multi_output->info()->format();
+        do
+        {
+            const Window in_uv_window  = secondary_plane_window(slice, in_format);
+            const Window out_uv_window = secondary_plane_window(slice, out_format);
+
+            unsigned int arg_idx = 0;
+
+            // All input planes are bound first ...
+            add_2D_tensor_argument(arg_idx, _multi_input->cl_plane(0), slice);
+            for(int plane = 1; plane < 3; ++plane)
+            {
+                if(0 == _multi_input->cl_plane(plane)->info()->num_dimensions())
+                {
+                    break;
+                }
+                add_2D_tensor_argument(arg_idx, _multi_input->cl_plane(plane), in_uv_window);
+            }
+
+            // ... followed by all output planes
+            add_2D_tensor_argument(arg_idx, _multi_output->cl_plane(0), slice);
+            for(int plane = 1; plane < 3; ++plane)
+            {
+                if(0 == _multi_output->cl_plane(plane)->info()->num_dimensions())
+                {
+                    break;
+                }
+                add_2D_tensor_argument(arg_idx, _multi_output->cl_plane(plane), out_uv_window);
+            }
+            enqueue(queue, *this, slice);
+        }
+        while(window.slide_window_slice_2D(slice));
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not supported");
+    }
+}
diff --git a/src/core/CL/kernels/CLConvolutionKernel.cpp b/src/core/CL/kernels/CLConvolutionKernel.cpp
new file mode 100644
index 0000000000..bdfe398a1d
--- /dev/null
+++ b/src/core/CL/kernels/CLConvolutionKernel.cpp
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+#define MAX_MATRIX_SIZE 81
+
+/****************************************************************************************\
+ * Square Convolution *
+\****************************************************************************************/
+
+/** Border needed by the square convolution: a BorderSize built from matrix_size / 2 (integer division). */
+template <unsigned int matrix_size>
+BorderSize CLConvolutionKernel<matrix_size>::border_size() const
+{
+    constexpr unsigned int half_matrix = matrix_size / 2;
+    return BorderSize(half_matrix);
+}
+
+/** Build the static square convolution kernel and set up its execution window.
+ *
+ * The coefficients, the scale and the data types are all passed to the OpenCL program as
+ * build options (-DMAT<i>, -DSCALE, -DDATA_TYPE, -DDATA_TYPE_OUT).
+ *
+ * @param[in]  input            Source tensor (single channel, U8).
+ * @param[out] output           Destination tensor (single channel, U8 or S16).
+ * @param[in]  conv             matrix_size x matrix_size coefficients; must not be nullptr.
+ * @param[in]  scale            Scale passed as -DSCALE; 0 means "compute it with calculate_matrix_scale()".
+ * @param[in]  border_undefined Forwarded to the window and valid region computation.
+ */
+template <unsigned int matrix_size>
+void CLConvolutionKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON(conv == nullptr);
+
+    _input  = input;
+    _output = output;
+
+    // A zero scale is replaced by the one derived from the matrix itself
+    if(scale == 0)
+    {
+        scale = calculate_matrix_scale(conv, matrix_size);
+    }
+
+    constexpr unsigned int num_coefficients = matrix_size * matrix_size;
+
+    // Collect the build options
+    std::set<std::string> build_opts;
+    for(unsigned int k = 0; k < num_coefficients; ++k)
+    {
+        build_opts.insert("-DMAT" + val_to_string(k) + "=" + val_to_string(conv[k]));
+    }
+    build_opts.insert("-DSCALE=" + val_to_string(scale));
+
+    const DataType matrix_data_type = data_type_for_convolution_matrix(conv, num_coefficients);
+    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(matrix_data_type));
+    build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+    // Create kernel, e.g. "convolution3x3_static" for matrix_size == 3
+    const std::string kernel_name = "convolution" + val_to_string(matrix_size) + "x" + val_to_string(matrix_size) + "_static";
+    _kernel                       = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int elems_per_iteration = 8;
+    constexpr unsigned int elems_written       = 8;
+    constexpr unsigned int elems_read          = 16;
+    constexpr unsigned int rows_read           = matrix_size;
+
+    Window win = calculate_max_window(*input->info(), Steps(elems_per_iteration), border_undefined, border_size());
+
+    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, elems_read, rows_read);
+    AccessWindowHorizontal output_access(output->info(), 0, elems_written);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+/****************************************************************************************\
+ * Separable Convolution *
+\****************************************************************************************/
+template <unsigned int matrix_size> // Default constructor of the horizontal separable convolution kernel.
+CLSeparableConvolutionHorKernel<matrix_size>::CLSeparableConvolutionHorKernel()
+ : _border_size(0) // Zero border until configure() assigns the real one.
+{
+}
+
+template <unsigned int matrix_size> // Accessor for the border stored on the kernel.
+BorderSize CLSeparableConvolutionHorKernel<matrix_size>::border_size() const
+{
+ return _border_size; // Value set by the constructor (0) or by configure().
+}
+
+/** Build the horizontal pass of the separable convolution and set up its execution window.
+ *
+ * @param[in]  input            Source tensor (single channel, U8).
+ * @param[out] output           Destination tensor (single channel, U16, S16 or S32).
+ * @param[in]  conv             The matrix_size coefficients of the row vector; must not be nullptr.
+ * @param[in]  border_undefined When true the top/bottom border is 0, otherwise matrix_size / 2.
+ *                              The left/right border is always matrix_size / 2.
+ */
+template <unsigned int matrix_size>
+void CLSeparableConvolutionHorKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
+    // conv is read by memcpy below: reject a null pointer like the other convolution kernels do
+    ARM_COMPUTE_ERROR_ON(conv == nullptr);
+
+    ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9));
+
+    _input       = input;
+    _output      = output;
+    _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
+
+    // Set build options
+    std::set<std::string> build_opts;
+
+    // Only the first matrix_size entries carry coefficients, the remaining ones stay zero.
+    // Every one of the matrix_size * matrix_size MAT<j> macros is still defined.
+    int16_t mat[matrix_size * matrix_size] = { 0 };
+    memcpy(mat, conv, matrix_size * sizeof(int16_t));
+
+    for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
+    {
+        build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+    }
+
+    // This pass always builds with a scale of 0
+    build_opts.insert("-DSCALE=0");
+
+    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable1x" + val_to_string(matrix_size) + "_static", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 16;
+    constexpr unsigned int num_elems_written_per_iteration   = 8;
+
+    Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    AccessWindowHorizontal input_access(input->info(), -border_size().left, num_elems_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+/** Border needed by the vertical pass: BorderSize(matrix_size / 2, 0). */
+template <unsigned int matrix_size>
+BorderSize CLSeparableConvolutionVertKernel<matrix_size>::border_size() const
+{
+    constexpr unsigned int half_matrix = matrix_size / 2;
+    return BorderSize(half_matrix, 0);
+}
+
+/** Build the vertical pass of the separable convolution and set up its execution window.
+ *
+ * @param[in]  input            Source tensor (single channel, U16, S16 or S32).
+ * @param[out] output           Destination tensor (single channel, U8 or S16).
+ * @param[in]  conv             The matrix_size coefficients of the column vector; must not be nullptr.
+ * @param[in]  scale            Scale passed as -DSCALE; must not be 0.
+ * @param[in]  border_undefined Forwarded to the window and valid region computation.
+ * @param[in]  data_type        Data type passed to the kernel as -DCOMPUTE_TYPE.
+ */
+template <unsigned int matrix_size>
+void CLSeparableConvolutionVertKernel<matrix_size>::configure(const ICLTensor *input, ICLTensor *output,
+                                                              const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    // conv is read by memcpy below: reject a null pointer like the other convolution kernels do
+    ARM_COMPUTE_ERROR_ON(conv == nullptr);
+    ARM_COMPUTE_ERROR_ON((matrix_size != 5) && (matrix_size != 7) && (matrix_size != 9));
+    ARM_COMPUTE_ERROR_ON(scale == 0);
+
+    _input  = input;
+    _output = output;
+
+    std::set<std::string> build_opts;
+
+    // The coefficients are stored in entries [matrix_size, 2 * matrix_size); every other entry stays zero.
+    // Every one of the matrix_size * matrix_size MAT<j> macros is still defined.
+    int16_t mat[matrix_size * matrix_size] = { 0 };
+    memcpy(mat + matrix_size, conv, matrix_size * sizeof(int16_t));
+
+    for(unsigned int j = 0; j < matrix_size * matrix_size; j++)
+    {
+        build_opts.insert("-DMAT" + val_to_string(j) + "=" + val_to_string(mat[j]));
+    }
+
+    build_opts.insert("-DSCALE=" + val_to_string(scale));
+
+    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+
+    build_opts.insert("-DCOMPUTE_TYPE=" + get_cl_type_from_data_type(data_type));
+
+    std::stringstream out_type;
+    out_type << "-DDATA_TYPE_OUT=" << get_cl_type_from_data_type(output->info()->data_type());
+    build_opts.insert(out_type.str());
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_separable" + val_to_string(matrix_size) + "x1_static", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_written_per_iteration   = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 8;
+    constexpr unsigned int num_rows_read_per_iteration       = matrix_size;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    AccessWindowRectangle  input_access(input->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+/****************************************************************************************\
+ * Rectangle Convolution *
+\****************************************************************************************/
+
+CLConvolutionRectangleKernel::CLConvolutionRectangleKernel() // Default constructor: leaves the kernel unconfigured.
+ : _border_size(0), _input(nullptr), _output(nullptr) // Zero border and no tensors until configure() sets them.
+{
+}
+
+BorderSize CLConvolutionRectangleKernel::border_size() const // Accessor for the border stored on the kernel.
+{
+ return _border_size; // configure() sets it to BorderSize(height / 2, width / 2).
+}
+
+/** Build the rectangular convolution kernel and set up its execution window.
+ *
+ * @param[in]  input            Source tensor (single channel, U8).
+ * @param[out] output           Destination tensor (single channel, U8 or S16).
+ * @param[in]  conv             width * height coefficients; must not be nullptr.
+ * @param[in]  width            Matrix width: 3, 5, 7 or 9.
+ * @param[in]  height           Matrix height: 3, 5, 7 or 9.
+ * @param[in]  scale            Scale passed as -DSCALE; must not be 0.
+ * @param[in]  border_undefined Forwarded to the window and valid region computation.
+ */
+void CLConvolutionRectangleKernel::configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON(nullptr == conv);
+    ARM_COMPUTE_ERROR_ON(3 != width && 5 != width && 7 != width && 9 != width);
+    ARM_COMPUTE_ERROR_ON(3 != height && 5 != height && 7 != height && 9 != height);
+    ARM_COMPUTE_ERROR_ON(0 == scale);
+
+    _input       = input;
+    _output      = output;
+    _border_size = BorderSize(height / 2, width / 2);
+
+    const uint32_t num_coefficients = width * height;
+
+    // All MAX_MATRIX_SIZE MAT<j> macros are defined; entries past width * height are zero
+    int16_t coefficients[MAX_MATRIX_SIZE] = { 0 };
+    memcpy(coefficients, conv, num_coefficients * sizeof(int16_t));
+
+    std::set<std::string> build_opts;
+    build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+
+    for(unsigned int k = 0; k < MAX_MATRIX_SIZE; ++k)
+    {
+        build_opts.insert("-DMAT" + val_to_string(k) + "=" + val_to_string(coefficients[k]));
+    }
+
+    build_opts.insert("-DSCALE=" + val_to_string(scale));
+
+    const DataType matrix_data_type = data_type_for_convolution_matrix(conv, num_coefficients);
+    build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(matrix_data_type));
+
+    build_opts.insert("-DMATRIX_WIDTH=" + val_to_string(width));
+    build_opts.insert("-DMATRIX_HEIGHT=" + val_to_string(height));
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution_rectangle", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int elems_per_iteration = 8;
+    constexpr unsigned int elems_read          = 16;
+    constexpr unsigned int elems_written       = 8;
+    const unsigned int     rows_read           = height;
+
+    Window win = calculate_max_window(*input->info(), Steps(elems_per_iteration), border_undefined, border_size());
+
+    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, elems_read, rows_read);
+    AccessWindowHorizontal output_access(output->info(), 0, elems_written);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the rectangular convolution once per 2D slice of @p window.
+ *
+ * @param[in]     window Region to process; validated against the window stored on the kernel.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLConvolutionRectangleKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window current_slice = window.first_slice_window_2D();
+
+    // The first slice is always processed; the loop ends once the window cannot slide any further
+    bool has_slice = true;
+    while(has_slice)
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, current_slice);
+        add_2D_tensor_argument(arg_idx, _output, current_slice);
+        enqueue(queue, *this, current_slice);
+
+        has_slice = window.slide_window_slice_2D(current_slice);
+    }
+}
+
+// Explicit instantiations: square convolution (3x3, 5x5, 7x7, 9x9)
+template class arm_compute::CLConvolutionKernel<3>;
+template class arm_compute::CLConvolutionKernel<5>;
+template class arm_compute::CLConvolutionKernel<7>;
+template class arm_compute::CLConvolutionKernel<9>;
+// Explicit instantiations: separable convolution, horizontal pass (sizes 5, 7, 9)
+template class arm_compute::CLSeparableConvolutionHorKernel<5>;
+template class arm_compute::CLSeparableConvolutionHorKernel<7>;
+template class arm_compute::CLSeparableConvolutionHorKernel<9>;
+// Explicit instantiations: separable convolution, vertical pass (sizes 5, 7, 9)
+template class arm_compute::CLSeparableConvolutionVertKernel<5>;
+template class arm_compute::CLSeparableConvolutionVertKernel<7>;
+template class arm_compute::CLSeparableConvolutionVertKernel<9>;
diff --git a/src/core/CL/kernels/CLDepthConcatenateKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
new file mode 100644
index 0000000000..73f1ba15df
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthConcatenateKernel.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthConcatenateKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLDepthConcatenateKernel::CLDepthConcatenateKernel() // Default constructor: leaves the kernel unconfigured.
+ : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0) // No tensors and zero borders until configure() sets them.
+{
+}
+
+/** Border of the kernel, built from the top/bottom and left/right values computed by configure(). */
+BorderSize CLDepthConcatenateKernel::border_size() const
+{
+    const BorderSize border(_top_bottom, _left_right);
+    return border;
+}
+
+/** Build the "concatenate_depth" kernel and set up its execution window.
+ *
+ * The input is placed in the output at depth @p depth_offset. Half of the width/height difference
+ * between output and input is used as left/right and top/bottom border respectively.
+ *
+ * @param[in]  input        Source tensor (single channel, F32). Its width and height must not exceed the output's.
+ * @param[in]  depth_offset Index along dimension 2 of the output where the input starts.
+ * @param[out] output       Destination tensor (single channel, F32).
+ */
+void CLDepthConcatenateKernel::configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0));
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output);
+
+    // The width and height differences between output and input must be even,
+    // otherwise the padding cannot be split equally between the two sides of the input tensor
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2);
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2);
+
+    _input  = input;
+    _output = output;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth"));
+
+    // Borders: half of the size difference in each of the two lowest dimensions
+    _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2;
+    _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2;
+
+    // Byte offset of the first output element written, passed to the kernel as an extra argument
+    const auto        &out_strides                       = output->info()->strides_in_bytes();
+    const unsigned int offset_to_first_elements_in_bytes = depth_offset * out_strides[2] + _left_right * out_strides[0] + _top_bottom * out_strides[1];
+
+    // Configure kernel window
+    const unsigned int elems_per_iteration = 4;
+    const unsigned int elems_read          = 4;
+    const unsigned int rows_read           = 1;
+
+    // The window is derived from the input because all of its depths are copied
+    Window win = calculate_max_enlarged_window(*input->info(), Steps(elems_per_iteration), border_size());
+
+    AccessWindowRectangle  input_access(input->info(), -_left_right, -_top_bottom, elems_read, rows_read);
+    AccessWindowHorizontal output_access(output->info(), 0, elems_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+    // The offset goes right after the input and output tensor arguments
+    const unsigned int offset_arg_idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg<unsigned int>(offset_arg_idx, offset_to_first_elements_in_bytes);
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the depth concatenation once per 2D slice of @p window.
+ *
+ * @param[in]     window Region to process; validated against the window stored on the kernel.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLDepthConcatenateKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window current_slice = window.first_slice_window_2D();
+
+    // The first slice is always processed; the loop ends once the window cannot slide any further
+    bool has_slice = true;
+    while(has_slice)
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, current_slice);
+        add_2D_tensor_argument(arg_idx, _output, current_slice);
+        enqueue(queue, *this, current_slice);
+
+        has_slice = window.slide_window_slice_2D(current_slice);
+    }
+}
diff --git a/src/core/CL/kernels/CLDepthConvertKernel.cpp b/src/core/CL/kernels/CLDepthConvertKernel.cpp
new file mode 100644
index 0000000000..24608bd17c
--- /dev/null
+++ b/src/core/CL/kernels/CLDepthConvertKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDepthConvertKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Build the depth conversion kernel ("convert_depth_down" or "convert_depth_up") and configure it.
+ *
+ * @param[in]  input  Source tensor (single channel, U8, S16, U16, U32 or S32).
+ * @param[out] output Destination tensor; must be a different tensor with a different data type.
+ * @param[in]  policy Conversion policy; only used when the input element is larger than the output element
+ *                    (WRAP selects -DWRAP, anything else -DSATURATE).
+ * @param[in]  shift  Value passed to the kernel as its shift argument; must be lower than 8.
+ */
+void CLDepthConvertKernel::configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::U16, DataType::U32, DataType::S32);
+    ARM_COMPUTE_ERROR_ON(input == output);
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data types must be different");
+    ARM_COMPUTE_ERROR_ON(shift >= 8);
+
+    // Check if conversion is supported
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::U16 && output->info()->data_type() != DataType::S16
+                                                                            && output->info()->data_type() != DataType::U32 && output->info()->data_type() != DataType::S32),
+                             "Only data types supported [in] U8 -> [out] U16, S16, U32, S32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
+                                                                             && output->info()->data_type() != DataType::S32),
+                             "Only data types supported [in] U16 -> [out] U8, U32, S32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32
+                                                                             && output->info()->data_type() != DataType::S32),
+                             "Only data types supported [in] S16 -> [out] U8, U32, S32");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
+                                                                             && output->info()->data_type() != DataType::S16),
+                             "Only data types supported [in] U32 -> [out] U8, U16, S16");
+
+    ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S32 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U16
+                                                                             && output->info()->data_type() != DataType::S16),
+                             "Only data types supported [in] S32 -> [out] U8, U16, S16");
+
+    const DataType in_type  = input->info()->data_type();
+    const DataType out_type = output->info()->data_type();
+
+    // A conversion is "down" when the input element is larger than the output element
+    const size_t in_elem_size  = data_size_from_type(in_type);
+    const size_t out_elem_size = data_size_from_type(out_type);
+    const bool   is_down       = in_elem_size > out_elem_size;
+
+    // Construct kernel name and build options
+    const std::string     kernel_name = std::string("convert_depth") + (is_down ? "_down" : "_up");
+    std::set<std::string> build_opts;
+    if(is_down)
+    {
+        // The overflow policy only matters when narrowing
+        build_opts.insert((policy == ConvertPolicy::WRAP) ? "-DWRAP" : "-DSATURATE");
+    }
+    build_opts.insert("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(in_type));
+    build_opts.insert("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(out_type));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Set shift arg: it goes right after the input and output tensor arguments
+    const unsigned int shift_arg_idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg(shift_arg_idx, shift);
+
+    // Configure kernel
+    constexpr unsigned int elems_per_iteration = 16;
+    ICLSimple2DKernel::configure(input, output, elems_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLDerivativeKernel.cpp b/src/core/CL/kernels/CLDerivativeKernel.cpp
new file mode 100644
index 0000000000..36ba06d528
--- /dev/null
+++ b/src/core/CL/kernels/CLDerivativeKernel.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+CLDerivativeKernel::CLDerivativeKernel() // Default constructor: leaves the kernel unconfigured.
+ : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_derivative_x(false), _run_derivative_y(false) // No tensors and no derivative selected until configure() runs.
+{
+}
+
+/** Border needed by the derivative kernel: a BorderSize built from the single value 1. */
+BorderSize CLDerivativeKernel::border_size() const
+{
+    constexpr unsigned int border = 1;
+    return BorderSize(border);
+}
+
+/** Build the "derivative" kernel for the requested gradients and set up its execution window.
+ *
+ * @param[in]  input            Source tensor (single channel, U8).
+ * @param[out] output_x         Destination for the X gradient (single channel, S16); nullptr to skip it.
+ * @param[out] output_y         Destination for the Y gradient (single channel, S16); nullptr to skip it.
+ *                              At least one of the two outputs must be provided.
+ * @param[in]  border_undefined Forwarded to the window and valid region computation.
+ */
+void CLDerivativeKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    _run_derivative_x = output_x != nullptr;
+    _run_derivative_y = output_y != nullptr;
+
+    // Validate each requested output and select the matching build option
+    std::set<std::string> build_opts;
+
+    if(_run_derivative_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+        build_opts.insert("-DGRAD_X");
+    }
+
+    if(_run_derivative_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+        build_opts.insert("-DGRAD_Y");
+    }
+
+    _input    = input;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("derivative", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int elems_per_iteration = 16;
+    constexpr unsigned int rows_read           = 3;
+
+    Window win = calculate_max_window(*input->info(), Steps(elems_per_iteration), border_undefined, border_size());
+
+    // A missing output gets an access window built on a null info
+    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, elems_per_iteration);
+    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, elems_per_iteration);
+
+    // The input access pattern depends on which gradients are computed
+    AccessWindowRectangle input_access(input->info(), 0, 0, 0, 0);
+    if(_run_derivative_x && _run_derivative_y)
+    {
+        // Both gradients: start one element left and one row up, read three rows
+        input_access = AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, elems_per_iteration, rows_read);
+    }
+    else if(_run_derivative_x)
+    {
+        // X only: horizontal access starting one element to the left
+        input_access = AccessWindowHorizontal(input->info(), -border_size().left, elems_per_iteration);
+    }
+    else if(_run_derivative_y)
+    {
+        // Y only: start one row up, read three rows
+        input_access = AccessWindowRectangle(input->info(), 0, -border_size().top, elems_per_iteration, rows_read);
+    }
+
+    update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the derivative kernel once per 2D slice of @p window.
+ *
+ * The input is always bound first; each output is bound only if its gradient was requested in configure().
+ *
+ * @param[in]     window Region to process; validated against the window stored on the kernel.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLDerivativeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window current_slice = window.first_slice_window_2D();
+
+    // The first slice is always processed; the loop ends once the window cannot slide any further
+    bool has_slice = true;
+    while(has_slice)
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, current_slice);
+
+        if(_run_derivative_x)
+        {
+            add_2D_tensor_argument(arg_idx, _output_x, current_slice);
+        }
+
+        if(_run_derivative_y)
+        {
+            add_2D_tensor_argument(arg_idx, _output_y, current_slice);
+        }
+
+        enqueue(queue, *this, current_slice);
+
+        has_slice = window.slide_window_slice_2D(current_slice);
+    }
+}
diff --git a/src/core/CL/kernels/CLDilateKernel.cpp b/src/core/CL/kernels/CLDilateKernel.cpp
new file mode 100644
index 0000000000..3abd747011
--- /dev/null
+++ b/src/core/CL/kernels/CLDilateKernel.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+/** Border used by the dilate kernel.
+ *
+ * @return A BorderSize constructed from the single value 1.
+ */
+BorderSize CLDilateKernel::border_size() const
+{
+    const BorderSize border(1);
+    return border;
+}
+
+/** Create the "dilate" OpenCL kernel and compute its execution window and padding.
+ *
+ * @param[in]  input            Source tensor; asserted to be single-channel U8.
+ * @param[out] output           Destination tensor; asserted to be single-channel U8.
+ * @param[in]  border_undefined Forwarded to calculate_max_window() and set_valid_region().
+ */
+void CLDilateKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    _input  = input;
+    _output = output;
+
+    // Build the OpenCL kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dilate"));
+
+    // Per iteration: 8 elements are processed, a block of 16 elements x 3 rows is read
+    constexpr unsigned int elems_processed = 8;
+    constexpr unsigned int elems_read      = 16;
+    constexpr unsigned int rows_read       = 3;
+
+    const BorderSize border = border_size();
+
+    Window win = calculate_max_window(*input->info(), Steps(elems_processed), border_undefined, border);
+
+    // The input rectangle starts one border to the left of / above the current position
+    AccessWindowHorizontal output_access(output->info(), 0, elems_processed);
+    AccessWindowRectangle  input_access(input->info(), -border.left, -border.top, elems_read, rows_read);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLErodeKernel.cpp b/src/core/CL/kernels/CLErodeKernel.cpp
new file mode 100644
index 0000000000..a7aa88fc5c
--- /dev/null
+++ b/src/core/CL/kernels/CLErodeKernel.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+/** Border used by the erode kernel.
+ *
+ * @return A BorderSize constructed from the single value 1.
+ */
+BorderSize CLErodeKernel::border_size() const
+{
+    const BorderSize border(1);
+    return border;
+}
+
+/** Create the "erode" OpenCL kernel and compute its execution window and padding.
+ *
+ * @param[in]  input            Source tensor; asserted to be single-channel U8.
+ * @param[out] output           Destination tensor; asserted to be single-channel U8.
+ * @param[in]  border_undefined Forwarded to calculate_max_window() and set_valid_region().
+ */
+void CLErodeKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("erode"));
+
+    _input  = input;
+    _output = output;
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 16;
+    constexpr unsigned int num_rows_read_per_iteration       = 3;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+    // The input rectangle starts one border to the left of / above the current position
+    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLFastCornersKernel.cpp b/src/core/CL/kernels/CLFastCornersKernel.cpp
new file mode 100644
index 0000000000..1d4d776730
--- /dev/null
+++ b/src/core/CL/kernels/CLFastCornersKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor: the kernel starts with null input and output pointers. */
+CLFastCornersKernel::CLFastCornersKernel()
+    : ICLKernel(),
+      _input(nullptr),
+      _output(nullptr)
+{
+}
+
+/** Border used by the FAST corners kernel.
+ *
+ * @return A BorderSize constructed from the single value 3.
+ */
+BorderSize CLFastCornersKernel::border_size() const
+{
+    const BorderSize border(3);
+    return border;
+}
+
+/** Create the "fast_corners" OpenCL kernel, bind its threshold and compute the execution window.
+ *
+ * @param[in]  input               Source image; asserted to be 2D, single-channel U8.
+ * @param[out] output              Destination image; asserted to be 2D, single-channel U8.
+ * @param[in]  threshold           Value bound as the kernel's cl_float argument after the two tensors.
+ * @param[in]  non_max_suppression When true the kernel is built with -DUSE_MAXSUPPRESSION.
+ * @param[in]  border_mode         Only BorderMode::UNDEFINED is accepted; anything else triggers "Not implemented".
+ */
+void CLFastCornersKernel::configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode)
+{
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MSG(border_mode != BorderMode::UNDEFINED, "Not implemented");
+
+    _input  = input;
+    _output = output;
+
+    // Collect the compile-time options for the OpenCL program
+    std::set<std::string> build_opts;
+    if(non_max_suppression)
+    {
+        build_opts.emplace("-DUSE_MAXSUPPRESSION");
+    }
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("fast_corners", build_opts));
+
+    // The threshold sits right after the arguments of the input and output tensors
+    const unsigned int threshold_idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg<cl_float>(threshold_idx, threshold);
+
+    // Per iteration: 1 element is processed, a block of 7 elements x 3 rows is declared as read.
+    // NOTE(review): the border is 3 on every side but the access rectangle is only 3 rows tall,
+    // starting at -border.top — confirm against the rows the OpenCL kernel actually reads.
+    constexpr unsigned int elems_processed = 1;
+    constexpr unsigned int elems_read      = 7;
+    constexpr unsigned int rows_read       = 3;
+
+    const bool       border_undefined = (border_mode == BorderMode::UNDEFINED);
+    const BorderSize border           = border_size();
+
+    Window win = calculate_max_window(*input->info(), Steps(elems_processed), border_undefined, border);
+
+    AccessWindowHorizontal output_access(output->info(), 0, elems_processed);
+    AccessWindowRectangle  input_access(input->info(), -border.left, -border.top, elems_read, rows_read);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the FAST corners kernel for every 2D slice of the execution window.
+ *
+ * @param[in]     window Region to process; asserted to be a valid sub-window of the configured window.
+ * @param[in,out] queue  OpenCL command queue the kernel is enqueued on.
+ */
+void CLFastCornersKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+    for(;;)
+    {
+        // Bind input then output, starting again from argument 0 for each slice
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        enqueue(queue, *this, plane);
+
+        if(!window.slide_window_slice_2D(plane))
+        {
+            break;
+        }
+    }
+}
+
+/** Default constructor: input, corner array and counter buffer pointers all start as null. */
+CLCopyToArrayKernel::CLCopyToArrayKernel()
+    : ICLKernel(),
+      _input(nullptr),
+      _corners(nullptr),
+      _num_buffer(nullptr)
+{
+}
+
+/** Create the "copy_to_keypoint" OpenCL kernel, bind its static arguments and compute the window.
+ *
+ * @param[in]  input         Source image; asserted to be 2D, single-channel U8.
+ * @param[in]  update_number When true the kernel is built with -DUPDATE_NUMBER.
+ * @param[out] corners       Key point array; its capacity and OpenCL buffer are bound as kernel arguments. Must not be null.
+ * @param[out] num_buffers   OpenCL buffer bound as a kernel argument and reset to zero by run(). Must not be null.
+ */
+void CLCopyToArrayKernel::configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers)
+{
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(corners == nullptr);
+    ARM_COMPUTE_ERROR_ON(num_buffers == nullptr);
+
+    _input      = input;
+    _corners    = corners;
+    _num_buffer = num_buffers;
+
+    // Collect the compile-time options for the OpenCL program
+    std::set<std::string> build_opts;
+    if(update_number)
+    {
+        build_opts.emplace("-DUPDATE_NUMBER");
+    }
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("copy_to_keypoint", build_opts));
+
+    // X coordinate of the valid region's anchor, i.e. pixels skipped along x by previous stages
+    const unsigned int x_offset = _input->info()->valid_region().anchor.x();
+
+    // Static arguments follow the input tensor's arguments, in this exact order
+    unsigned int arg_idx = num_arguments_per_2D_tensor(); // Skip the input tensor parameters
+    _kernel.setArg<unsigned int>(arg_idx++, corners->max_num_values());
+    _kernel.setArg<cl_uint>(arg_idx++, x_offset);
+    _kernel.setArg(arg_idx++, *_num_buffer);
+    _kernel.setArg(arg_idx++, _corners->cl_buffer());
+
+    // One element is processed per iteration
+    constexpr unsigned int elems_processed = 1;
+
+    Window                 win = calculate_max_window(*input->info(), Steps(elems_processed));
+    AccessWindowHorizontal input_access(input->info(), 0, elems_processed);
+    update_window_and_padding(win, input_access);
+
+    ICLKernel::configure(win);
+}
+
+/** Reset the counter buffer and enqueue the copy kernel for every 2D slice of the window.
+ *
+ * @param[in]     window Region to process; asserted to be a valid sub-window of the configured window.
+ * @param[in,out] queue  OpenCL command queue used for the buffer write and the kernel.
+ */
+void CLCopyToArrayKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // _num_buffer is used as both input and output, so it is zeroed before the kernel runs.
+    // The write is non-blocking (CL_FALSE); the source is static so it stays valid after this call returns.
+    static const unsigned int zero_init = 0;
+    queue.enqueueWriteBuffer(*_num_buffer, CL_FALSE, 0, sizeof(unsigned int), &zero_init);
+
+    Window plane = window.first_slice_window_2D();
+    for(;;)
+    {
+        // Only the input tensor is bound per slice; the other arguments were set in configure()
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, plane);
+        enqueue(queue, *this, plane);
+
+        if(!window.slide_window_slice_2D(plane))
+        {
+            break;
+        }
+    }
+}
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
new file mode 100644
index 0000000000..981aad665a
--- /dev/null
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstdint>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor: no tensor is attached until configure() is called. */
+CLFillBorderKernel::CLFillBorderKernel()
+    : ICLKernel(),
+      _tensor(nullptr)
+{
+}
+
+/** Report whether this kernel may be parallelised.
+ *
+ * @return Always false.
+ */
+bool CLFillBorderKernel::is_parallelisable() const
+{
+    constexpr bool parallelisable = false;
+    return parallelisable;
+}
+
+/** Extract the constant border value as type T and bind it as a kernel argument.
+ *
+ * @tparam T Type the pixel value is read as and bound with.
+ *
+ * @param[in] idx                   Argument index to bind at. Taken by value, so the caller's index is not advanced.
+ * @param[in] constant_border_value Pixel value to read the constant from.
+ */
+template <class T>
+void CLFillBorderKernel::set_constant_border(unsigned int idx, const PixelValue &constant_border_value)
+{
+    T border_value;
+    constant_border_value.get(border_value);
+    ICLKernel::add_argument<T>(idx, border_value);
+}
+
+/** Select and build the border-filling OpenCL kernel and bind its static arguments.
+ *
+ * Returns early, leaving the kernel untouched, when the border (after being limited to the
+ * tensor's padding) is empty or when the border mode is BorderMode::UNDEFINED.
+ *
+ * @param[in,out] tensor                Tensor whose border is filled; must not be null and must have one channel.
+ * @param[in]     border_size           Requested border; limited to the tensor's padding before use.
+ * @param[in]     border_mode           Border mode; its lower-cased name selects the kernel.
+ * @param[in]     constant_border_value Value bound as an extra argument when the mode is BorderMode::CONSTANT.
+ */
+void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+    ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1);
+
+    // The border can never exceed the padding available in the tensor
+    border_size.limit(tensor->info()->padding());
+
+    // If there is no border: early exit
+    const bool nothing_to_fill = border_size.empty() || (border_mode == BorderMode::UNDEFINED);
+    if(nothing_to_fill)
+    {
+        return;
+    }
+
+    // Kernel name is derived from the border mode
+    const std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
+
+    // Select type required by replicate border > 1: floating point types use "int" (F32) or "short" (otherwise)
+    const DataType    dt      = tensor->info()->data_type();
+    const std::string cl_type = get_cl_type_from_data_type(dt);
+    std::string       select_type(cl_type);
+    if(is_data_type_float(dt))
+    {
+        select_type = (DataType::F32 == dt) ? "int" : "short";
+    }
+
+    // Compile-time options for the OpenCL program
+    std::set<std::string> build_opts;
+    build_opts.emplace("-DDATA_TYPE=" + cl_type);
+    build_opts.emplace("-DSELECT_TYPE=" + select_type);
+    build_opts.emplace("-DBORDER_SIZE_TOP=" + val_to_string(border_size.top));
+    build_opts.emplace("-DBORDER_SIZE_BOTTOM=" + val_to_string(border_size.bottom));
+    build_opts.emplace("-DBORDER_SIZE_LEFT=" + val_to_string(border_size.left));
+    build_opts.emplace("-DBORDER_SIZE_RIGHT=" + val_to_string(border_size.right));
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+    _tensor = tensor;
+
+    // Values describing the valid region of the tensor
+    const auto        &valid        = tensor->info()->valid_region();
+    const unsigned int valid_width  = valid.shape[0];
+    const unsigned int valid_height = valid.shape[1];
+    const cl_int2      valid_region_coords =
+    {
+        {
+            static_cast<cl_int>(valid.anchor[0]),
+            static_cast<cl_int>(valid.anchor[1]),
+        }
+    };
+    const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
+
+    // Static arguments come after the tensor's own arguments
+    unsigned int arg_idx = num_arguments_per_2D_tensor(); //Skip the tensor parameters
+    ICLKernel::add_argument<cl_uint>(arg_idx, valid_width);
+    ICLKernel::add_argument<cl_uint>(arg_idx, valid_height);
+    ICLKernel::add_argument<cl_int2>(arg_idx, valid_region_coords);
+
+    // Constant mode needs one more argument, typed after the tensor's data type
+    if(BorderMode::CONSTANT == border_mode)
+    {
+        switch(dt)
+        {
+            case DataType::U8:
+                set_constant_border<uint8_t>(arg_idx, constant_border_value);
+                break;
+            case DataType::U16:
+                set_constant_border<uint16_t>(arg_idx, constant_border_value);
+                break;
+            case DataType::S16:
+                set_constant_border<int16_t>(arg_idx, constant_border_value);
+                break;
+            case DataType::U32:
+                set_constant_border<uint32_t>(arg_idx, constant_border_value);
+                break;
+            case DataType::S32:
+                set_constant_border<int32_t>(arg_idx, constant_border_value);
+                break;
+            case DataType::F32:
+                static_assert(sizeof(float) == 4, "Float must be 32 bit");
+                set_constant_border<float>(arg_idx, constant_border_value);
+                break;
+            case DataType::F16:
+                static_assert(sizeof(cl_half) == 2, "Half must be 16 bit");
+                set_constant_border<cl_half>(arg_idx, constant_border_value);
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Not handled");
+        }
+    }
+
+    // Window: X spans total_valid_width + valid_height, Y is a single row, Z follows the tensor
+    Window win;
+    win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height));
+    win.set(Window::DimY, Window::Dimension(0, 1, 1));
+    win.use_tensor_dimensions(tensor->info(), Window::DimZ);
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the border-filling kernel for every 2D slice of the window.
+ *
+ * Does nothing when no OpenCL kernel has been created, which is what configure() leaves
+ * behind for an undefined border mode or an empty border.
+ *
+ * @param[in]     window Region to process; asserted to match the configured window.
+ * @param[in,out] queue  OpenCL command queue the kernel is enqueued on.
+ */
+void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    // Border mode undefined or border width == 0
+    if(_kernel() == nullptr)
+    {
+        return;
+    }
+
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+    for(;;)
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _tensor, plane);
+        // cl::NullRange is passed as the last enqueue() argument
+        enqueue(queue, *this, plane, cl::NullRange);
+
+        if(!window.slide_window_slice_2D(plane))
+        {
+            break;
+        }
+    }
+}
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
new file mode 100644
index 0000000000..71d42c5606
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+/** Default constructor: the kernel starts with null input and output pointers. */
+CLGEMMInterleave4x4Kernel::CLGEMMInterleave4x4Kernel()
+    : _input(nullptr),
+      _output(nullptr)
+{
+}
+
+/** Create the interleave kernel matching the element size and compute the execution window.
+ *
+ * The output is asserted to have dimension(0) equal to four times the input's dimension(0)
+ * and dimension(1) equal to the input's dimension(1) divided by four, rounded up.
+ *
+ * @param[in]  input  Source tensor; single-channel U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[out] output Destination tensor; same data type as @p input.
+ */
+void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4);
+    // Integer ceil-division: exact for every size, whereas a round trip through float is not
+    ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != (input->info()->dimension(1) + 3) / 4);
+
+    _input  = input;
+    _output = output;
+
+    // Create kernel: the name suffix is the element size in bits, e.g. "gemm_interleave4x4_32bit"
+    std::string data_type_name;
+    data_type_name = val_to_string(input->info()->element_size() * 8) + "bit";
+    _kernel        = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_interleave4x4_" + data_type_name));
+
+    // Configure kernel window: each iteration covers 4 rows and writes everything it processed onto one output row
+    const unsigned int     num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input->info()->data_type());
+    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+    const unsigned int     num_elems_written_per_iteration     = num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y;
+
+    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    // The output access is scaled by 4 along x and by 0.25 along y relative to the window
+    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, 1, 4.f, 0.25f);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the interleave kernel for every 2D slice, using a rescaled slice for the output.
+ *
+ * @param[in]     window Region to process; asserted to be a valid sub-window of the configured window.
+ * @param[in,out] queue  OpenCL command queue the kernel is enqueued on.
+ */
+void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    /*
+     * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
+     *         |a00 a01 a02 a03|
+     *         |a10 a11 a12 a13|
+     *         |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 |
+     *         |a30 a31 a32 a33|
+     *
+     * As asserted in configure(), the output has dimension(0) = 4 * input dimension(0) and
+     * dimension(1) = input dimension(1) / 4 (rounded up).
+     */
+    Window src_slice = window.first_slice_window_2D();
+    Window dst_slice = window.first_slice_window_2D();
+
+    // The output slice is stretched by 4 along x and shrunk by 4 along y
+    dst_slice.scale(Window::DimX, 4.f);
+    dst_slice.scale(Window::DimY, 0.25f);
+
+    for(;;)
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, src_slice);
+        add_2D_tensor_argument(arg_idx, _output, dst_slice);
+        enqueue(queue, *this, src_slice);
+
+        // Both slices advance together; the output slice is only moved if the input slice could be
+        if(!(window.slide_window_slice_2D(src_slice) && window.slide_window_slice_2D(dst_slice)))
+        {
+            break;
+        }
+    }
+}
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..c6e05b92a2
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+/** Default constructor: both inputs and the output start as null pointers. */
+CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel()
+    : _input0(nullptr),
+      _input1(nullptr),
+      _output(nullptr)
+{
+}
+
+/** Create the "gemm_mm_u8" OpenCL kernel, bind its integer parameters and compute the window.
+ *
+ * The five int32_t parameters are bound, in declaration order, right after the arguments
+ * of the three tensors.
+ *
+ * @param[in]  input0          First input tensor; asserted to be single-channel U8.
+ * @param[in]  input1          Second input tensor; asserted to be single-channel U8. Its dimension(0) is passed as -DWIDTH_MATRIX_B.
+ * @param[out] output          Output tensor; asserted to be single-channel U8.
+ * @param[in]  a_offset        First static kernel argument.
+ * @param[in]  b_offset        Second static kernel argument.
+ * @param[in]  output_offset   Third static kernel argument.
+ * @param[in]  output_mult_int Fourth static kernel argument.
+ * @param[in]  shift           Fifth static kernel argument.
+ */
+void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
+                                               int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+
+    _input0 = input0;
+    _input1 = input1;
+    _output = output;
+
+    // Create kernel; the width of matrix B is a compile-time constant of the program
+    std::set<std::string> build_opts = { ("-DWIDTH_MATRIX_B=" + val_to_string(input1->info()->dimension(0))) };
+    _kernel                          = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_u8", build_opts));
+
+    // Static arguments start after the two inputs and the output
+    unsigned int arg_idx = 3 * num_arguments_per_2D_tensor();
+    _kernel.setArg<int32_t>(arg_idx++, a_offset);
+    _kernel.setArg<int32_t>(arg_idx++, b_offset);
+    _kernel.setArg<int32_t>(arg_idx++, output_offset);
+    _kernel.setArg<int32_t>(arg_idx++, output_mult_int);
+    _kernel.setArg<int32_t>(arg_idx++, shift);
+
+    // Per iteration: a 16x4 block of the output is processed; 4 elements of input0 and 16 of input1 are read
+    constexpr unsigned int out_elems_x  = 16;
+    constexpr unsigned int out_elems_y  = 4;
+    constexpr unsigned int in0_elems_rd = 4;
+    constexpr unsigned int in1_elems_rd = 16;
+
+    // The window is derived from the output tensor
+    Window win = calculate_max_window(*output->info(), Steps(out_elems_x, out_elems_y));
+
+    AccessWindowRectangle input0_access(input0->info(), 0, 0, in0_elems_rd, 1);
+    AccessWindowRectangle input1_access(input1->info(), 0, 0, in1_elems_rd, 1);
+    AccessWindowRectangle output_access(output->info(), 0, 0, out_elems_x, out_elems_y);
+
+    update_window_and_padding(win, input0_access, input1_access, output_access);
+
+    // The valid region is anchored at (0, 0) and has the output's full tensor shape
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the low-precision matrix multiply kernel for every 2D slice of the window.
+ *
+ * @param[in]     window Region to process; asserted to be a valid sub-window of the configured window.
+ * @param[in,out] queue  OpenCL command queue the kernel is enqueued on.
+ */
+void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    // Fixed slice covering the first two dimensions of matrix B, with a single step along z
+    Window whole_matrix_b = slice;
+    whole_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1));
+    whole_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1));
+    whole_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    for(;;)
+    {
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        Window slice_b = (_input1->info()->num_dimensions() < 3) ? whole_matrix_b : slice;
+
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input0, slice);
+        add_2D_tensor_argument(arg_idx, _input1, slice_b);
+        add_2D_tensor_argument(arg_idx, _output, slice);
+        enqueue(queue, *this, slice);
+
+        if(!window.slide_window_slice_2D(slice))
+        {
+            break;
+        }
+    }
+}
diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
new file mode 100644
index 0000000000..289873c23f
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+/** Default constructor: accumulator and biases pointers start as null. */
+CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel()
+    : _accum(nullptr),
+      _biases(nullptr)
+{
+}
+
+/** Create the bias accumulation kernel for the accumulator's data type and compute the window.
+ *
+ * @param[in,out] accum  Accumulator tensor; asserted to be single-channel F16 or F32.
+ * @param[in]     biases Biases tensor; asserted to be 1-dimensional with the same data type as @p accum.
+ */
+void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+    ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1);
+
+    _accum  = accum;
+    _biases = biases;
+
+    // Create kernel: the name suffix is the lower-cased data type name
+    const DataType    dt        = accum->info()->data_type();
+    const std::string type_name = lower_string(string_from_data_type(dt));
+    _kernel                     = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_accumulate_biases_" + type_name));
+
+    // Number of elements per iteration: maximum CL vector width divided by the data size of the type
+    const unsigned int elems_per_iteration = max_cl_vector_width / data_size_from_type(dt);
+
+    Window win = calculate_max_window(*accum->info(), Steps(elems_per_iteration));
+
+    // The biases are accessed through a static window spanning their dimension(0) x dimension(1)
+    AccessWindowStatic     biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1));
+    AccessWindowHorizontal accum_access(accum->info(), 0, elems_per_iteration);
+
+    update_window_and_padding(win, biases_access, accum_access);
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the bias accumulation kernel for every 2D slice of the accumulator.
+ *
+ * @param[in]     window Region to process; asserted to match the configured window.
+ * @param[in,out] queue  OpenCL command queue the kernel is enqueued on.
+ */
+void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    Window accum_plane = window.first_slice_window_2D();
+
+    // The biases slice is derived once from the first accumulator slice, with Y set to
+    // a single row, and is reused unchanged for every iteration
+    Window biases_row(accum_plane);
+    biases_row.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    // Run kernel
+    for(;;)
+    {
+        // Set arguments: accumulator as a 2D tensor, biases as a 1D tensor
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _accum, accum_plane);
+        add_1D_tensor_argument(arg_idx, _biases, biases_row);
+
+        enqueue(queue, *this, accum_plane);
+
+        if(!window.slide_window_slice_2D(accum_plane))
+        {
+            break;
+        }
+    }
+}
diff --git a/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
new file mode 100644
index 0000000000..343838f2f9
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixAdditionKernel.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAdditionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLGEMMMatrixAdditionKernel::CLGEMMMatrixAdditionKernel() // Default constructor: both tensor pointers start as nullptr; configure() assigns them
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMMatrixAdditionKernel::configure(const ICLTensor *input, ICLTensor *output, const float beta) // Checks the tensors, builds the "gemm_ma_<data type>" kernel with -DBETA=beta and sets the execution window
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0)); // input and output must agree in dimension 0 ...
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1)); // ... and in dimension 1
+
+ _input = input;
+ _output = output;
+ const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type()); // Elements per work-item: max_cl_vector_width divided by data_size_from_type() of the data type
+
+ std::ostringstream ma_arguments;
+ ma_arguments << "-DBETA=" << beta; // beta becomes a build-time define, written with the stream's default float formatting
+ std::set<std::string> build_opts;
+ build_opts.emplace(ma_arguments.str());
+
+ // Create kernel: the kernel name is "gemm_ma_" followed by the lower-cased data type name
+ std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_ma_" + data_type_name), build_opts));
+
+ // Configure kernel window: computed from the input tensor, stepping num_elems_processed_per_iteration along X
+ Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region()); // The output valid region is derived from the input's valid region
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMMatrixAdditionKernel::run(const Window &window, cl::CommandQueue &queue) // Enqueues the kernel on queue once per 2D slice of window
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+
+ do // Input and output are both addressed with the same slice
+ {
+ unsigned int idx = 0; // Kernel argument index restarts at 0 for every slice
+ add_2D_tensor_argument(idx, _input, slice);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..d7388e8579
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/AccessWindowTranspose.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel() // Default constructor: all three tensor pointers start as nullptr; configure() assigns them
+ : _input0(nullptr), _input1(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha) // Selects and builds the vector-matrix or matrix-matrix kernel and sets the execution window
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ if(output->info()->dimension(1) == 1) // Shape check applied only when the output is a vector (dimension 1 equal to 1)
+ {
+ ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); // input0's dimension 0 must equal input1's dimension 1
+ }
+
+ _input0 = input0;
+ _input1 = input1;
+ _output = output;
+
+ if(output->info()->dimension(1) == 196) // NOTE(review): the reason for special-casing 196 is not visible here - presumably a tuned local work-group size for one workload; confirm
+ {
+ _lws_hint = cl::NDRange(1, 7);
+ }
+ else
+ {
+ _lws_hint = cl::NDRange(8, 8);
+ }
+
+ std::ostringstream mm_arguments;
+ mm_arguments << "-DWIDTH_MATRIX_B=" << input1->info()->dimension(0) << " ";
+ mm_arguments << "-DALPHA=" << alpha << " "; // alpha becomes a build-time define, written with the stream's default float formatting
+ std::set<std::string> build_opts;
+
+ // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
+ if(output->info()->dimension(1) == 1)
+ {
+ mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " ";
+ build_opts.emplace(mm_arguments.str());
+
+ // Create kernel: "gemm_vm_" followed by the lower-cased data type name
+ std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(("gemm_vm_" + data_type_name), build_opts));
+
+ // Configure window kernel: step along X only, computed from the output tensor
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x));
+
+ AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+ AccessWindowRectangle input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, 1);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); // Valid region passed in: anchored at (0, 0) with the full output tensor shape
+
+ ICLKernel::configure(win);
+ }
+ else
+ {
+ build_opts.emplace(mm_arguments.str());
+
+ // Create kernel: F32 uses a per-architecture kernel name, other types use "gemm_mm_<data type>"
+ std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type()));
+
+ if(data_type_name == "f32")
+ {
+ GPUTarget arch_target = get_arch_from_target(get_target());
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_f32_" + string_from_target(arch_target), build_opts));
+ }
+ else
+ {
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_" + data_type_name, build_opts));
+ }
+
+ // Configure window kernel: step along X as in the vector case, and 4 along Y
+ const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+ constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+ Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f); // NOTE(review): trailing 1.f / 0.25f look like X / Y window scale factors - confirm against AccessWindowRectangle
+ AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f);
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+ update_window_and_padding(win, input0_access, input1_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); // Valid region passed in: anchored at (0, 0) with the full output tensor shape
+
+ ICLKernel::configure(win);
+ }
+}
+
+void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue) // Enqueues the kernel on queue once per 2D slice of window, using the configured local work-group size hint
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window slice = window.first_slice_window_2D();
+ Window slice_matrix_b = slice; // Fixed window over matrix B: full extent in X and Y, and only [0, 1) in Z
+ slice_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1));
+ slice_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1));
+ slice_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+ do
+ {
+ Window slice_b = slice;
+ // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+ // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+ if(_input1->info()->num_dimensions() < 3)
+ {
+ slice_b = slice_matrix_b;
+ }
+
+ unsigned int idx = 0; // Kernel argument index restarts at 0 for every slice
+ add_2D_tensor_argument(idx, _input0, slice);
+ add_2D_tensor_argument(idx, _input1, slice_b);
+ add_2D_tensor_argument(idx, _output, slice);
+ enqueue(queue, *this, slice, _lws_hint);
+ }
+ while(window.slide_window_slice_2D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
new file mode 100644
index 0000000000..ecee1abd72
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMTranspose1xWKernel.cpp
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+void CLGEMMTranspose1xWKernel::configure(const ICLTensor *input, ICLTensor *output) // Derives the output shape, auto-initialises output if empty, builds the transpose kernel and sets the execution window
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ TensorShape output_shape{ input->info()->tensor_shape() };
+ const size_t transpose_w = 16 / input->info()->element_size(); // W of the 1xW transposition: 16 divided by the element size
+ output_shape.set(0, input->info()->dimension(1) * transpose_w); // Output dimension 0 = input dimension 1 * W
+ output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w))))); // Output dimension 1 = ceil(input dimension 0 / W)
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+
+ _input = input;
+ _output = output;
+ const unsigned int num_elems_processed_per_iteration = max_cl_vector_width / data_size_from_type(input->info()->data_type());
+
+ /*
+ * The following is an example of how the transposition1xW works when the input data type is F32
+ *
+ * |a00 a01 a02 a03|
+ * |a10 a11 a12 a13|
+ * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 |
+ * |a30 a31 a32 a33|
+ *
+ * If the input data type is F32, the output matrix will have the following shape: [ height * 4, width / 4 ]
+ * If the input data type is F16, the output matrix will have the following shape: [ height * 8, width / 8 ]
+ */
+ // Create kernel: the name is "gemm_transpose1x<elements per iteration>_<data type>"
+ std::string data_type_name = lower_string(string_from_data_type(input->info()->data_type()));
+ std::string kernel_name = "gemm_transpose1x" + val_to_string(num_elems_processed_per_iteration) + "_" + data_type_name;
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Configure window: computed from the input tensor, stepping num_elems_processed_per_iteration along X
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+
+ float scale_x = 1.f; // Set per data type below: U8 -> 16, F16 -> 8, F32 -> 4
+
+ switch(input->info()->data_type())
+ {
+ case DataType::U8:
+ scale_x = 16.f;
+ break;
+ case DataType::F16:
+ scale_x = 8.f;
+ break;
+ case DataType::F32:
+ scale_x = 4.f;
+ break;
+ default:
+ // Do nothing: scale_x keeps its initial value of 1.f
+ break;
+ }
+
+ AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x); // The last two arguments are reciprocal: scale_x and 1 / scale_x
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); // Valid region passed in: anchored at (0, 0) with the full output tensor shape
+
+ ICLKernel::configure(win);
+}
+
+void CLGEMMTranspose1xWKernel::run(const Window &window, cl::CommandQueue &queue) // Enqueues the kernel on queue once per 2D input slice, pairing it with the matching slice of the transposed output window
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Output is transposed: the output window takes the input window's Y dimension as X and its X dimension as Y
+ Window out_window(window);
+ out_window.set(Window::DimX, window.y());
+ out_window.set(Window::DimY, window.x());
+
+ Window in_slice = window.first_slice_window_2D();
+ Window out_slice = out_window.first_slice_window_2D();
+
+ do // The enqueue range is the input slice; the output slice is only used to set the output tensor argument
+ {
+ unsigned int idx = 0; // Kernel argument index restarts at 0 for every slice
+ add_2D_tensor_argument(idx, _input, in_slice);
+ add_2D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice, _lws_hint);
+ }
+ while(window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
diff --git a/src/core/CL/kernels/CLGaussian3x3Kernel.cpp b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
new file mode 100644
index 0000000000..e5bc3f9656
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussian3x3Kernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+BorderSize CLGaussian3x3Kernel::border_size() const // Returns the border this kernel uses: BorderSize(1)
+{
+ return BorderSize(1);
+}
+
+void CLGaussian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) // Builds "convolution3x3_static" with fixed Gaussian coefficients and sets the execution window
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+ _input = input;
+ _output = output;
+
+ // Set build options: coefficients MAT0..MAT8 are 1 2 1 / 2 4 2 / 1 2 1; SCALE=16 equals their sum
+ std::set<std::string> build_opts = { "-DMAT0=1", "-DMAT1=2", "-DMAT2=1",
+ "-DMAT3=2", "-DMAT4=4", "-DMAT5=2",
+ "-DMAT6=1", "-DMAT7=2", "-DMAT8=1",
+ "-DSCALE=16", "-DDATA_TYPE_OUT=uchar"
+ };
+
+ // Create kernel: the generic static 3x3 convolution kernel, specialised through the build options above
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("convolution3x3_static", build_opts));
+
+ // Configure kernel window: 8 elements processed and written per iteration, reading a 16 x 3 input rectangle
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 16;
+ constexpr unsigned int num_rows_read_per_iteration = 3;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+ AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); // The read rectangle starts one border to the left of and above the current position
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+ update_window_and_padding(win, input_access, output_access);
+
+ output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+ ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLGaussian5x5Kernel.cpp b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
new file mode 100644
index 0000000000..bd523c883d
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussian5x5Kernel.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
+
+#include <cstdint>
+
+using namespace arm_compute;
+
+void CLGaussian5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) // Configures the horizontal pass with the 5-tap Gaussian coefficients
+{
+ const int16_t matrix[] = { 1, 4, 6, 4, 1 }; // 5-tap coefficients; they sum to 16
+
+ // Delegate to CLSeparableConvolution5x5HorKernel::configure with the Gaussian coefficients
+ CLSeparableConvolution5x5HorKernel::configure(input, output, matrix, border_undefined);
+}
+
+void CLGaussian5x5VertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) // Configures the vertical pass with the 5-tap Gaussian coefficients and the overall scale
+{
+ const uint32_t scale = 256; // 256 = 16 * 16, the square of the coefficient sum
+ const int16_t matrix[] = { 1, 4, 6, 4, 1 }; // 5-tap coefficients; they sum to 16
+
+ // Delegate to CLSeparableConvolution5x5VertKernel::configure with the Gaussian coefficients and scale
+ CLSeparableConvolution5x5VertKernel::configure(input, output, matrix, scale, border_undefined);
+}
diff --git a/src/core/CL/kernels/CLGaussianPyramidKernel.cpp b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
new file mode 100644
index 0000000000..34a228c717
--- /dev/null
+++ b/src/core/CL/kernels/CLGaussianPyramidKernel.cpp
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+CLGaussianPyramidHorKernel::CLGaussianPyramidHorKernel() // Default constructor: zero border and zero load offset; configure() sets the real values
+ : _border_size(0), _l2_load_offset(0)
+{
+}
+
+BorderSize CLGaussianPyramidHorKernel::border_size() const // Returns the border stored by configure() (zero before configuration)
+{
+ return _border_size;
+}
+
+void CLGaussianPyramidHorKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) // Sets up the "gaussian1x5_sub_x" kernel: U8 input, U16 output of half the input width
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 2 * output->info()->dimension(0)); // Input dimension 0 must be exactly twice the output's
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) // All remaining dimensions must match as well
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ _input = input;
+ _output = output;
+ _border_size = BorderSize(border_undefined ? 0 : 2, 2); // NOTE(review): presumably (top/bottom, left/right), i.e. no vertical border when the border is undefined - confirm against BorderSize
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gaussian1x5_sub_x"));
+
+ // Configure kernel window: each iteration covers 16 input elements (reading 20) and writes 8 outputs, matching scale_x = 0.5
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+ constexpr unsigned int num_elems_read_per_iteration = 20;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr float scale_x = 0.5f;
+
+ Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x);
+
+ // Sub sampling selects odd pixels (1, 3, 5, ...) for images with even
+ // width and even pixels (0, 2, 4, ...) for images with odd width. (Whether
+ // a pixel is even or odd is determined based on the tensor shape not the
+ // valid region!)
+ // Thus the offset from which the first pixel (L2) for the convolution is
+ // loaded depends on the anchor and shape of the valid region.
+ // In the case of an even shape (= even image width) we need to load L2
+ // from -2 if the anchor is odd and from -1 if the anchor is even. That
+ // makes sure that L2 is always loaded from an odd pixel.
+ // On the other hand, for an odd shape (= odd image width) we need to load
+ // L2 from -1 if the anchor is odd and from -2 if the anchor is even to
+ // achieve the opposite effect.
+ // The condition can be simplified to checking whether anchor + shape is
+ // odd (-2) or even (-1) as only adding an odd and an even number will have
+ // an odd result.
+ _l2_load_offset = -border_size().left;
+
+ if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0)
+ {
+ _l2_load_offset += 1;
+ }
+
+ update_window_and_padding(win,
+ AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = input->info()->valid_region(); // Output valid region: start from the input's and rescale anchor and extent along X
+ valid_region.anchor.set(0, std::ceil((valid_region.anchor[0] + (border_undefined ? border_size().left : 0)) / 2.f));
+ valid_region.shape.set(0, (valid_region.shape[0] - (border_undefined ? border_size().right : 0)) / 2 - valid_region.anchor[0]); // Uses the anchor already updated on the previous line
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLGaussianPyramidHorKernel::run(const Window &window, cl::CommandQueue &queue) // Enqueues the kernel on queue once per 2D output slice, with the input window shifted by the configured load offset
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window win_in(window); // Input window: shifted along X by _l2_load_offset, which configure() computed
+ win_in.shift(Window::DimX, _l2_load_offset);
+
+ // The output is half the width of the input:
+ Window win_out(window);
+ win_out.scale(Window::DimX, 0.5f);
+
+ Window slice_in = win_in.first_slice_window_2D();
+ Window slice_out = win_out.first_slice_window_2D();
+
+ do // The enqueue range is the output slice
+ {
+ unsigned int idx = 0; // Kernel argument index restarts at 0 for every slice
+ add_2D_tensor_argument(idx, _input, slice_in);
+ add_2D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ }
+ while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out));
+}
+
+CLGaussianPyramidVertKernel::CLGaussianPyramidVertKernel() // Default constructor: zero load offset; configure() sets the real value
+ : _t2_load_offset(0)
+{
+}
+
+BorderSize CLGaussianPyramidVertKernel::border_size() const // Returns the fixed border BorderSize(2, 0); configure() reads its top and bottom members
+{
+ return BorderSize(2, 0);
+}
+
+void CLGaussianPyramidVertKernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined) // Sets up the "gaussian5x1_sub_y" kernel: U16 input, U8 output of half the input height
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != 2 * output->info()->dimension(1)); // Input dimension 1 must be exactly twice the output's
+
+ for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) // All remaining dimensions must match as well
+ {
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
+ }
+
+ _input = input;
+ _output = output;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gaussian5x1_sub_y"));
+
+ // Configure kernel window: steps of 8 elements by 2 rows; 8 x 5 input rectangle read per iteration
+ constexpr unsigned int num_elems_processed_per_iteration = 8;
+ constexpr unsigned int num_rows_processed_per_iteration = 2;
+ constexpr unsigned int num_elems_written_per_iteration = 8;
+ constexpr unsigned int num_elems_read_per_iteration = 8;
+ constexpr unsigned int num_rows_per_iteration = 5;
+ constexpr float scale_y = 0.5f;
+
+ Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration),
+ border_undefined, border_size());
+ AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_per_iteration, 1.f, scale_y);
+
+ // Determine whether we need to load even or odd rows. The rule mirrors the
+ // horizontal pyramid kernel: the offset depends on the parity of anchor + shape of the valid region along Y.
+ _t2_load_offset = -border_size().top;
+
+ if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0)
+ {
+ _t2_load_offset += 1;
+ }
+
+ update_window_and_padding(win,
+ AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_per_iteration),
+ output_access);
+
+ ValidRegion valid_region = input->info()->valid_region(); // Output valid region: start from the input's and rescale anchor and extent along Y
+ valid_region.anchor.set(1, std::ceil((valid_region.anchor[1] + (border_undefined ? border_size().top : 0)) / 2.f));
+ valid_region.shape.set(1, (valid_region.shape[1] - (border_undefined ? border_size().bottom : 0)) / 2 - valid_region.anchor[1]); // Uses the anchor already updated on the previous line
+
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+void CLGaussianPyramidVertKernel::run(const Window &window, cl::CommandQueue &queue) // Enqueues the kernel on queue once per 2D output slice, with the input window shifted by the configured load offset
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); // NOTE(review): the other run() methods in this patch spell this ICLKernel::window() - presumably the same accessor; confirm
+ ARM_COMPUTE_ERROR_ON(window.x().step() != 8); // The X step must be 8, the per-iteration element count set in configure()
+ ARM_COMPUTE_ERROR_ON(window.y().step() % 2); // The Y step must be even
+
+ Window win_in(window); // Input window: shifted along Y by _t2_load_offset, which configure() computed
+ win_in.shift(Window::DimY, _t2_load_offset);
+
+ Window win_out(window); // Output window: Y scaled by 0.5
+ win_out.scale(Window::DimY, 0.5f);
+
+ Window slice_in = win_in.first_slice_window_2D();
+ Window slice_out = win_out.first_slice_window_2D();
+
+ do // The enqueue range is the output slice
+ {
+ unsigned int idx = 0; // Kernel argument index restarts at 0 for every slice
+ add_2D_tensor_argument(idx, _input, slice_in);
+ add_2D_tensor_argument(idx, _output, slice_out);
+ enqueue(queue, *this, slice_out);
+ }
+ while(win_in.slide_window_slice_2D(slice_in) && win_out.slide_window_slice_2D(slice_out));
+}
diff --git a/src/core/CL/kernels/CLHOGDescriptorKernel.cpp b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
new file mode 100644
index 0000000000..87659c4ba9
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDescriptorKernel.cpp
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLHOGOrientationBinningKernel::CLHOGOrientationBinningKernel()
+ : _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_size()
+{
+}
+
+ /** Set up the HOG orientation binning kernel.
+  *
+  * Validates the tensors, builds the "hog_orientation_binning" OpenCL kernel with the cell geometry,
+  * bin count and phase scale passed as -D build options, and configures the execution window over the output.
+  *
+  * @param[in]  input_magnitude Single-channel S16 magnitude tensor.
+  * @param[in]  input_phase     Single-channel U8 phase tensor with the same X/Y dimensions as the magnitude.
+  * @param[out] output          F32 tensor with hog_info->num_bins() channels.
+  * @param[in]  hog_info        HOG meta-data (cell size, number of bins, phase type). Must not be nullptr.
+  */
+ void CLHOGOrientationBinningKernel::configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info)
+ {
+     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
+     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
+     ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
+     ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
+     ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));
+
+     _input_magnitude = input_magnitude;
+     _input_phase     = input_phase;
+     _output          = output;
+     _cell_size       = hog_info->cell_size();
+
+     // Scale that converts a stored phase value into a (fractional) bin index
+     float phase_scale = 0.0f;
+     if(PhaseType::SIGNED == hog_info->phase_type())
+     {
+         phase_scale = hog_info->num_bins() / 360.0f;
+         phase_scale *= 360.0f / 255.0f;
+     }
+     else
+     {
+         phase_scale = hog_info->num_bins() / 180.0f;
+         phase_scale *= 1.0f;
+     }
+
+     // Assemble the compile-time definitions for the OpenCL program
+     std::stringstream defines;
+     defines << "-DCELL_WIDTH=" << hog_info->cell_size().width << " "
+             << "-DCELL_HEIGHT=" << hog_info->cell_size().height << " "
+             << "-DNUM_BINS=" << hog_info->num_bins() << " "
+             << "-DPHASE_SCALE=" << phase_scale << " ";
+
+     const std::set<std::string> build_opts{ defines.str() };
+
+     // Create kernel
+     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_orientation_binning", build_opts));
+
+     // Per-iteration footprint: one output element, one input column spanning a whole cell height
+     constexpr unsigned int num_elems_processed_per_iteration = 1;
+     constexpr unsigned int num_elems_read_per_iteration      = 1;
+     constexpr unsigned int num_elems_written_per_iteration   = 1;
+     const unsigned int     num_rows_read_per_iteration       = hog_info->cell_size().height;
+
+     // Configure kernel window
+     Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+     AccessWindowRectangle  magnitude_access(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration);
+     AccessWindowRectangle  phase_access(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration);
+     AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+     update_window_and_padding(win, magnitude_access, phase_access, output_access);
+
+     // The whole output tensor is marked as valid
+     output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+     ICLKernel::configure(win);
+ }
+
+ /** Enqueue the orientation binning kernel for every 2D slice of the execution window.
+  *
+  * The magnitude/phase tensors are addressed through a window whose X/Y start is the execution
+  * window start multiplied by the cell size and whose step is the cell size.
+  *
+  * @param[in]     window Execution window; must be a valid sub-window of the configured one.
+  * @param[in,out] queue  Command queue the kernel is enqueued on.
+  */
+ void CLHOGOrientationBinningKernel::run(const Window &window, cl::CommandQueue &queue)
+ {
+     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+     // Slice used for both input tensors. It depends only on the execution window, so it is built once.
+     Window input_slice = window.first_slice_window_2D();
+     input_slice.set(Window::DimX, Window::Dimension(window.x().start() * _cell_size.width, window.x().start() * _cell_size.width, _cell_size.width));
+     input_slice.set(Window::DimY, Window::Dimension(window.y().start() * _cell_size.height, window.y().start() * _cell_size.height, _cell_size.height));
+
+     Window output_slice = window.first_slice_window_2D();
+     bool   more_slices  = true;
+     while(more_slices)
+     {
+         unsigned int arg_idx = 0;
+         add_2D_tensor_argument(arg_idx, _input_magnitude, input_slice);
+         add_2D_tensor_argument(arg_idx, _input_phase, input_slice);
+         add_2D_tensor_argument(arg_idx, _output, output_slice);
+
+         enqueue(queue, *this, output_slice);
+
+         more_slices = window.slide_window_slice_2D(output_slice);
+     }
+ }
+
+CLHOGBlockNormalizationKernel::CLHOGBlockNormalizationKernel()
+ : _input(nullptr), _output(nullptr), _num_cells_per_block_stride()
+{
+}
+
+ /** Set up the HOG block normalization kernel.
+  *
+  * Derives the block geometry (in cells) from @p hog_info, builds the "hog_block_normalization"
+  * OpenCL kernel with that geometry and the normalization settings as -D build options, and
+  * configures the execution window over the output.
+  *
+  * @param[in]  input    F32 tensor with hog_info->num_bins() channels.
+  * @param[out] output   F32 tensor with num_bins * (cells per block) channels.
+  * @param[in]  hog_info HOG meta-data (block size, block stride, cell size, normalization). Must not be nullptr.
+  */
+ void CLHOGBlockNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info)
+ {
+     ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
+     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
+     ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
+
+     // Block extent expressed in cells
+     const Size2D cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
+                                  hog_info->block_size().height / hog_info->cell_size().height);
+
+     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins() * cells_per_block.area(), DataType::F32);
+
+     // Block stride expressed in cells
+     const Size2D cells_per_stride(hog_info->block_stride().width / hog_info->cell_size().width,
+                                   hog_info->block_stride().height / hog_info->cell_size().height);
+
+     _input                      = input;
+     _output                     = output;
+     _num_cells_per_block_stride = cells_per_stride;
+
+     // Assemble the compile-time definitions for the OpenCL program
+     std::stringstream defines;
+     defines << "-DL2_HYST_THRESHOLD=" << hog_info->l2_hyst_threshold() << " "
+             << "-DNUM_CELLS_PER_BLOCK_HEIGHT=" << cells_per_block.height << " "
+             << "-DNUM_BINS_PER_BLOCK_X=" << cells_per_block.width * hog_info->num_bins() << " "
+             << "-DNUM_BINS_PER_BLOCK=" << _output->info()->num_channels() << " "
+             << "-DL2_NORM=" << static_cast<int>(HOGNormType::L2_NORM) << " "
+             << "-DL1_NORM=" << static_cast<int>(HOGNormType::L1_NORM) << " "
+             << "-DL2HYS_NORM=" << static_cast<int>(HOGNormType::L2HYS_NORM) << " "
+             << "-DHOG_NORM_TYPE=" << static_cast<int>(hog_info->normalization_type()) << " ";
+
+     const std::set<std::string> build_opts{ defines.str() };
+
+     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_block_normalization", build_opts));
+
+     // Per-iteration footprint: one element wide, one block (in cells) tall, for both reads and writes
+     constexpr unsigned int num_elems_processed_per_iteration = 1;
+     constexpr unsigned int num_elems_read_per_iteration      = 1;
+     constexpr unsigned int num_elems_written_per_iteration   = 1;
+     const unsigned int     num_rows_read_per_iteration       = cells_per_block.height;
+     const unsigned int     num_rows_written_per_iteration    = cells_per_block.height;
+
+     // Configure kernel window
+     Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+     AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration);
+     AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);
+
+     update_window_and_padding(win, input_access, output_access);
+
+     output_access.set_valid_region(win, input->info()->valid_region());
+
+     ICLKernel::configure(win);
+ }
+
+ /** Enqueue the block normalization kernel for every 2D slice of the execution window.
+  *
+  * The input tensor is addressed through the first 2D slice of the window with its X/Y steps
+  * replaced by the block stride expressed in cells.
+  *
+  * @param[in]     window Execution window; must be a valid sub-window of the configured one.
+  * @param[in,out] queue  Command queue the kernel is enqueued on.
+  */
+ void CLHOGBlockNormalizationKernel::run(const Window &window, cl::CommandQueue &queue)
+ {
+     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+     // Slice used for the input tensor. It depends only on the execution window, so it is built once.
+     Window input_slice = window.first_slice_window_2D();
+     input_slice.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
+     input_slice.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height);
+
+     Window output_slice = window.first_slice_window_2D();
+     bool   more_slices  = true;
+     while(more_slices)
+     {
+         unsigned int arg_idx = 0;
+         add_2D_tensor_argument(arg_idx, _input, input_slice);
+         add_2D_tensor_argument(arg_idx, _output, output_slice);
+
+         enqueue(queue, *this, output_slice);
+
+         more_slices = window.slide_window_slice_2D(output_slice);
+     }
+ }
diff --git a/src/core/CL/kernels/CLHOGDetectorKernel.cpp b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
new file mode 100644
index 0000000000..0f9a98950d
--- /dev/null
+++ b/src/core/CL/kernels/CLHOGDetectorKernel.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLHOG.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+CLHOGDetectorKernel::CLHOGDetectorKernel()
+ : _input(nullptr), _detection_windows(), _num_detection_windows(nullptr)
+{
+}
+
+ /** Set up the HOG detector kernel.
+  *
+  * Validates the arguments, builds the "hog_detector" OpenCL kernel with the descriptor layout,
+  * threshold, class index and detection window geometry as -D build options, binds the HOG
+  * descriptor buffer and the two detection-window buffers as static kernel arguments, and
+  * configures an execution window that steps by the detection window stride (in block-stride units).
+  *
+  * @param[in]  input                   F32 tensor. Its valid region is read as the number of blocks along X and Y.
+  * @param[in]  hog                     HOG object providing the descriptor buffer and the HOG meta-data. Must not be nullptr.
+  * @param[out] detection_windows       Array bound as a kernel argument. Must not be nullptr.
+  * @param[in]  num_detection_windows   OpenCL buffer bound as a kernel argument. Must not be nullptr.
+  * @param[in]  detection_window_stride Stride between detection windows; must be a multiple of the HOG block stride.
+  * @param[in]  threshold               Value forwarded to the kernel as -DTHRESHOLD.
+  * @param[in]  idx_class               Value forwarded to the kernel as -DIDX_CLASS.
+  */
+ void CLHOGDetectorKernel::configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride,
+ float threshold, uint16_t idx_class)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32);
+ ARM_COMPUTE_ERROR_ON(hog == nullptr);
+ ARM_COMPUTE_ERROR_ON(detection_windows == nullptr);
+ ARM_COMPUTE_ERROR_ON(num_detection_windows == nullptr);
+ ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0);
+ ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0);
+
+ const Size2D &detection_window_size = hog->info()->detection_window_size();
+ const Size2D &block_size = hog->info()->block_size();
+ const Size2D &block_stride = hog->info()->block_stride();
+
+ _input = input;
+ _detection_windows = detection_windows;
+ _num_detection_windows = num_detection_windows;
+
+ // Descriptor layout: bins per descriptor row and number of block rows per descriptor
+ const unsigned int num_bins_per_descriptor_x = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels();
+ const unsigned int num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1;
+
+ // The descriptor is expected to hold one extra value on top of the bins computed above
+ ARM_COMPUTE_ERROR_ON((num_bins_per_descriptor_x * num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size());
+
+ std::stringstream args_str;
+ args_str << "-DNUM_BLOCKS_PER_DESCRIPTOR_Y=" << num_blocks_per_descriptor_y << " ";
+ args_str << "-DNUM_BINS_PER_DESCRIPTOR_X=" << num_bins_per_descriptor_x << " ";
+ args_str << "-DTHRESHOLD=" << threshold << " ";
+ args_str << "-DMAX_NUM_DETECTION_WINDOWS=" << detection_windows->max_num_values() << " ";
+ args_str << "-DIDX_CLASS=" << idx_class << " ";
+ args_str << "-DBLOCK_STRIDE_WIDTH=" << block_stride.width << " ";
+ args_str << "-DBLOCK_STRIDE_HEIGHT=" << block_stride.height << " ";
+ args_str << "-DDETECTION_WINDOW_WIDTH=" << detection_window_size.width << " ";
+ args_str << "-DDETECTION_WINDOW_HEIGHT=" << detection_window_size.height << " ";
+
+ // Collect the build options passed to the OpenCL compiler
+ std::set<std::string> build_opts = {};
+ build_opts.insert(args_str.str());
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("hog_detector", build_opts));
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input parameters
+ _kernel.setArg(idx++, hog->cl_buffer());
+ _kernel.setArg(idx++, detection_windows->cl_buffer());
+ _kernel.setArg(idx++, *_num_detection_windows);
+
+ // Get the number of blocks along the x and y directions of the input tensor
+ const ValidRegion &valid_region = input->info()->valid_region();
+ const size_t num_blocks_x = valid_region.shape[0];
+ const size_t num_blocks_y = valid_region.shape[1];
+
+ // Get the number of blocks along the x and y directions of the detection window
+ const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width;
+ const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height;
+
+ // Detection window stride expressed in block-stride units
+ const size_t window_step_x = detection_window_stride.width / block_stride.width;
+ const size_t window_step_y = detection_window_stride.height / block_stride.height;
+
+ // Configure kernel window
+ // NOTE(review): the subtractions below are done in size_t. If the valid region holds fewer blocks than
+ // one detection window they wrap around instead of going negative - confirm callers guarantee this.
+ // NOTE(review): a zero detection_window_stride passes the modulo checks above and yields a zero step - confirm.
+ Window win;
+ win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x));
+ win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y));
+
+ // Each iteration reads a column one element wide and one descriptor (in block rows) tall
+ constexpr unsigned int num_elems_read_per_iteration = 1;
+ const unsigned int num_rows_read_per_iteration = num_blocks_per_descriptor_y;
+
+ update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration));
+
+ ICLKernel::configure(win);
+ }
+
+ /** Enqueue the HOG detector kernel for every 2D slice of the execution window.
+  *
+  * Only the input tensor arguments are set here; the remaining arguments are bound in configure().
+  *
+  * @param[in]     window Execution window; must be a valid sub-window of the configured one.
+  * @param[in,out] queue  Command queue the kernel is enqueued on.
+  */
+ void CLHOGDetectorKernel::run(const Window &window, cl::CommandQueue &queue)
+ {
+     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+     Window current_slice = window.first_slice_window_2D();
+     bool   more_slices   = true;
+     while(more_slices)
+     {
+         unsigned int arg_idx = 0;
+         add_2D_tensor_argument(arg_idx, _input, current_slice);
+
+         enqueue(queue, *this, current_slice);
+
+         more_slices = window.slide_window_slice_2D(current_slice);
+     }
+ }
diff --git a/src/core/CL/kernels/CLHarrisCornersKernel.cpp b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
new file mode 100644
index 0000000000..9fc34a7760
--- /dev/null
+++ b/src/core/CL/kernels/CLHarrisCornersKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+CLHarrisScoreKernel::CLHarrisScoreKernel()
+ : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(), _strength_thresh(), _norm_factor(), _border_size(0)
+{
+}
+
+ /** Border required by the kernel.
+  *
+  * @return The border stored by configure() (block_size / 2); a zero border before configuration.
+  */
+ BorderSize CLHarrisScoreKernel::border_size() const
+ {
+     return _border_size;
+ }
+
+ /** Set up the Harris score kernel.
+  *
+  * Validates the images, selects the "harris_score_<N>x<N>" OpenCL kernel matching @p block_size,
+  * binds the scalar parameters as static kernel arguments and configures the execution window,
+  * the access windows of the three images and the valid region of the output.
+  *
+  * @param[in]  input1           Single-channel S16 or S32 image.
+  * @param[in]  input2           Single-channel image with the same data type as @p input1.
+  * @param[out] output           Single-channel F32 image.
+  * @param[in]  block_size       Block size; must be 3, 5 or 7.
+  * @param[in]  norm_factor      Normalization factor; must not be 0.
+  * @param[in]  strength_thresh  Strength threshold forwarded to the kernel.
+  * @param[in]  sensitivity      Sensitivity forwarded to the kernel.
+  * @param[in]  border_undefined True if the border mode is undefined.
+  */
+ void CLHarrisScoreKernel::configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output,
+                                     int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
+                                     bool border_undefined)
+ {
+     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
+     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
+     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
+     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
+     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
+     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
+     ARM_COMPUTE_ERROR_ON(!(block_size == 3 || block_size == 5 || block_size == 7));
+     ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
+
+     _input1          = input1;
+     _input2          = input2;
+     _output          = output;
+     _sensitivity     = sensitivity;
+     _strength_thresh = strength_thresh;
+     _norm_factor     = norm_factor;
+     _border_size     = BorderSize(block_size / 2);
+
+     // Select kernel
+     std::stringstream harris_score_kernel_name;
+     harris_score_kernel_name << "harris_score_" << block_size << "x" << block_size;
+
+     // Create build options
+     std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type())) };
+
+     // Create kernel
+     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(harris_score_kernel_name.str(), build_opts));
+
+     // Set static kernel arguments
+     unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+     _kernel.setArg(idx++, sensitivity);
+     _kernel.setArg(idx++, strength_thresh);
+     _kernel.setArg(idx++, norm_factor);
+
+     // Configure kernel window
+     constexpr unsigned int num_elems_processed_per_iteration = 4;
+     constexpr unsigned int num_elems_written_per_iteration   = 4;
+     // The access window is anchored at (-border, -border), so it has to span the whole block:
+     // block_size rows, and (processed + block_size - 1) columns. That is 10 columns for a 7x7 block;
+     // the previous width of 8 is kept for the smaller blocks so their access region never shrinks.
+     const unsigned int num_elems_read_per_iteration = (block_size == 7) ? 10 : 8;
+     const unsigned int num_rows_read_per_iteration  = static_cast<unsigned int>(block_size);
+
+     Window win = calculate_max_window(*_input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+
+     AccessWindowRectangle  input1_access(input1->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+     AccessWindowRectangle  input2_access(input2->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
+     AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+     update_window_and_padding(win, input1_access, input2_access, output_access);
+
+     // The output is valid only where both inputs are valid
+     ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region());
+     output_access.set_valid_region(win, valid_region, border_undefined, border_size());
+
+     ICLKernel::configure(win);
+ }
+
+ /** Enqueue the Harris score kernel for every 2D slice of the execution window.
+  *
+  * The same slice addresses both inputs and the output; the scalar arguments are bound in configure().
+  *
+  * @param[in]     window Execution window; must be a valid sub-window of the configured one.
+  * @param[in,out] queue  Command queue the kernel is enqueued on.
+  */
+ void CLHarrisScoreKernel::run(const Window &window, cl::CommandQueue &queue)
+ {
+     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+     Window current_slice = window.first_slice_window_2D();
+     bool   more_slices   = true;
+     while(more_slices)
+     {
+         unsigned int arg_idx = 0;
+         add_2D_tensor_argument(arg_idx, _input1, current_slice);
+         add_2D_tensor_argument(arg_idx, _input2, current_slice);
+         add_2D_tensor_argument(arg_idx, _output, current_slice);
+         enqueue(queue, *this, current_slice);
+
+         more_slices = window.slide_window_slice_2D(current_slice);
+     }
+ }
diff --git a/src/core/CL/kernels/CLHistogramKernel.cpp b/src/core/CL/kernels/CLHistogramKernel.cpp
new file mode 100644
index 0000000000..87ee5fb74e
--- /dev/null
+++ b/src/core/CL/kernels/CLHistogramKernel.cpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLDistribution1D.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstring>
+#include <string>
+
+using namespace arm_compute;
+
+ // Number of pixels along X covered by one step of the main histogram kernel window.
+ // Images narrower than this are skipped by CLHistogramKernel.
+ constexpr signed int pixels_per_item = 16;
+
+ // Local work-group size in the X dimension used when enqueuing the main histogram kernel
+ constexpr unsigned int local_x_size = 16;
+
+CLHistogramKernel::CLHistogramKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+ /** Set up the main histogram kernel, which covers the part of each row that is a multiple of pixels_per_item wide.
+  *
+  * Selects "hist_local_kernel_fixed" when the distribution has 256 bins, window 1, offset 0 and
+  * offset + range equal to 256, and "hist_local_kernel" otherwise. Only the generic kernel receives
+  * the bin/offset/range arguments.
+  * If the image is narrower than pixels_per_item the function returns after storing the pointers,
+  * without creating the kernel or configuring a window.
+  *
+  * @param[in]  input  Single-channel U8 image.
+  * @param[out] output Distribution bound as a kernel argument. Must not be nullptr.
+  */
+ void CLHistogramKernel::configure(const ICLImage *input, ICLDistribution1D *output)
+ {
+ ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+ ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+ // Check input size
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+ // Check offset
+ ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range.");
+
+ // Check range
+ ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range.");
+
+ _input = input;
+ _output = output;
+
+ // Nothing for this kernel to do when a row is narrower than one window step
+ if(_input->info()->dimension(0) < pixels_per_item)
+ {
+ return;
+ }
+
+ unsigned int num_bins = _output->num_bins();
+ unsigned int window_size = _output->window();
+ unsigned int offset = _output->offset();
+ unsigned int range = _output->range();
+ unsigned int offrange = offset + range;
+ unsigned int bin_size = _output->size();
+ // NOTE(review): bin_size comes from size(), which run() also passes to memset as a byte count, so the "+ 1"
+ // adds a single byte rather than one extra bin - confirm against the OpenCL kernel's local buffer layout.
+ unsigned int buffer_size = bin_size + 1; // We need one extra place for pixels that don't meet the conditions
+
+ // Create kernel
+ bool is_fixed_size = (256 == num_bins) && (1 == window_size) && (0 == offset) && (256 == offrange);
+ std::string kernel_name = is_fixed_size ? "hist_local_kernel_fixed" : "hist_local_kernel";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+ // Set static kernel arguments
+ unsigned int idx = num_arguments_per_2D_tensor(); // Skip the input image arguments, which are set per slice in run()
+ _kernel.setArg(idx++, buffer_size, nullptr); // size with a null pointer: no host data is passed for this argument
+ _kernel.setArg(idx++, _output->cl_buffer());
+ if(!is_fixed_size)
+ {
+ _kernel.setArg<cl_uint>(idx++, num_bins);
+ _kernel.setArg<cl_uint>(idx++, offset);
+ _kernel.setArg<cl_uint>(idx++, range);
+ _kernel.setArg<cl_uint>(idx++, offrange);
+ }
+
+ // We only run histogram on Image, therefore only 2 dimensions here
+ // Round the width down to a multiple of pixels_per_item
+ unsigned int end_position = (_input->info()->dimension(0) / pixels_per_item) * pixels_per_item;
+
+ // Configure kernel window
+ Window win;
+ win.set(0, Window::Dimension(0, end_position, pixels_per_item));
+ win.set(1, Window::Dimension(0, _input->info()->dimension(1)));
+
+ update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, pixels_per_item));
+
+ ICLKernel::configure(win);
+ }
+
+ /** Clear the output distribution and enqueue the main histogram kernel for every 2D slice of the window.
+  *
+  * When the image is narrower than pixels_per_item, configure() returns before creating the kernel or
+  * configuring a window. That case is detected first so that the "unconfigured kernel" check does not
+  * reject it; the distribution is still cleared, and only the enqueue is skipped.
+  *
+  * @param[in]     window Execution window; must match the configured window (not checked for narrow images).
+  * @param[in,out] queue  Command queue used to map/unmap the distribution and to enqueue the kernel.
+  */
+ void CLHistogramKernel::run(const Window &window, cl::CommandQueue &queue)
+ {
+     // True when configure() stored the pointers but deliberately skipped kernel creation
+     const bool skip_kernel = (_input != nullptr) && (_input->info()->dimension(0) < pixels_per_item);
+
+     if(!skip_kernel)
+     {
+         ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+         ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+     }
+
+     // Reset every bin. This is done for narrow images too; otherwise counts from a previous run would stay in the buffer.
+     _output->map(queue, true);
+     ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+     memset(_output->buffer(), 0, _output->size());
+     _output->unmap(queue);
+
+     if(skip_kernel)
+     {
+         return;
+     }
+
+     Window      slice = window.first_slice_window_2D();
+     cl::NDRange lws   = cl::NDRange(local_x_size, 1);
+
+     do
+     {
+         /* Run the core part, whose width is a multiple of pixels_per_item */
+         unsigned int idx = 0;
+         add_2D_tensor_argument(idx, _input, slice);
+
+         enqueue(queue, *this, slice, lws);
+     }
+     while(window.slide_window_slice_2D(slice));
+ }
+
+CLHistogramBorderKernel::CLHistogramBorderKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+ /** Set up the border histogram kernel, which covers the trailing columns left over by the main kernel.
+  *
+  * The window starts at the image width rounded down to a multiple of pixels_per_item. If nothing is
+  * left over, the function returns without storing the pointers or creating a kernel.
+  * Selects "hist_border_kernel_fixed" when the distribution has 256 bins, window 1, offset 0 and
+  * offset + range equal to 256, and "hist_border_kernel" otherwise.
+  *
+  * @param[in]  input  Single-channel U8 image.
+  * @param[out] output Distribution bound as a kernel argument. Must not be nullptr.
+  */
+ void CLHistogramBorderKernel::configure(const ICLImage *input, ICLDistribution1D *output)
+ {
+     ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+     ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+     // Check input size
+     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+     // Check offset
+     ARM_COMPUTE_ERROR_ON_MSG(0 > output->offset() || output->offset() > 256, "Offset is larger than the image value range.");
+
+     // Check range
+     ARM_COMPUTE_ERROR_ON_MSG(output->range() > 256 /* max range */, "Range larger than the image value range.");
+
+     // First column not covered by the main kernel (width rounded down to a multiple of pixels_per_item)
+     unsigned int first_border_column = (input->info()->dimension(0) / pixels_per_item) * pixels_per_item;
+
+     if(first_border_column >= input->info()->dimension(0))
+     {
+         return; // no need to run histogram border kernel
+     }
+
+     _input  = input;
+     _output = output;
+
+     unsigned int bins         = _output->num_bins();
+     unsigned int bin_window   = _output->window();
+     unsigned int first_value  = _output->offset();
+     unsigned int value_range  = _output->range();
+     unsigned int end_of_range = first_value + value_range;
+
+     // Create kernel
+     bool uses_fixed_kernel = (256 == bins) && (1 == bin_window) && (0 == first_value) && (256 == end_of_range);
+
+     std::string kernel_name;
+     if(uses_fixed_kernel)
+     {
+         kernel_name = "hist_border_kernel_fixed";
+     }
+     else
+     {
+         kernel_name = "hist_border_kernel";
+     }
+     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+     // Set static kernel arguments, placed after the input image arguments that run() sets per slice
+     unsigned int arg_idx = num_arguments_per_2D_tensor();
+     _kernel.setArg(arg_idx++, _output->cl_buffer());
+     if(!uses_fixed_kernel)
+     {
+         _kernel.setArg<cl_uint>(arg_idx++, bins);
+         _kernel.setArg<cl_uint>(arg_idx++, first_value);
+         _kernel.setArg<cl_uint>(arg_idx++, value_range);
+         _kernel.setArg<cl_uint>(arg_idx++, end_of_range);
+     }
+
+     // Configure kernel window: leftover columns along X, every row along Y
+     Window win;
+     win.set(0, Window::Dimension(first_border_column, _input->info()->dimension(0)));
+     win.set(1, Window::Dimension(0, _input->info()->dimension(1)));
+     update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, 1));
+     ICLKernel::configure(win);
+ }
+
+ /** Enqueue the border histogram kernel for every 2D slice of the execution window.
+  *
+  * Does nothing when there is no border to process: either configure() returned early
+  * (which leaves _input unset and no kernel created) or the window is empty along X.
+  *
+  * @param[in]     window Execution window; must match the configured window.
+  * @param[in,out] queue  Command queue the kernel is enqueued on.
+  */
+ void CLHistogramBorderKernel::run(const Window &window, cl::CommandQueue &queue)
+ {
+     // Without the _input check a skipped configure() would lead to a null tensor being dereferenced below
+     if(_input == nullptr || window.x().start() >= window.x().end())
+     {
+         return;
+     }
+
+     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+     cl::NDRange lws = cl::NDRange(1, 1);
+
+     Window slice = window.first_slice_window_2D();
+
+     do
+     {
+         /* Run the border part, i.e. the trailing columns beyond the last multiple of pixels_per_item */
+         unsigned int idx = 0;
+         add_2D_tensor_argument(idx, _input, slice);
+
+         enqueue(queue, *this, slice, lws);
+     }
+     while(window.slide_window_slice_2D(slice));
+ }
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
new file mode 100644
index 0000000000..8c0fe26666
--- /dev/null
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cmath>
+#include <tuple>
+
+using namespace arm_compute;
+
+/** Default constructor: no tensors attached and no run function selected until configure() is called. */
+CLIm2ColKernel::CLIm2ColKernel()
+    : _input(nullptr),
+      _output(nullptr),
+      _convolved_dims(),
+      _conv_info(),
+      _kernel_size(0),
+      _num_elems_processed_per_iteration(1),
+      _run_func(nullptr)
+{
+}
+
+/** Configure the im2col kernel.
+ *
+ * Selects the "im2col_reduced" OpenCL kernel when the output's first dimension equals the product of the
+ * input's first three dimensions, the remaining input dimensions (from index 3) match the output dimensions
+ * (from index 1), the stride is 1x1 and the padding is 0x0. Otherwise "im2col_generic" is used and its
+ * static arguments are set here.
+ *
+ * @param[in]  input          Source tensor. Must be single-channel F32.
+ * @param[out] output         Destination tensor. Must be single-channel F32. Its valid region is set to its full shape.
+ * @param[in]  convolved_dims Dimensions of the convolved output; the first element is passed to the generic kernel
+ *                            as the output width. Only stored/used on the generic path.
+ * @param[in]  conv_info      Padding and stride information.
+ * @param[in]  has_bias       When true the kernel is built with -DHAS_BIAS.
+ */
+void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+
+    _input  = input;
+    _output = output;
+
+    // Create kernel
+    std::set<std::string> build_opts;
+    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    if(has_bias)
+    {
+        // Only add the define when it is needed, so that no empty option ends up in the build option set
+        build_opts.emplace("-DHAS_BIAS");
+    }
+
+    int pad_x    = 0;
+    int pad_y    = 0;
+    int stride_x = 0;
+    int stride_y = 0;
+    std::tie(pad_x, pad_y)       = conv_info.pad();
+    std::tie(stride_x, stride_y) = conv_info.stride();
+
+    // The reduced kernel can be used when the output row is exactly the flattened first three input dimensions,
+    // the higher dimensions match, and neither stride nor padding alter the layout
+    const bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4)
+                                     && (std::equal(input->info()->tensor_shape().cbegin() + 3,
+                                                    input->info()->tensor_shape().cend(),
+                                                    output->info()->tensor_shape().cbegin() + 1))
+                                     && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0));
+
+    if(!run_img2col_reduced)
+    {
+        _convolved_dims = convolved_dims;
+        _conv_info      = conv_info;
+        // Kernel side length recovered from the output row length: (dim0 - bias) / input depth == kernel_size^2
+        _kernel_size                       = std::sqrt((output->info()->dimension(0) - (has_bias ? 1 : 0)) / input->info()->dimension(2));
+        _num_elems_processed_per_iteration = output->info()->dimension(0);
+
+        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_generic", build_opts));
+
+        // Create static kernel arguments
+        const cl_int2 input_dims =
+        {
+            {
+                static_cast<cl_int>(input->info()->dimension(0)),
+                static_cast<cl_int>(input->info()->dimension(1)),
+            }
+        };
+        const cl_int2 strides =
+        {
+            {
+                stride_x,
+                stride_y,
+            }
+        };
+        const cl_int2 paddings =
+        {
+            {
+                pad_x,
+                pad_y,
+            }
+        };
+
+        // Set static kernel arguments; the leading slots are reserved for the tensor arguments added in run_generic()
+        unsigned int idx = num_arguments_per_2D_tensor() + num_arguments_per_3D_tensor();
+        _kernel.setArg<cl_int>(idx++, _kernel_size);
+        _kernel.setArg<cl_int>(idx++, input->info()->dimension(2) /* depth */);
+        _kernel.setArg<cl_int>(idx++, _convolved_dims.first /* output width */);
+        _kernel.setArg<cl_int2>(idx++, input_dims);
+        _kernel.setArg<cl_int2>(idx++, strides);
+        _kernel.setArg<cl_int2>(idx++, paddings);
+
+        _run_func = &CLIm2ColKernel::run_generic;
+    }
+    else
+    {
+        _num_elems_processed_per_iteration = 1;
+        _kernel                            = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("im2col_reduced", build_opts));
+        _run_func                          = &CLIm2ColKernel::run_reduced;
+    }
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps());
+    // The CLIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    ICLKernel::configure(win);
+}
+
+/** Run the kernel by dispatching to the implementation selected in configure().
+ *
+ * @param[in]     window Region to execute, forwarded unchanged.
+ * @param[in,out] queue  Command queue, forwarded unchanged.
+ */
+void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON(_run_func == nullptr);
+
+    // _run_func points at either run_generic() or run_reduced()
+    const auto selected_impl = _run_func;
+    (this->*selected_impl)(window, queue);
+}
+
+/** Enqueue the generic im2col kernel.
+ *
+ * Three windows are advanced in lock-step: the execution slice (convolved width x height, one plane at a
+ * time), the input slice (X/Y/Z collapsed to zero, as those dimensions are walked inside the kernel) and the
+ * output slice (one full output row per step).
+ *
+ * @param[in]     window Region to execute. Must match the window stored at configuration time.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLIm2ColKernel::run_generic(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    // Padding and stride were already passed to the kernel as static arguments in configure(),
+    // so they do not need to be unpacked again here.
+
+    // Get initial windows
+    Window slice     = window.first_slice_window_3D();
+    Window slice_in  = window.first_slice_window_3D();
+    Window slice_out = window.first_slice_window_3D();
+
+    // Setup slice
+    slice.set(Window::DimX, Window::Dimension(0, static_cast<int>(_convolved_dims.first), 1));
+    slice.set(Window::DimY, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
+    slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    // Setup input slice
+    // The first three dimensions of the input are increased by the inner loops
+    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Setup output slice
+    slice_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _num_elems_processed_per_iteration));
+    slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
+    slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    do
+    {
+        // Set inputs
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice_in);
+        add_2D_tensor_argument(idx, _output, slice_out);
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out) && window.slide_window_slice_3D(slice_in));
+}
+
+/** Enqueue the reduced im2col kernel.
+ *
+ * The input is walked in 3D slices of @p window while the output is walked in 1D slices of a window
+ * spanning the whole output tensor.
+ *
+ * @param[in]     window Region to execute. Must match the window stored at configuration time.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLIm2ColKernel::run_reduced(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    // Window covering the full output tensor
+    Window output_window;
+    output_window.use_tensor_dimensions(_output->info());
+
+    Window output_row   = output_window.first_slice_window_1D();
+    Window input_volume = window.first_slice_window_3D();
+
+    // Run kernel
+    do
+    {
+        // Tensor arguments first, then the input width and height
+        unsigned int arg_idx = 0;
+        add_3D_tensor_argument(arg_idx, _input, input_volume);
+        add_1D_tensor_argument(arg_idx, _output, output_row);
+
+        _kernel.setArg<cl_uint>(arg_idx++, _input->info()->dimension(0));
+        _kernel.setArg<cl_uint>(arg_idx++, _input->info()->dimension(1));
+        enqueue(queue, *this, input_volume);
+    }
+    while(window.slide_window_slice_3D(input_volume) && output_window.slide_window_slice_1D(output_row));
+}
diff --git a/src/core/CL/kernels/CLIntegralImageKernel.cpp b/src/core/CL/kernels/CLIntegralImageKernel.cpp
new file mode 100644
index 0000000000..69ede457df
--- /dev/null
+++ b/src/core/CL/kernels/CLIntegralImageKernel.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cstddef>
+
+using namespace arm_compute;
+
+/** Configure the horizontal pass of the integral image ("integral_horizontal" kernel).
+ *
+ * @param[in]  input  Source tensor. Must be single-channel U8.
+ * @param[out] output Destination tensor. Must be single-channel U32.
+ */
+void CLIntegralImageHorKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
+
+    _input  = input;
+    _output = output;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("integral_horizontal"));
+
+    // One iteration covers a whole row; the accessed span is that length rounded up to a multiple of 16
+    const unsigned int row_length     = input->info()->dimension(0);
+    const unsigned int accessed_elems = ceil_to_multiple(row_length, 16);
+
+    Window win = calculate_max_window(*input->info(), Steps(row_length));
+
+    AccessWindowHorizontal output_access(output->info(), 0, accessed_elems);
+    AccessWindowHorizontal input_access(input->info(), 0, accessed_elems);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    // The output inherits the valid region of the input
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+// Default constructor: the in/out tensor pointer starts as nullptr and is assigned by configure().
+CLIntegralImageVertKernel::CLIntegralImageVertKernel()
+ : _in_out(nullptr)
+{
+}
+
+/** Configure the vertical pass of the integral image ("integral_vertical" kernel).
+ *
+ * @param[in,out] in_out Tensor read and written by the kernel. Must be single-channel U32.
+ */
+void CLIntegralImageVertKernel::configure(ICLTensor *in_out)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(in_out, 1, DataType::U32);
+
+    _in_out = in_out;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("integral_vertical"));
+
+    // A single iteration spans 8 elements along X and the full extent of the tensor along Y
+    constexpr unsigned int step_x = 8;
+    const unsigned int     step_y = in_out->info()->dimension(Window::DimY);
+
+    Window win = calculate_max_window(*in_out->info(), Steps(step_x, step_y));
+
+    AccessWindowRectangle in_out_access(in_out->info(), 0, 0, step_x, step_y);
+
+    update_window_and_padding(win, in_out_access);
+
+    // The tensor keeps its current valid region
+    in_out_access.set_valid_region(win, in_out->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the vertical integral image kernel, one 2D slice of @p window at a time.
+ *
+ * @param[in]     window Region to execute. Must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLIntegralImageVertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Extent of the tensor along dimension 1, passed to the kernel after the tensor argument
+    const size_t num_rows = _in_out->info()->dimension(1);
+
+    Window plane = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _in_out, plane);
+        _kernel.setArg<cl_uint>(arg_idx++, num_rows);
+        enqueue(queue, *this, plane);
+    }
+    while(window.slide_window_slice_2D(plane));
+}
diff --git a/src/core/CL/kernels/CLLKTrackerKernel.cpp b/src/core/CL/kernels/CLLKTrackerKernel.cpp
new file mode 100644
index 0000000000..12cdd0ec93
--- /dev/null
+++ b/src/core/CL/kernels/CLLKTrackerKernel.cpp
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLArray.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+
+using namespace arm_compute;
+
+/** Configure the LK tracker initialisation kernel for one pyramid level.
+ *
+ * On the last level (level == num_levels - 1) the "init_level_max" kernel is used, or
+ * "init_level_max_initial_estimate" when @p use_initial_estimate is set; on every other level the plain
+ * "init_level" kernel is used. All kernel arguments are bound here.
+ *
+ * @param[in]  old_points           Key point array. Must not be nullptr. Its number of values defines the window size.
+ * @param[in]  new_points_estimates Key point estimates. Only read on the last level when @p use_initial_estimate is
+ *                                  true, in which case it must not be nullptr.
+ * @param[out] old_points_internal  Internal key point array. Must not be nullptr.
+ * @param[out] new_points_internal  Internal key point array. Must not be nullptr.
+ * @param[in]  use_initial_estimate Whether @p new_points_estimates is passed to the kernel on the last level.
+ * @param[in]  level                Pyramid level being configured.
+ * @param[in]  num_levels           Total number of pyramid levels.
+ * @param[in]  pyramid_scale        Scale factor between levels; the kernel receives pyramid_scale^level.
+ */
+void CLLKTrackerInitKernel::configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
+                                      ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
+                                      bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale)
+
+{
+    ARM_COMPUTE_ERROR_ON(old_points == nullptr);
+    ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr);
+    ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+
+    // Both the kernel name and the argument list depend on whether this is the last level
+    const bool is_last_level = (level == (num_levels - 1));
+
+    // The estimates array is only dereferenced on the last level and only when requested
+    ARM_COMPUTE_ERROR_ON(is_last_level && use_initial_estimate && (new_points_estimates == nullptr));
+
+    const float scale = std::pow(pyramid_scale, level);
+
+    // Create kernel
+    std::string kernel_name = "init_level";
+    if(is_last_level)
+    {
+        kernel_name += (use_initial_estimate) ? std::string("_max_initial_estimate") : std::string("_max");
+    }
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // Set static kernel arguments
+    unsigned int idx = 0;
+    if(is_last_level)
+    {
+        _kernel.setArg(idx++, old_points->cl_buffer());
+        if(use_initial_estimate)
+        {
+            _kernel.setArg(idx++, new_points_estimates->cl_buffer());
+        }
+    }
+    _kernel.setArg(idx++, old_points_internal->cl_buffer());
+    _kernel.setArg(idx++, new_points_internal->cl_buffer());
+    _kernel.setArg<cl_float>(idx++, scale);
+
+    // Configure kernel window: one step along X per key point
+    Window window;
+    window.set(Window::DimX, Window::Dimension(0, old_points->num_values(), 1));
+    window.set(Window::DimY, Window::Dimension(0, 1, 1));
+    ICLKernel::configure(window);
+}
+
+// Enqueues the initialisation kernel over the given window in a single call.
+// No arguments are set here: all of them are bound in configure().
+void CLLKTrackerInitKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ enqueue(queue, *this, window);
+}
+
+/** Configure the LK tracker "finalize" kernel.
+ *
+ * @param[in]  new_points_internal Internal key point array bound as the first kernel argument. Must not be nullptr.
+ *                                 Its number of values defines the window size.
+ * @param[out] new_points          Key point array bound as the second kernel argument. Must not be nullptr.
+ */
+void CLLKTrackerFinalizeKernel::configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points)
+
+{
+    ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+    ARM_COMPUTE_ERROR_ON(new_points == nullptr);
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("finalize"));
+
+    // Bind both buffers; nothing else needs to be set at run time
+    unsigned int arg_idx = 0;
+    _kernel.setArg(arg_idx++, new_points_internal->cl_buffer());
+    _kernel.setArg(arg_idx++, new_points->cl_buffer());
+
+    // One step along X per internal key point, a single step along Y
+    Window win;
+    win.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
+    win.set(Window::DimY, Window::Dimension(0, 1, 1));
+    ICLKernel::configure(win);
+}
+
+// Enqueues the "finalize" kernel over the given window in a single call.
+// No arguments are set here: both buffers are bound in configure().
+void CLLKTrackerFinalizeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ enqueue(queue, *this, window);
+}
+
+/** Default constructor: all three input tensor pointers start as nullptr and are assigned by configure(). */
+CLLKTrackerStage0Kernel::CLLKTrackerStage0Kernel()
+    : _old_input(nullptr),
+      _old_scharr_gx(nullptr),
+      _old_scharr_gy(nullptr)
+{
+}
+
+// Configures the "lktracker_stage0" kernel.
+//
+// Stores the three input tensors, builds a window with one step along X per element of
+// new_points_internal, requests padding covering the intersection of the inputs' valid regions, and
+// binds every kernel argument that follows the three 2D tensor arguments. The tensor arguments
+// themselves are bound in run().
+//
+// old_input           - must be a single-channel U8 tensor
+// old_scharr_gx / gy  - must be single-channel S16 tensors
+// old_points_internal, new_points_internal, coeff_table, old_ival
+//                     - arrays whose CL buffers are bound here; none may be nullptr
+// window_dimension    - passed to the kernel together with its square and its half
+// level               - only compared against 0 to derive the level0 flag
+void CLLKTrackerStage0Kernel::configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
+ ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
+ ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
+ size_t window_dimension, size_t level)
+
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gx, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gy, 1, DataType::S16);
+ ARM_COMPUTE_ERROR_ON(old_points_internal == nullptr);
+ ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+ ARM_COMPUTE_ERROR_ON(coeff_table == nullptr);
+ ARM_COMPUTE_ERROR_ON(old_ival == nullptr);
+
+ _old_input = old_input;
+ _old_scharr_gx = old_scharr_gx;
+ _old_scharr_gy = old_scharr_gy;
+
+ // Configure kernel window
+ // X covers [0, number of internal key points) with step 1; Y is a single step
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
+ window.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ // Region that is valid in all three inputs at once
+ const ValidRegion valid_region = intersect_valid_regions(
+ old_input->info()->valid_region(),
+ old_scharr_gx->info()->valid_region(),
+ old_scharr_gy->info()->valid_region());
+
+ // Every input is accessed over that same static rectangle
+ update_window_and_padding(window,
+ AccessWindowStatic(old_input->info(), valid_region.start(0), valid_region.start(1),
+ valid_region.end(0), valid_region.end(1)),
+ AccessWindowStatic(old_scharr_gx->info(), valid_region.start(0), valid_region.start(1),
+ valid_region.end(0), valid_region.end(1)),
+ AccessWindowStatic(old_scharr_gy->info(), valid_region.start(0), valid_region.start(1),
+ valid_region.end(0), valid_region.end(1)));
+
+ ICLKernel::configure(window);
+
+ // Initialize required variables
+ // level0 is 1 only for level 0, otherwise 0
+ const int level0 = (level == 0) ? 1 : 0;
+ const int window_size = window_dimension;
+ const int window_size_squared = window_dimension * window_dimension;
+ const int window_size_half = window_dimension / 2;
+ // eig_const = 1 / (2 * window_size^2)
+ const float eig_const = 1.0f / (2.0f * window_size_squared);
+ // Three limits derived from the valid region: end(0) and end(1), each reduced by the window
+ // size plus one, and start(0)
+ const cl_float3 border_limits =
+ {
+ {
+ // -1 because we load 2 values at once for bilinear interpolation
+ static_cast<cl_float>(valid_region.end(0) - window_size - 1),
+ static_cast<cl_float>(valid_region.end(1) - window_size - 1),
+ static_cast<cl_float>(valid_region.start(0))
+ }
+ };
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("lktracker_stage0"));
+
+ // Set arguments
+ // Start after the slots of the three 2D tensors, which are filled in by run()
+ unsigned int idx = 3 * num_arguments_per_2D_tensor();
+ _kernel.setArg(idx++, old_points_internal->cl_buffer());
+ _kernel.setArg(idx++, new_points_internal->cl_buffer());
+ _kernel.setArg(idx++, coeff_table->cl_buffer());
+ _kernel.setArg(idx++, old_ival->cl_buffer());
+ _kernel.setArg<cl_int>(idx++, window_size);
+ _kernel.setArg<cl_int>(idx++, window_size_squared);
+ _kernel.setArg<cl_int>(idx++, window_size_half);
+ _kernel.setArg<cl_float3>(idx++, border_limits);
+ _kernel.setArg<cl_float>(idx++, eig_const);
+ _kernel.setArg<cl_int>(idx++, level0);
+}
+
+/** Bind the three input tensors and enqueue the stage 0 kernel over @p window.
+ *
+ * @param[in]     window Region to execute. Must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLLKTrackerStage0Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Set static tensor arguments. Setting here as allocation might be deferred.
+    // The order below is the order of the kernel's leading arguments.
+    const ICLTensor *const inputs[] = { _old_input, _old_scharr_gx, _old_scharr_gy };
+
+    unsigned int arg_idx = 0;
+    for(const ICLTensor *tensor : inputs)
+    {
+        add_2D_tensor_argument(arg_idx, tensor, window);
+    }
+
+    enqueue(queue, *this, window);
+}
+
+// Default constructor: the input tensor pointer starts as nullptr and is assigned by configure().
+CLLKTrackerStage1Kernel::CLLKTrackerStage1Kernel()
+ : _new_input(nullptr)
+{
+}
+
+// Configures the "lktracker_stage1" kernel.
+//
+// Stores the input tensor, builds a window with one step along X per element of
+// new_points_internal, requests padding covering the input's valid region, and binds every kernel
+// argument that follows the 2D tensor argument. The tensor argument itself is bound in run().
+//
+// new_input           - must be a single-channel U8 tensor
+// new_points_internal, coeff_table, old_ival
+//                     - arrays whose CL buffers are bound here; none may be nullptr
+// termination         - translated into the two flags term_iteration / term_epsilon
+// epsilon, num_iterations - passed to the kernel as they are (num_iterations as cl_int)
+// window_dimension    - passed to the kernel together with its square and its half
+// level               - only compared against 0 to derive the level0 flag
+void CLLKTrackerStage1Kernel::configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
+ Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level)
+
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(new_input, 1, DataType::U8);
+ ARM_COMPUTE_ERROR_ON(new_points_internal == nullptr);
+ ARM_COMPUTE_ERROR_ON(coeff_table == nullptr);
+ ARM_COMPUTE_ERROR_ON(old_ival == nullptr);
+
+ _new_input = new_input;
+
+ // Configure kernel window
+ // X covers [0, number of internal key points) with step 1; Y is a single step
+ Window window;
+ window.set(Window::DimX, Window::Dimension(0, new_points_internal->num_values(), 1));
+ window.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ const ValidRegion &valid_region = new_input->info()->valid_region();
+
+ // The input is accessed over its whole valid region
+ update_window_and_padding(window,
+ AccessWindowStatic(new_input->info(), valid_region.start(0), valid_region.start(1),
+ valid_region.end(0), valid_region.end(1)));
+
+ ICLKernel::configure(window);
+
+ // Initialize required variables
+ // level0 is 1 only for level 0, otherwise 0
+ const int level0 = (level == 0) ? 1 : 0;
+ const int window_size = window_dimension;
+ const int window_size_squared = window_dimension * window_dimension;
+ const int window_size_half = window_dimension / 2;
+ // eig_const = 1 / (2 * window_size^2)
+ const float eig_const = 1.0f / (2.0f * window_size_squared);
+ // Three limits derived from the valid region: end(0) and end(1), each reduced by the window
+ // size plus one, and start(0)
+ const cl_float3 border_limits =
+ {
+ {
+ // -1 because we load 2 values at once for bilinear interpolation
+ static_cast<cl_float>(valid_region.end(0) - window_size - 1),
+ static_cast<cl_float>(valid_region.end(1) - window_size - 1),
+ static_cast<cl_float>(valid_region.start(0))
+ }
+ };
+ // TERM_CRITERIA_BOTH sets both flags; the other two criteria set exactly one flag each
+ const int term_iteration = (termination == Termination::TERM_CRITERIA_ITERATIONS || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
+ const int term_epsilon = (termination == Termination::TERM_CRITERIA_EPSILON || termination == Termination::TERM_CRITERIA_BOTH) ? 1 : 0;
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("lktracker_stage1"));
+
+ // Set static kernel arguments
+ // Start after the slots of the 2D input tensor, which are filled in by run()
+ unsigned int idx = num_arguments_per_2D_tensor();
+ _kernel.setArg(idx++, new_points_internal->cl_buffer());
+ _kernel.setArg(idx++, coeff_table->cl_buffer());
+ _kernel.setArg(idx++, old_ival->cl_buffer());
+ _kernel.setArg<cl_int>(idx++, window_size);
+ _kernel.setArg<cl_int>(idx++, window_size_squared);
+ _kernel.setArg<cl_int>(idx++, window_size_half);
+ _kernel.setArg<cl_int>(idx++, num_iterations);
+ _kernel.setArg<cl_float>(idx++, epsilon);
+ _kernel.setArg<cl_float3>(idx++, border_limits);
+ _kernel.setArg<cl_float>(idx++, eig_const);
+ _kernel.setArg<cl_int>(idx++, level0);
+ _kernel.setArg<cl_int>(idx++, term_iteration);
+ _kernel.setArg<cl_int>(idx++, term_epsilon);
+}
+
+/** Bind the input tensor and enqueue the stage 1 kernel over @p window.
+ *
+ * @param[in]     window Region to execute. Must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLLKTrackerStage1Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Set static tensor arguments. Setting here as allocation might be deferred.
+    // The input tensor occupies the kernel's leading argument slots.
+    unsigned int arg_idx = 0;
+    add_2D_tensor_argument(arg_idx, _new_input, window);
+
+    enqueue(queue, *this, window);
+}
diff --git a/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..794a1bc56e
--- /dev/null
+++ b/src/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor: both inputs and the output start as nullptr and are assigned by configure(). */
+CLLocallyConnectedMatrixMultiplyKernel::CLLocallyConnectedMatrixMultiplyKernel()
+    : _input0(nullptr),
+      _input1(nullptr),
+      _output(nullptr)
+{
+}
+
+/** Configure the locally connected matrix multiply kernel ("gemm_lc_vm_<data type>").
+ *
+ * @param[in]  input0 First input tensor. Must be single-channel F32; its dimension 0 must equal dimension 1 of @p input1.
+ * @param[in]  input1 Second input tensor. Must be single-channel F32.
+ * @param[out] output Destination tensor. Must be single-channel F32. Its valid region is set to its full shape.
+ */
+void CLLocallyConnectedMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+    ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1));
+
+    _input0 = input0;
+    _input1 = input1;
+    _output = output;
+
+    // Local work size hint: (1, 7) when dimension 1 of the output is 196, (8, 8) otherwise
+    const bool is_196_case = (output->info()->dimension(1) == 196);
+    _lws_hint              = is_196_case ? cl::NDRange(1, 7) : cl::NDRange(8, 8);
+
+    // The kernel is specialised at build time on the width of the first input
+    std::ostringstream width_define;
+    width_define << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " ";
+
+    std::set<std::string> build_opts;
+    build_opts.emplace(width_define.str());
+
+    // Create kernel
+    const std::string kernel_name = "gemm_lc_vm_" + lower_string(string_from_data_type(input0->info()->data_type()));
+    _kernel                       = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Configure window kernel
+    const unsigned int elems_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type());
+
+    Window win = calculate_max_window(*output->info(), Steps(elems_per_iteration_x));
+
+    AccessWindowRectangle input0_access(input0->info(), 0, 0, elems_per_iteration_x, 1);
+    AccessWindowRectangle input1_access(input1->info(), 0, 0, elems_per_iteration_x, 1);
+    AccessWindowRectangle output_access(output->info(), 0, 0, elems_per_iteration_x, 1);
+
+    update_window_and_padding(win, input0_access, input1_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the matrix multiply kernel, one 2D slice of @p window at a time.
+ *
+ * The second input is always bound through the first 3D slice of a window spanning its whole tensor;
+ * only the 2D slice shared by the first input and the output is advanced.
+ *
+ * @param[in]     window Region to execute. Must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLLocallyConnectedMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+
+    // Window covering the full second input
+    Window input1_window;
+    input1_window.use_tensor_dimensions(_input1->info());
+    Window input1_volume = input1_window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input0, plane);
+        add_3D_tensor_argument(arg_idx, _input1, input1_volume);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        enqueue(queue, *this, plane, _lws_hint);
+    }
+    while(window.slide_window_slice_2D(plane));
+}
diff --git a/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
new file mode 100644
index 0000000000..c504189169
--- /dev/null
+++ b/src/core/CL/kernels/CLMagnitudePhaseKernel.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor: no tensors attached and neither output enabled until configure() is called. */
+CLMagnitudePhaseKernel::CLMagnitudePhaseKernel()
+    : _gx(nullptr),
+      _gy(nullptr),
+      _magnitude(nullptr),
+      _phase(nullptr),
+      _run_mag(false),
+      _run_phase(false)
+{
+}
+
+// Configures the "magnitude_phase" kernel.
+//
+// gx, gy     - gradient tensors; must be single-channel S16 or S32 and share the same data type
+// magnitude  - optional output (may be nullptr); when given it must be single-channel S16/S32
+//              with the same data type as gx
+// phase      - optional output (may be nullptr); when given it must be single-channel U8
+// mag_type   - selects -DMAGNITUDE=1 (L1NORM) or -DMAGNITUDE=2 (L2NORM); only used if magnitude is given
+// phase_type - selects -DPHASE=1 (UNSIGNED) or -DPHASE=2 (SIGNED); only used if phase is given
+//
+// At least one of magnitude / phase must be provided.
+void CLMagnitudePhaseKernel::configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
+ MagnitudeType mag_type, PhaseType phase_type)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON((magnitude == nullptr) && (phase == nullptr));
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
+
+ // Each output is produced only if the caller supplied a tensor for it
+ _run_mag = (magnitude != nullptr);
+ _run_phase = (phase != nullptr);
+ if(_run_mag)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16, DataType::S32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, magnitude);
+ }
+ if(_run_phase)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
+ }
+
+ // Same condition as the ARM_COMPUTE_ERROR_ON above, reported through ARM_COMPUTE_ERROR.
+ // NOTE(review): presumably kept so the error is still raised when the *_ERROR_ON checks are compiled out - confirm.
+ if(!_run_mag && !_run_phase)
+ {
+ ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
+ }
+
+ _gx = gx;
+ _gy = gy;
+ _magnitude = magnitude;
+ _phase = phase;
+
+ // Construct kernel name
+ // (the kernel name itself is fixed; the variants are selected through the build options below)
+ std::set<std::string> build_opts = {};
+
+ // Add magnitude type
+ if(_run_mag)
+ {
+ switch(mag_type)
+ {
+ case MagnitudeType::L1NORM:
+ build_opts.insert("-DMAGNITUDE=1");
+ break;
+ case MagnitudeType::L2NORM:
+ build_opts.insert("-DMAGNITUDE=2");
+ break;
+ default:
+ // NOTE(review): if ARM_COMPUTE_ERROR does not return, the insert below is never reached - confirm.
+ ARM_COMPUTE_ERROR("Unsupported magnitude calculation type.");
+ build_opts.insert("-DMAGNITUDE=0");
+ break;
+ }
+ }
+
+ // Add phase type
+ if(_run_phase)
+ {
+ switch(phase_type)
+ {
+ case PhaseType::UNSIGNED:
+ build_opts.insert("-DPHASE=1");
+ break;
+ case PhaseType::SIGNED:
+ build_opts.insert("-DPHASE=2");
+ break;
+ default:
+ // NOTE(review): if ARM_COMPUTE_ERROR does not return, the insert below is never reached - confirm.
+ ARM_COMPUTE_ERROR("Unsupported phase calculation type.");
+ build_opts.insert("-DPHASE=0");
+ break;
+ }
+ }
+
+ // Add data_type
+ build_opts.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(gx->info()->data_type()));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("magnitude_phase", build_opts));
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
+
+ // A missing output is represented by an access window built on a nullptr tensor info
+ AccessWindowHorizontal gx_access(gx->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal gy_access(gy->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win,
+ gx_access, gy_access,
+ output_magnitude_access, output_phase_access);
+
+ // Both outputs get the region that is valid in gx and gy at once
+ ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
+ gy->info()->valid_region());
+ output_magnitude_access.set_valid_region(win, valid_region);
+ output_phase_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+/** Enqueue the magnitude/phase kernel, one 2D slice of @p window at a time.
+ *
+ * The two gradient tensors are always bound; the magnitude and phase tensors are bound, in that order,
+ * only when the corresponding output was enabled in configure().
+ *
+ * @param[in]     window Region to execute. Must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLMagnitudePhaseKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+    do
+    {
+        // Inputs
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _gx, plane);
+        add_2D_tensor_argument(arg_idx, _gy, plane);
+
+        // Optional outputs
+        if(_run_mag)
+        {
+            add_2D_tensor_argument(arg_idx, _magnitude, plane);
+        }
+        if(_run_phase)
+        {
+            add_2D_tensor_argument(arg_idx, _phase, plane);
+        }
+
+        enqueue(queue, *this, plane);
+    }
+    while(window.slide_window_slice_2D(plane));
+}
diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
new file mode 100644
index 0000000000..b0b748f466
--- /dev/null
+++ b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+// Default constructor: every pointer member starts out null until configure() is called.
+CLMeanStdDevKernel::CLMeanStdDevKernel()
+    : _input(nullptr),
+      _mean(nullptr),
+      _stddev(nullptr),
+      _global_sum(nullptr),
+      _global_sum_squared(nullptr)
+{
+}
+
+// Prepare the "mean_stddev_accumulate" OpenCL kernel for a 2D, single-channel U8 image.
+//
+// input              Image to analyse.
+// mean               Host location that run() writes the mean to (must not be null).
+// global_sum         Device buffer used to accumulate the sum of the pixels (must not be null).
+// stddev             Optional host location for the standard deviation; null disables it.
+// global_sum_squared Device buffer accumulating the sum of squares; required when stddev is given.
+void CLMeanStdDevKernel::configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
+{
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(mean == nullptr);
+    ARM_COMPUTE_ERROR_ON(global_sum == nullptr);
+    ARM_COMPUTE_ERROR_ON((stddev != nullptr) && (global_sum_squared == nullptr));
+
+    _input              = input;
+    _mean               = mean;
+    _stddev             = stddev;
+    _global_sum         = global_sum;
+    _global_sum_squared = global_sum_squared;
+
+    const bool         with_stddev  = (stddev != nullptr);
+    const unsigned int image_height = input->info()->dimension(1);
+
+    // The STDDEV define is only passed when a standard deviation was requested
+    std::set<std::string> options;
+    if(with_stddev)
+    {
+        options.insert("-DSTDDEV");
+    }
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("mean_stddev_accumulate", options));
+
+    // Fixed arguments follow the slots reserved for the input image
+    unsigned int arg_idx = num_arguments_per_2D_tensor();
+    _kernel.setArg(arg_idx++, static_cast<cl_uint>(image_height));
+    _kernel.setArg(arg_idx++, *global_sum);
+    if(with_stddev)
+    {
+        _kernel.setArg(arg_idx++, *global_sum_squared);
+    }
+
+    // Each iteration covers 8 elements in X and the whole image height in Y
+    constexpr unsigned int step_x = 8;
+
+    Window                win = calculate_max_window(*input->info(), Steps(step_x, image_height));
+    AccessWindowRectangle input_access(input->info(), 0, 0, step_x, image_height);
+    update_window_and_padding(win, input_access);
+
+    ICLKernel::configure(win);
+}
+
+// Run the accumulation kernel over the window, then read the sums back and
+// derive the mean (and, when requested, the standard deviation) on the host.
+//
+// window Execution window; must be a valid sub-window of the configured one.
+// queue  Command queue used for the buffer resets, kernel enqueues and blocking read-backs.
+//
+// The final statistics are computed in double precision: the accumulators are
+// 64-bit integers and the variance is obtained as E[x^2] - mean^2, which loses
+// most of its significant digits in single precision when the image is close
+// to constant. A slightly negative variance caused by rounding is clamped to
+// zero so that the square root never yields NaN for valid input.
+void CLMeanStdDevKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Clear sums (non-blocking writes; the source value is static so it outlives the transfer)
+    static const cl_ulong zero = 0;
+    queue.enqueueWriteBuffer(*_global_sum, CL_FALSE, 0, sizeof(cl_ulong), &zero);
+
+    if(_stddev != nullptr)
+    {
+        queue.enqueueWriteBuffer(*_global_sum_squared, CL_FALSE, 0, sizeof(cl_ulong), &zero);
+    }
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, _input, slice);
+        // Set slice step equal to height to force gws[1] to 1,
+        // as each thread calculates the sum across all rows and columns equal to the number of elements processed by each work-item
+        slice.set_dimension_step(Window::DimY, _input->info()->dimension(1));
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_2D(slice));
+
+    // Calculate mean and stddev
+    cl_ulong     global_sum         = 0;
+    cl_ulong     global_sum_squared = 0;
+    const double num_pixels         = static_cast<double>(_input->info()->dimension(0)) * static_cast<double>(_input->info()->dimension(1));
+
+    // Blocking read: the value is needed on the host right away
+    queue.enqueueReadBuffer(*_global_sum, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum));
+    const double mean = static_cast<double>(global_sum) / num_pixels;
+    *_mean            = static_cast<float>(mean);
+
+    if(_stddev != nullptr)
+    {
+        queue.enqueueReadBuffer(*_global_sum_squared, CL_TRUE, 0, sizeof(cl_ulong), static_cast<void *>(&global_sum_squared));
+        const double variance = (static_cast<double>(global_sum_squared) / num_pixels) - (mean * mean);
+        // Rounding can push a (near-)zero variance below zero; clamp before taking the root
+        *_stddev = static_cast<float>(std::sqrt((variance < 0.0) ? 0.0 : variance));
+    }
+}
diff --git a/src/core/CL/kernels/CLMedian3x3Kernel.cpp b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
new file mode 100644
index 0000000000..95334c7b5f
--- /dev/null
+++ b/src/core/CL/kernels/CLMedian3x3Kernel.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+// The 3x3 neighbourhood needs a border of one element.
+BorderSize CLMedian3x3Kernel::border_size() const
+{
+    const BorderSize one_element_border(1);
+    return one_element_border;
+}
+
+// Configure the 3x3 median filter: builds "non_linear_filter_box3x3" with -DMEDIAN
+// and sets up the execution window for single-channel U8 input and output.
+//
+// input            Source tensor.
+// output           Destination tensor.
+// border_undefined Forwarded to the window and valid-region computation.
+void CLMedian3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    _input  = input;
+    _output = output;
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("non_linear_filter_box3x3", { "-DMEDIAN" }));
+
+    // Per-iteration footprint: 8 outputs produced from a 16x3 input patch
+    constexpr unsigned int step_x       = 8;
+    constexpr unsigned int write_width  = 8;
+    constexpr unsigned int read_width   = 16;
+    constexpr unsigned int read_height  = 3;
+
+    const BorderSize border = border_size();
+
+    Window win = calculate_max_window_horizontal(*input->info(), Steps(step_x), border_undefined, border);
+
+    // The input patch starts one border width to the left of / above the current position
+    AccessWindowRectangle  input_access(input->info(), -border.left, -border.top, read_width, read_height);
+    AccessWindowHorizontal output_access(output->info(), 0, write_width);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLMinMaxLocationKernel.cpp b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
new file mode 100644
index 0000000000..939a53b03a
--- /dev/null
+++ b/src/core/CL/kernels/CLMinMaxLocationKernel.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <climits>
+
+using namespace arm_compute;
+
+// Default constructor: no input attached, remaining members value-initialised.
+CLMinMaxKernel::CLMinMaxKernel()
+    : _input(nullptr),
+      _min_max(),
+      _data_type_max_min()
+{
+}
+
+// Configure the "minmax" OpenCL kernel for a 2D, single-channel U8 or S16 image.
+//
+// input   Image to scan.
+// min_max Device buffer that receives the minimum and maximum (must not be null).
+//
+// _data_type_max_min stores { largest representable value, smallest representable value }
+// of the input type; run() uploads it to min_max before the kernel executes.
+void CLMinMaxKernel::configure(const ICLImage *input, cl::Buffer *min_max)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_ERROR_ON(min_max == nullptr);
+
+    _input   = input;
+    _min_max = min_max;
+
+    // One iteration covers a whole row
+    const unsigned int row_width = input->info()->dimension(0);
+    const DataType     data_type = input->info()->data_type();
+
+    if(data_type == DataType::U8)
+    {
+        _data_type_max_min[0] = UCHAR_MAX;
+        _data_type_max_min[1] = 0;
+    }
+    else if(data_type == DataType::S16)
+    {
+        _data_type_max_min[0] = SHRT_MAX;
+        _data_type_max_min[1] = SHRT_MIN;
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("You called with the wrong image data types");
+    }
+
+    // Set kernel build options
+    const bool width_not_vector_multiple = (0 != (row_width % max_cl_vector_width));
+
+    std::set<std::string> options;
+    options.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    options.emplace("-DDATA_TYPE_MAX=" + val_to_string<int>(_data_type_max_min[0]));
+    options.emplace("-DDATA_TYPE_MIN=" + val_to_string<int>(_data_type_max_min[1]));
+    options.emplace(width_not_vector_multiple ? "-DNON_MULTIPLE_OF_16" : "");
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmax", options));
+
+    // Fixed arguments follow the slots reserved for the input image
+    unsigned int arg_idx = num_arguments_per_2D_tensor();
+    _kernel.setArg(arg_idx++, *_min_max);
+    _kernel.setArg<cl_uint>(arg_idx++, row_width);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input->info(), Steps(row_width));
+    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, row_width));
+    ICLKernel::configure(win);
+}
+
+// Upload the initial min/max values and enqueue the kernel for every 2D slice of the window.
+void CLMinMaxKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Reset minimum and maximum values (non-blocking write from the member array)
+    const size_t reset_bytes = _data_type_max_min.size() * sizeof(int);
+    queue.enqueueWriteBuffer(*_min_max, CL_FALSE /* blocking */, 0, reset_bytes, _data_type_max_min.data());
+
+    Window plane = window.first_slice_window_2D();
+
+    // The body always runs for the first slice, then repeats while the window can slide further
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(plane))
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, plane);
+        enqueue(queue, *this, plane);
+    }
+}
+
+// Default constructor: nothing is attached until configure() is called.
+CLMinMaxLocationKernel::CLMinMaxLocationKernel()
+    : _input(nullptr),
+      _min_max_count(nullptr)
+{
+}
+
+// Configure the "minmaxloc" OpenCL kernel for a 2D, single-channel U8 or S16 image.
+//
+// input         Image to scan.
+// min_max       Device buffer holding the minimum/maximum values (must not be null).
+// min_max_count Device buffer holding the two occurrence counters (must not be null).
+// min_loc       Optional array receiving the locations of the minima; null disables -DLOCATE_MIN.
+// max_loc       Optional array receiving the locations of the maxima; null disables -DLOCATE_MAX.
+//
+// min_max_count is dereferenced unconditionally, both below when it is bound as a
+// kernel argument and in run() when the counters are reset, so a null pointer is
+// rejected up front instead of being tolerated whenever a location array is supplied.
+void CLMinMaxLocationKernel::configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count, ICLCoordinates2DArray *min_loc, ICLCoordinates2DArray *max_loc)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_ERROR_ON(min_max == nullptr);
+    ARM_COMPUTE_ERROR_ON(min_max_count == nullptr);
+
+    _input         = input;
+    _min_max_count = min_max_count;
+
+    // Set kernel build options
+    std::set<std::string> build_opts;
+    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.emplace((min_max_count != nullptr) ? "-DCOUNT_MIN_MAX" : "");
+    build_opts.emplace((min_loc != nullptr) ? "-DLOCATE_MIN" : "");
+    build_opts.emplace((max_loc != nullptr) ? "-DLOCATE_MAX" : "");
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("minmaxloc", build_opts));
+
+    // Set static arguments; they follow the slots reserved for the input image
+    unsigned int idx = num_arguments_per_2D_tensor();
+    _kernel.setArg(idx++, *min_max);
+    _kernel.setArg(idx++, *min_max_count);
+    if(min_loc != nullptr)
+    {
+        _kernel.setArg(idx++, min_loc->cl_buffer());
+        _kernel.setArg<cl_uint>(idx++, min_loc->max_num_values());
+    }
+    if(max_loc != nullptr)
+    {
+        _kernel.setArg(idx++, max_loc->cl_buffer());
+        _kernel.setArg<cl_uint>(idx++, max_loc->max_num_values());
+    }
+
+    // Configure kernel window: one element per work-item
+    constexpr unsigned int num_elems_processed_per_iteration = 1;
+    Window                 win                               = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
+    ICLKernel::configure(win);
+}
+
+// Zero both counters in the count buffer, then enqueue the kernel for every 2D slice of the window.
+void CLMinMaxLocationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Non-blocking writes; the source value is static so it outlives the transfers
+    static const unsigned int zero_count = 0;
+    for(unsigned int slot = 0; slot < 2; ++slot)
+    {
+        queue.enqueueWriteBuffer(*_min_max_count, CL_FALSE, slot * sizeof(zero_count), sizeof(zero_count), &zero_count);
+    }
+
+    Window plane = window.first_slice_window_2D();
+
+    // The body always runs for the first slice, then repeats while the window can slide further
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(plane))
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, plane);
+        enqueue(queue, *this, plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLNonLinearFilterKernel.cpp b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
new file mode 100644
index 0000000000..6afa5822ba
--- /dev/null
+++ b/src/core/CL/kernels/CLNonLinearFilterKernel.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+// Default constructor: the border starts at zero and is set from the mask size in configure().
+CLNonLinearFilterKernel::CLNonLinearFilterKernel()
+    : _border_size(0)
+{
+}
+
+// Return the border stored by configure() (zero before configuration).
+BorderSize CLNonLinearFilterKernel::border_size() const
+{
+    const BorderSize current_border = _border_size;
+    return current_border;
+}
+
+// Configure a 3x3 or 5x5 non-linear filter on single-channel U8 tensors.
+//
+// input            Source tensor.
+// output           Destination tensor.
+// function         Filter function; its string form becomes a -D build option.
+// mask_size        Side of the square mask, 3 or 5.
+// pattern          Mask pattern; MatrixPattern::OTHER is rejected.
+// mask             Unused by this implementation.
+// border_undefined Forwarded to the window and valid-region computation.
+//
+// The OpenCL kernel name is "non_linear_filter_<pattern, lower-case><N>x<N>".
+void CLNonLinearFilterKernel::configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
+                                        unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
+                                        bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(mask_size != 3 && mask_size != 5);
+    ARM_COMPUTE_ERROR_ON_MSG(pattern == MatrixPattern::OTHER, "MatrixPattern::OTHER is not supported!");
+    ARM_COMPUTE_UNUSED(mask);
+
+    _input       = input;
+    _output      = output;
+    _border_size = BorderSize(mask_size / 2);
+
+    // Build option selecting the filter function
+    std::set<std::string> options;
+    options.emplace("-D" + string_from_non_linear_filter_function(function));
+
+    // Assemble the kernel name from the lower-cased pattern and the mask dimensions
+    std::string pattern_lower = string_from_matrix_pattern(pattern);
+    for(char &ch : pattern_lower)
+    {
+        ch = ::tolower(ch);
+    }
+    const std::string side        = std::to_string(mask_size);
+    const std::string kernel_name = "non_linear_filter_" + pattern_lower + side + "x" + side;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options));
+
+    // Per-iteration footprint: 8 outputs produced from a 16 x mask_size input patch
+    constexpr unsigned int step_x      = 8;
+    constexpr unsigned int read_width  = 16;
+    const unsigned int     read_height = mask_size;
+
+    Window win = calculate_max_window(*input->info(), step_x, border_undefined, border_size());
+
+    // The input patch starts one border width to the left of / above the current position
+    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, read_width, read_height);
+    AccessWindowHorizontal output_access(output->info(), 0, step_x);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
new file mode 100644
index 0000000000..6a96b0effd
--- /dev/null
+++ b/src/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+// The 3x3 neighbourhood needs a border of one element.
+BorderSize CLNonMaximaSuppression3x3Kernel::border_size() const
+{
+    const BorderSize one_element_border(1);
+    return one_element_border;
+}
+
+// Configure 3x3 non-maxima suppression on single-channel U8 or F32 tensors.
+//
+// input            Source tensor; its data type selects the kernel's DATA_TYPE.
+// output           Destination tensor.
+// border_undefined Forwarded to the window and valid-region computation.
+void CLNonMaximaSuppression3x3Kernel::configure(const ICLTensor *input, ICLTensor *output, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32);
+
+    _input  = input;
+    _output = output;
+
+    // Create kernel
+    std::set<std::string> options;
+    options.insert("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("non_max_suppression", options));
+
+    // Per-iteration footprint: 8 outputs produced from a 16x3 input patch
+    constexpr unsigned int step_x      = 8;
+    constexpr unsigned int write_width = 8;
+    constexpr unsigned int read_width  = 16;
+    constexpr unsigned int read_height = 3;
+
+    const BorderSize border = border_size();
+
+    Window win = calculate_max_window(*input->info(), Steps(step_x), border_undefined, border);
+
+    // The input patch starts one border width to the left of / above the current position
+    AccessWindowRectangle  input_access(input->info(), -border.left, -border.top, read_width, read_height);
+    AccessWindowHorizontal output_access(output->info(), 0, write_width);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
new file mode 100644
index 0000000000..106a5113db
--- /dev/null
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+using namespace arm_compute;
+
+// Default constructor: no tensors attached and a zero border until configure() is called.
+CLNormalizationLayerKernel::CLNormalizationLayerKernel()
+    : _input(nullptr),
+      _squared_input(nullptr),
+      _output(nullptr),
+      _border_size(0)
+{
+}
+
+// Return the border stored by configure() (zero before configuration).
+BorderSize CLNormalizationLayerKernel::border_size() const
+{
+    const BorderSize current_border = _border_size;
+    return current_border;
+}
+
+// Configure the normalization layer kernel.
+//
+// input         Source tensor (S16, U16, F16 or F32, single channel).
+// squared_input Tensor bound as the kernel's second 3D argument, read with the same horizontal window as input.
+// output        Destination tensor; must have the same data type as input.
+// norm_info     Normalization parameters. The size must be odd and NormType::IN_MAP_2D is rejected.
+//
+// NormType::IN_MAP_1D selects "normalization_layer_in_map_1D"; any other accepted type
+// selects "normalization_layer_cross_map".
+void CLNormalizationLayerKernel::configure(const ICLTensor *input, const ICLTensor *squared_input, ICLTensor *output, NormalizationLayerInfo norm_info)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16, DataType::U16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
+    ARM_COMPUTE_ERROR_ON_MSG(norm_info.type() == NormType::IN_MAP_2D, "2D In-Map Normalization not implemented");
+
+    // Set build options
+    std::set<std::string> options;
+    options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+    _input         = input;
+    _squared_input = squared_input;
+    _output        = output;
+
+    const bool         in_map_1d = (norm_info.type() == NormType::IN_MAP_1D);
+    const unsigned int radius    = norm_info.norm_size() / 2;
+
+    // In-map normalization uses a left/right border of at most 3 elements; cross-map uses none
+    unsigned int border_width = 0;
+    if(in_map_1d)
+    {
+        border_width = std::min(radius, 3U);
+    }
+    _border_size = BorderSize(0, border_width);
+
+    // Create kernel
+    std::string kernel_name = in_map_1d ? "normalization_layer_in_map_1D" : "normalization_layer_cross_map";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options));
+
+    // Static arguments follow the slots reserved for the three 3D tensors bound in run()
+    unsigned int arg_idx = 3 * num_arguments_per_3D_tensor();
+    _kernel.setArg<cl_float>(arg_idx++, norm_info.scale_coeff());
+    _kernel.setArg<cl_float>(arg_idx++, norm_info.beta());
+    _kernel.setArg<cl_float>(arg_idx++, norm_info.kappa());
+    _kernel.setArg<cl_uint>(arg_idx++, radius);
+
+    // Configure kernel window
+    const unsigned int step_x     = in_map_1d ? 4 : 1;
+    const unsigned int read_width = step_x + 2 * radius;
+
+    Window win = calculate_max_window(*input->info(), Steps(step_x));
+
+    AccessWindowHorizontal input_access(input->info(), -_border_size.left, read_width);
+    AccessWindowHorizontal squared_input_access(squared_input->info(), -_border_size.left, read_width);
+    AccessWindowHorizontal output_access(output->info(), 0, step_x);
+
+    update_window_and_padding(win, input_access, squared_input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+// Enqueue the kernel once for every 3D slice of the window, binding
+// input, squared input and output (in that order) for each slice.
+void CLNormalizationLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window volume = window.first_slice_window_3D();
+
+    // The body always runs for the first slice, then repeats while the window can slide further
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_3D(volume))
+    {
+        unsigned int arg_idx = 0;
+        add_3D_tensor_argument(arg_idx, _input, volume);
+        add_3D_tensor_argument(arg_idx, _squared_input, volume);
+        add_3D_tensor_argument(arg_idx, _output, volume);
+        enqueue(queue, *this, volume);
+    }
+}
diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
new file mode 100644
index 0000000000..84eb434bc9
--- /dev/null
+++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+// Default constructor: no tensors attached until configure() is called.
+CLPixelWiseMultiplicationKernel::CLPixelWiseMultiplicationKernel()
+    : _input1(nullptr),
+      _input2(nullptr),
+      _output(nullptr)
+{
+}
+
+// Configure the pixel-wise multiplication kernel.
+//
+// input1, input2  Source tensors (U8, S16, F16 or F32, single channel).
+// output          Destination tensor; may only be U8 when both inputs are U8.
+// scale           Non-negative scale factor applied to the product.
+// overflow_policy Selects -DWRAP or -DSATURATE (float outputs always use -DWRAP).
+// rounding_policy Selects the rounding suffix: _rtz for TO_ZERO, _rte otherwise.
+//
+// Two kernels exist: "pixelwise_mul_int" is used when neither input is floating point
+// and scale is exactly 1/2^n with 0 <= n <= 15 (n is passed as the last argument);
+// otherwise "pixelwise_mul_float" is used and scale itself is passed.
+void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+ "Output can only be U8 if both inputs are U8");
+ ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative. ");
+
+ _input1 = input1;
+ _input2 = input2;
+ _output = output;
+
+ // scale_int < 0 means "use the float kernel"; otherwise it holds n for scale == 1/2^n
+ int scale_int = -1;
+ // Split scale into a normalized mantissa and a power-of-two exponent
+ int exponent = 0;
+ float normalized_mantissa = std::frexp(scale, &exponent);
+ // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
+ // frexp returns 0.5 as mantissa for an exact power of two, so scale == 1/2^n gives exponent e = 1 - n,
+ // which means that the exponent will be in the range of -14 <= e <= 1
+ // (e == 1 corresponds to scale == 1, e == -14 to scale == 1/2^15)
+ if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
+ {
+ // Store the positive exponent. We know that we compute 1/2^n
+ // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
+ scale_int = std::abs(exponent - 1);
+ }
+
+ std::string data_type;
+ std::string compute_type;
+ // Check if either input is floating point (the output type is not consulted here)
+ if(is_data_type_float(input1->info()->data_type()) || is_data_type_float(input2->info()->data_type()))
+ {
+ // Floating point inputs always use the float kernel, whatever the scale
+ scale_int = -1;
+ compute_type = (DataType::F32 == input1->info()->data_type() || DataType::F32 == input2->info()->data_type()) ? "float" : "half";
+ data_type = "DATA_TYPE_FLOAT";
+ }
+ else
+ {
+ // Integer inputs: intermediate type is int if either input is S16, ushort when both are U8
+ compute_type = (DataType::S16 == input1->info()->data_type() || DataType::S16 == input2->info()->data_type()) ? "int" : "ushort";
+ data_type = "DATA_TYPE_INT";
+ }
+
+ // Construct kernel name
+ std::string kernel_name = "pixelwise_mul";
+ kernel_name += (scale_int >= 0) ? "_int" : "_float";
+
+ // Set kernel build options
+ std::set<std::string> build_opts;
+ build_opts.emplace((overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) ? "-DWRAP" : "-DSATURATE");
+ build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz" : "-DROUND=_rte");
+ build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type()));
+ build_opts.emplace("-DDATA_TYPE_RES=" + compute_type);
+ build_opts.emplace("-D" + data_type);
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+ // Set scale argument
+ unsigned int idx = 3 * num_arguments_per_2D_tensor(); //Skip the inputs and output parameters
+
+ // The argument type differs per kernel: int shift amount versus float factor
+ if(scale_int >= 0)
+ {
+ _kernel.setArg(idx++, scale_int);
+ }
+ else
+ {
+ _kernel.setArg(idx++, scale);
+ }
+
+ // Configure kernel window
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+
+ AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal input2_access(input2->info(), 0, num_elems_processed_per_iteration);
+ AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+ update_window_and_padding(win, input1_access, input2_access, output_access);
+
+ // The output is only valid where both inputs are valid
+ ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+ input2->info()->valid_region());
+ output_access.set_valid_region(win, valid_region);
+
+ ICLKernel::configure(win);
+}
+
+// Enqueue the kernel once for every 2D slice of the window, binding
+// both inputs and the output (in that order) for each slice.
+void CLPixelWiseMultiplicationKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+
+    // The body always runs for the first slice, then repeats while the window can slide further
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(plane))
+    {
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input1, plane);
+        add_2D_tensor_argument(arg_idx, _input2, plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        enqueue(queue, *this, plane);
+    }
+}
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
new file mode 100644
index 0000000000..dc5ae4ec7a
--- /dev/null
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+#include <tuple>
+
+using namespace arm_compute;
+
+/** Default constructor: no tensors attached, default pooling info and a zero border. */
+CLPoolingLayerKernel::CLPoolingLayerKernel()
+    : _input(nullptr),
+      _output(nullptr),
+      _pool_info(),
+      _border_size(0)
+{
+}
+
+/** Border required by the kernel, as computed by the last call to configure().
+ *
+ * @return The stored border size (zero until configure() has been called).
+ */
+BorderSize CLPoolingLayerKernel::border_size() const
+{
+    return _border_size;
+}
+
+/** Configure the pooling kernel: validate the tensors, derive the border, build the OpenCL
+ *  kernel and set up the execution window.
+ *
+ * @param[in]  input     Source tensor. Checked to be single channel F16 or F32.
+ * @param[out] output    Destination tensor. Same data type as @p input; its first two dimensions
+ *                       must match the pooled dimensions derived from @p pool_info.
+ * @param[in]  pool_info Pooling description (type, size, padding, stride, rounding). Only pool
+ *                       sizes 2 and 3 are accepted.
+ */
+void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
+{
+    // Unpack the pooling description
+    const PoolingType           pool_type       = pool_info.pool_type();
+    const int                   pool_size       = pool_info.pool_size();
+    const PadStrideInfo         pad_stride_info = pool_info.pad_stride_info();
+    const DimensionRoundingType pool_round      = pad_stride_info.round();
+
+    int pool_pad_x = 0;
+    int pool_pad_y = 0;
+    std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad();
+
+    int pool_stride_x = 0;
+    int pool_stride_y = 0;
+    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+
+    // Validate tensors and pooling parameters
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size);
+    ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size);
+
+    // Check output dimensions against the expected pooled size
+    unsigned int pooled_w = 0;
+    unsigned int pooled_h = 0;
+    std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0),
+                                                     input->info()->dimension(1),
+                                                     pool_size,
+                                                     pool_stride_x, pool_stride_y,
+                                                     pool_pad_x, pool_pad_y,
+                                                     pool_round);
+    ARM_COMPUTE_UNUSED(pooled_w);
+    ARM_COMPUTE_UNUSED(pooled_h);
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h));
+
+    // How far the last pooling region reaches beyond the right/bottom edge of the input
+    const int input_width   = input->info()->dimension(0);
+    const int input_height  = input->info()->dimension(1);
+    const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
+    const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
+
+    // Store the configuration; right/bottom borders cover at least the padding
+    _input              = input;
+    _output             = output;
+    _pool_info          = pool_info;
+    _border_size        = BorderSize(pool_pad_y, pool_pad_x);
+    _border_size.right  = std::max(upper_bound_w, pool_pad_x);
+    _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
+
+    // Build options: element type and pooling operation
+    const std::string data_type_opt = "-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type());
+    const std::string pool_type_opt = (PoolingType::MAX == pool_type) ? "-DPOOL_MAX" : "-DPOOL_AVG";
+
+    std::set<std::string> build_opts;
+    build_opts.emplace(data_type_opt);
+    build_opts.emplace(pool_type_opt);
+
+    // Create kernel (one OpenCL kernel per supported pool size)
+    const std::string kernel_name = "pooling_layer_" + val_to_string(pool_size);
+    _kernel                       = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Average pooling takes three extra static arguments after the two 3D tensors
+    if(PoolingType::AVG == pool_type)
+    {
+        const cl_int2 max_dims =
+        {
+            {
+                static_cast<cl_int>(input->info()->dimension(0)) + pool_pad_x,
+                static_cast<cl_int>(input->info()->dimension(1)) + pool_pad_y,
+            }
+        };
+        const cl_int2 strides  = { { pool_stride_x, pool_stride_y } };
+        const cl_int2 paddings = { { pool_pad_x, pool_pad_y } };
+
+        unsigned int arg_index = 2 * num_arguments_per_3D_tensor();
+        _kernel.setArg<cl_int2>(arg_index++, max_dims);
+        _kernel.setArg<cl_int2>(arg_index++, strides);
+        _kernel.setArg<cl_int2>(arg_index++, paddings);
+    }
+
+    // Configure kernel window: one output element per work-item
+    const unsigned int num_elems_processed_per_iteration = 1;
+
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowStatic     input_access(input->info(),
+                                        -pool_pad_x, -pool_pad_y,
+                                        input_width + _border_size.right, input_height + _border_size.bottom);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the pooling kernel once for every 3D slice of @p window.
+ *
+ * For each output slice the matching input slice is derived by shifting the start back by the
+ * padding and scaling the end and the step by the stride.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    // Signed, like in configure(): the input window start becomes negative when padding is used,
+    // so the subtraction below must not be carried out in unsigned arithmetic.
+    int pool_pad_x    = 0;
+    int pool_pad_y    = 0;
+    int pool_stride_x = 0;
+    int pool_stride_y = 0;
+    std::tie(pool_pad_x, pool_pad_y)       = _pool_info.pad_stride_info().pad();
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride();
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        // Upsample input by pool size
+        Window in_slice(slice);
+        in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - pool_pad_x, in_slice.x().end() * pool_stride_x, pool_stride_x));
+        in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - pool_pad_y, in_slice.y().end() * pool_stride_y, pool_stride_y));
+
+        // Set inputs
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, in_slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice);
+    }
+    while(window.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLRemapKernel.cpp b/src/core/CL/kernels/CLRemapKernel.cpp
new file mode 100644
index 0000000000..e63a5ef7c6
--- /dev/null
+++ b/src/core/CL/kernels/CLRemapKernel.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <algorithm>
+
+using namespace arm_compute;
+
+/** Default constructor: all tensor pointers start out null until configure() is called. */
+CLRemapKernel::CLRemapKernel()
+    : _input(nullptr),
+      _output(nullptr),
+      _map_x(nullptr),
+      _map_y(nullptr)
+{
+}
+
+/** Border required by the remap kernel.
+ *
+ * @return A border of one element on every side.
+ */
+BorderSize CLRemapKernel::border_size() const
+{
+    return BorderSize(1);
+}
+
+/** Configure the remap kernel: validate the tensors, build the OpenCL kernel, set up the
+ *  execution window and bind the static arguments.
+ *
+ * @param[in]  input            Source tensor. Checked to be single channel U8.
+ * @param[in]  map_x            Map of x coordinates. Checked to be single channel F32.
+ * @param[in]  map_y            Map of y coordinates. Checked to be single channel F32.
+ * @param[out] output           Destination tensor. Checked to be single channel U8.
+ * @param[in]  policy           Interpolation policy; selects the "remap_<policy>" kernel.
+ *                              Area interpolation is rejected.
+ * @param[in]  border_undefined True if the border is undefined, in which case no border is
+ *                              added to the input access window.
+ */
+void CLRemapKernel::configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MSG(policy == InterpolationPolicy::AREA, "Area interpolation is not supported!");
+
+    _input  = input;
+    _output = output;
+    _map_x  = map_x;
+    _map_y  = map_y;
+
+    // Create kernel
+    std::set<std::string> build_opts         = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
+    std::string           interpolation_name = string_from_interpolation_policy(policy);
+    std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
+    std::string kernel_name = "remap_" + interpolation_name;
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Configure window
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    const int              border_offset                     = (border_undefined) ? 0 : border_size().left;
+
+    // The window iterates over the output, one vector of elements per work-item
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    // The static access window (whole tensor plus border) belongs to the input;
+    // the horizontal access window describes the vectorised writes to the output.
+    AccessWindowStatic     input_access(input->info(), -border_offset, -border_offset,
+                                        input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+
+    // Set static arguments: input width and height, passed after the four 2D tensors
+    unsigned int idx = 4 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+    _kernel.setArg<cl_float>(idx++, input->info()->dimension(0));
+    _kernel.setArg<cl_float>(idx++, input->info()->dimension(1));
+}
+
+/** Enqueue the remap kernel once for every 2D slice of @p window.
+ *
+ * The tensors are bound in the order input, output, map_x, map_y, starting at argument 0.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLRemapKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    // The first slice is always processed; afterwards keep going while the window can slide
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(slice))
+    {
+        unsigned int arg_index = 0;
+        add_2D_tensor_argument(arg_index, _input, slice);
+        add_2D_tensor_argument(arg_index, _output, slice);
+        add_2D_tensor_argument(arg_index, _map_x, slice);
+        add_2D_tensor_argument(arg_index, _map_y, slice);
+
+        enqueue(queue, *this, slice);
+    }
+}
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
new file mode 100644
index 0000000000..d74e837ace
--- /dev/null
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Border required by the scale kernel.
+ *
+ * @return A border of one element on every side.
+ */
+BorderSize CLScaleKernel::border_size() const
+{
+    return BorderSize(1);
+}
+
+/** Configure the scale kernel: validate the tensors, pick the interpolation kernel, set up the
+ *  execution window and bind the static arguments.
+ *
+ * @param[in]  input            Source tensor. Checked to be single channel U8 or S16.
+ * @param[out] output           Destination tensor. Same data type as @p input.
+ * @param[in]  policy           Interpolation policy; selects the "scale_<policy>" kernel. AREA is
+ *                              only accepted when up-sampling, where it is replaced by
+ *                              NEAREST_NEIGHBOR.
+ * @param[in]  border_undefined True if the border is undefined, in which case no border is
+ *                              added to the input access window.
+ */
+void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _input  = input;
+    _output = output;
+
+    // Source/destination size ratios; a ratio <= 1 means the image is not being shrunk
+    const auto wr = static_cast<float>(input->info()->dimension(0)) / static_cast<float>(output->info()->dimension(0));
+    const auto hr = static_cast<float>(input->info()->dimension(1)) / static_cast<float>(output->info()->dimension(1));
+
+    // Area interpolation behaves as Nearest Neighbour in case of up-sampling
+    const bool is_upsampling = (wr <= 1.f) && (hr <= 1.f);
+    if((InterpolationPolicy::AREA == policy) && is_upsampling)
+    {
+        policy = InterpolationPolicy::NEAREST_NEIGHBOR;
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON(policy == InterpolationPolicy::AREA);
+    }
+
+    // Create kernel: the kernel name is "scale_" followed by the lower-case policy name
+    std::string kernel_name = string_from_interpolation_policy(policy);
+    std::transform(kernel_name.begin(), kernel_name.end(), kernel_name.begin(), ::tolower);
+    kernel_name.insert(0, "scale_");
+
+    const std::set<std::string> build_opts = { ("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())) };
+    _kernel                                = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    const int              border_offset                     = border_undefined ? 0 : border_size().left;
+
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    // The input is accessed over its whole extent plus the border
+    AccessWindowStatic     input_access(input->info(),
+                                        -border_offset, -border_offset,
+                                        input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+
+    // Static kernel arguments follow the input and output tensors:
+    // input width, input height, output width, output height
+    unsigned int arg_index = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg<float>(arg_index++, input->info()->dimension(0));
+    _kernel.setArg<float>(arg_index++, input->info()->dimension(1));
+    _kernel.setArg<float>(arg_index++, output->info()->dimension(0));
+    _kernel.setArg<float>(arg_index++, output->info()->dimension(1));
+}
diff --git a/src/core/CL/kernels/CLScharr3x3Kernel.cpp b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
new file mode 100644
index 0000000000..913ef592d4
--- /dev/null
+++ b/src/core/CL/kernels/CLScharr3x3Kernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor: neither gradient is enabled and no tensors are attached. */
+CLScharr3x3Kernel::CLScharr3x3Kernel()
+    : _run_scharr_x(false),
+      _run_scharr_y(false),
+      _input(nullptr),
+      _output_x(nullptr),
+      _output_y(nullptr)
+{
+}
+
+/** Border required by the 3x3 Scharr kernel.
+ *
+ * @return A border of one element on every side.
+ */
+BorderSize CLScharr3x3Kernel::border_size() const
+{
+    return BorderSize(1);
+}
+
+/** Configure the 3x3 Scharr kernel.
+ *
+ * At least one of the outputs has to be provided; a gradient is only computed for the
+ * outputs that are not null.
+ *
+ * @param[in]  input            Source tensor. Checked to be single channel U8.
+ * @param[out] output_x         (Optional) Destination for the X gradient. Checked to be single channel S16.
+ * @param[out] output_y         (Optional) Destination for the Y gradient. Checked to be single channel S16.
+ * @param[in]  border_undefined True if the border is undefined.
+ */
+void CLScharr3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    // A gradient is enabled exactly when its destination tensor is supplied
+    _run_scharr_x = (nullptr != output_x);
+    _run_scharr_y = (nullptr != output_y);
+
+    if(_run_scharr_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+    }
+
+    if(_run_scharr_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+    }
+
+    _input    = input;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // One preprocessor define per enabled gradient
+    std::set<std::string> build_opts;
+
+    if(_run_scharr_x)
+    {
+        build_opts.emplace("-DGRAD_X");
+    }
+
+    if(_run_scharr_y)
+    {
+        build_opts.emplace("-DGRAD_Y");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("scharr3x3", build_opts));
+
+    // Configure kernel window: 8 elements are produced per iteration from a 16x3 block of input
+    constexpr unsigned int elems_processed = 8;
+    constexpr unsigned int elems_read      = 16;
+    constexpr unsigned int elems_written   = 8;
+    constexpr unsigned int rows_read       = 3;
+
+    const BorderSize border = border_size();
+
+    Window win = calculate_max_window(*input->info(), Steps(elems_processed), border_undefined, border);
+
+    AccessWindowRectangle  input_access(input->info(), -border.left, -border.top, elems_read, rows_read);
+    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, elems_written);
+    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, elems_written);
+
+    update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
+    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the Scharr kernel once for every 2D slice of @p window.
+ *
+ * The input is always bound first; each output is bound only if its gradient is enabled.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLScharr3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    // The first slice is always processed; afterwards keep going while the window can slide
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(slice))
+    {
+        unsigned int arg_index = 0;
+        add_2D_tensor_argument(arg_index, _input, slice);
+
+        if(_run_scharr_x)
+        {
+            add_2D_tensor_argument(arg_index, _output_x, slice);
+        }
+
+        if(_run_scharr_y)
+        {
+            add_2D_tensor_argument(arg_index, _output_y, slice);
+        }
+
+        enqueue(queue, *this, slice);
+    }
+}
diff --git a/src/core/CL/kernels/CLSobel3x3Kernel.cpp b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
new file mode 100644
index 0000000000..436aaa498a
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel3x3Kernel.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor: no tensors attached and neither gradient enabled. */
+CLSobel3x3Kernel::CLSobel3x3Kernel()
+    : _input(nullptr),
+      _output_x(nullptr),
+      _output_y(nullptr),
+      _run_sobel_x(false),
+      _run_sobel_y(false)
+{
+}
+
+/** Border required by the 3x3 Sobel kernel.
+ *
+ * @return A border of one element on every side.
+ */
+BorderSize CLSobel3x3Kernel::border_size() const
+{
+    return BorderSize(1);
+}
+
+/** Configure the 3x3 Sobel kernel.
+ *
+ * At least one of the outputs has to be provided; a gradient is only computed for the
+ * outputs that are not null.
+ *
+ * @param[in]  input            Source tensor. Checked to be single channel U8.
+ * @param[out] output_x         (Optional) Destination for the X gradient. Checked to be single channel S16.
+ * @param[out] output_y         (Optional) Destination for the Y gradient. Checked to be single channel S16.
+ * @param[in]  border_undefined True if the border is undefined.
+ */
+void CLSobel3x3Kernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    // A gradient is enabled exactly when its destination tensor is supplied
+    _run_sobel_x = (nullptr != output_x);
+    _run_sobel_y = (nullptr != output_y);
+
+    if(_run_sobel_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+    }
+
+    if(_run_sobel_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+    }
+
+    _input    = input;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // One preprocessor define per enabled gradient
+    std::set<std::string> build_opts;
+
+    if(_run_sobel_x)
+    {
+        build_opts.emplace("-DGRAD_X");
+    }
+
+    if(_run_sobel_y)
+    {
+        build_opts.emplace("-DGRAD_Y");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel3x3", build_opts));
+
+    // Configure kernel window: 8 elements are produced per iteration from a 16x3 block of input
+    constexpr unsigned int elems_processed = 8;
+    constexpr unsigned int elems_read      = 16;
+    constexpr unsigned int elems_written   = 8;
+    constexpr unsigned int rows_read       = 3;
+
+    const BorderSize border = border_size();
+
+    Window win = calculate_max_window(*input->info(), Steps(elems_processed), border_undefined, border);
+
+    AccessWindowRectangle  input_access(input->info(), -border.left, -border.top, elems_read, rows_read);
+    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, elems_written);
+    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, elems_written);
+
+    update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
+    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the 3x3 Sobel kernel once for every 2D slice of @p window.
+ *
+ * The input is always bound first; each output is bound only if its gradient is enabled.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLSobel3x3Kernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    // The first slice is always processed; afterwards keep going while the window can slide
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(slice))
+    {
+        unsigned int arg_index = 0;
+        add_2D_tensor_argument(arg_index, _input, slice);
+
+        if(_run_sobel_x)
+        {
+            add_2D_tensor_argument(arg_index, _output_x, slice);
+        }
+
+        if(_run_sobel_y)
+        {
+            add_2D_tensor_argument(arg_index, _output_y, slice);
+        }
+
+        enqueue(queue, *this, slice);
+    }
+}
diff --git a/src/core/CL/kernels/CLSobel5x5Kernel.cpp b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
new file mode 100644
index 0000000000..4c0316f19e
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel5x5Kernel.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Default constructor: no tensors attached, neither gradient enabled and a zero border. */
+CLSobel5x5HorKernel::CLSobel5x5HorKernel()
+    : _input(nullptr),
+      _output_x(nullptr),
+      _output_y(nullptr),
+      _run_sobel_x(false),
+      _run_sobel_y(false),
+      _border_size(0)
+{
+}
+
+/** Border required by the horizontal 5x5 Sobel pass, as set by the last call to configure().
+ *
+ * @return The stored border size (zero until configure() has been called).
+ */
+BorderSize CLSobel5x5HorKernel::border_size() const
+{
+    return _border_size;
+}
+
+/** Configure the horizontal (1x5) pass of the separable 5x5 Sobel filter.
+ *
+ * At least one of the outputs has to be provided; a gradient is only computed for the
+ * outputs that are not null.
+ *
+ * @param[in]  input            Source tensor. Checked to be single channel U8.
+ * @param[out] output_x         (Optional) Destination for the X gradient. Checked to be single channel S16.
+ * @param[out] output_y         (Optional) Destination for the Y gradient. Checked to be single channel S16.
+ * @param[in]  border_undefined True if the border is undefined.
+ */
+void CLSobel5x5HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    // A gradient is enabled exactly when its destination tensor is supplied
+    _run_sobel_x = (nullptr != output_x);
+    _run_sobel_y = (nullptr != output_y);
+
+    if(_run_sobel_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+    }
+
+    if(_run_sobel_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+    }
+
+    _input       = input;
+    _output_x    = output_x;
+    _output_y    = output_y;
+    _border_size = BorderSize(border_undefined ? 0 : 2, 2);
+
+    // One preprocessor define per enabled gradient
+    std::set<std::string> build_opts;
+
+    if(_run_sobel_x)
+    {
+        build_opts.emplace("-DGRAD_X");
+    }
+
+    if(_run_sobel_y)
+    {
+        build_opts.emplace("-DGRAD_Y");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable1x5", build_opts));
+
+    // Configure kernel window: 8 elements are produced per iteration from 16 input elements
+    constexpr unsigned int elems_processed = 8;
+    constexpr unsigned int elems_read      = 16;
+    constexpr unsigned int elems_written   = 8;
+
+    // border_size() returns the value stored in _border_size just above
+    const BorderSize border = border_size();
+
+    Window win = calculate_max_window_horizontal(*input->info(), Steps(elems_processed), border_undefined, border);
+
+    AccessWindowHorizontal input_access(input->info(), -border.left, elems_read);
+    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, elems_written);
+    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, elems_written);
+
+    update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
+    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the horizontal 5x5 Sobel pass once for every 2D slice of @p window.
+ *
+ * The input is always bound first; each output is bound only if its gradient is enabled.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLSobel5x5HorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    // The first slice is always processed; afterwards keep going while the window can slide
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(slice))
+    {
+        unsigned int arg_index = 0;
+        add_2D_tensor_argument(arg_index, _input, slice);
+
+        if(_run_sobel_x)
+        {
+            add_2D_tensor_argument(arg_index, _output_x, slice);
+        }
+
+        if(_run_sobel_y)
+        {
+            add_2D_tensor_argument(arg_index, _output_y, slice);
+        }
+
+        enqueue(queue, *this, slice);
+    }
+}
+
+/** Default constructor: no tensors attached and neither gradient enabled. */
+CLSobel5x5VertKernel::CLSobel5x5VertKernel()
+    : _input_x(nullptr),
+      _input_y(nullptr),
+      _output_x(nullptr),
+      _output_y(nullptr),
+      _run_sobel_x(false),
+      _run_sobel_y(false)
+{
+}
+
+/** Border required by the vertical 5x5 Sobel pass.
+ *
+ * @return The fixed border BorderSize(2, 0).
+ */
+BorderSize CLSobel5x5VertKernel::border_size() const
+{
+    return BorderSize(2, 0);
+}
+
+/** Configure the vertical (5x1) pass of the separable 5x5 Sobel filter.
+ *
+ * At least one of the outputs has to be provided. An input/output pair is only validated
+ * and used when the corresponding output is not null.
+ *
+ * @param[in]  input_x          Input for the X gradient. Checked to be single channel S16 when @p output_x is given.
+ * @param[in]  input_y          Input for the Y gradient. Checked to be single channel S16 when @p output_y is given.
+ * @param[out] output_x         (Optional) Destination for the X gradient. Checked to be single channel S16.
+ * @param[out] output_y         (Optional) Destination for the Y gradient. Checked to be single channel S16.
+ * @param[in]  border_undefined True if the border is undefined.
+ */
+void CLSobel5x5VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    // A gradient is enabled exactly when its destination tensor is supplied
+    _run_sobel_x = (nullptr != output_x);
+    _run_sobel_y = (nullptr != output_y);
+
+    if(_run_sobel_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S16);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
+    }
+
+    if(_run_sobel_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S16);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
+    }
+
+    _input_x  = input_x;
+    _input_y  = input_y;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // One preprocessor define per enabled gradient
+    std::set<std::string> build_opts;
+
+    if(_run_sobel_x)
+    {
+        build_opts.emplace("-DGRAD_X");
+    }
+
+    if(_run_sobel_y)
+    {
+        build_opts.emplace("-DGRAD_Y");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable5x1", build_opts));
+
+    // The window and valid region are derived from whichever input is in use (X takes precedence)
+    const ICLTensor *input = _run_sobel_x ? _input_x : _input_y;
+
+    // Configure kernel window: 8 elements are produced per iteration from an 8x5 block of input
+    constexpr unsigned int elems_processed = 8;
+    constexpr unsigned int elems_written   = 8;
+    constexpr unsigned int elems_read      = 8;
+    constexpr unsigned int rows_read       = 5;
+
+    const BorderSize border = border_size();
+
+    Window win = calculate_max_window(*input->info(), Steps(elems_processed), border_undefined, border);
+
+    AccessWindowRectangle  input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border.top, elems_read, rows_read);
+    AccessWindowRectangle  input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border.top, elems_read, rows_read);
+    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, elems_written);
+    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, elems_written);
+
+    update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access);
+
+    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
+    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border);
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueue the vertical 5x5 Sobel pass once for every 2D slice of @p window.
+ *
+ * For each enabled gradient the input/output pair is bound, followed by one trailing dummy
+ * integer argument.
+ *
+ * @param[in]     window Region to execute; must be a valid sub-window of the configured window.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLSobel5x5VertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_2D();
+
+    // The first slice is always processed; afterwards keep going while the window can slide
+    for(bool has_slice = true; has_slice; has_slice = window.slide_window_slice_2D(slice))
+    {
+        unsigned int arg_index = 0;
+
+        if(_run_sobel_x)
+        {
+            add_2D_tensor_argument(arg_index, _input_x, slice);
+            add_2D_tensor_argument(arg_index, _output_x, slice);
+        }
+
+        if(_run_sobel_y)
+        {
+            add_2D_tensor_argument(arg_index, _input_y, slice);
+            add_2D_tensor_argument(arg_index, _output_y, slice);
+        }
+
+        _kernel.setArg(arg_index++, 0 /*dummy*/);
+
+        enqueue(queue, *this, slice);
+    }
+}
diff --git a/src/core/CL/kernels/CLSobel7x7Kernel.cpp b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
new file mode 100644
index 0000000000..a477953cfb
--- /dev/null
+++ b/src/core/CL/kernels/CLSobel7x7Kernel.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Creates an unconfigured kernel: no tensors attached, both gradients disabled, border size built from 0. */
+CLSobel7x7HorKernel::CLSobel7x7HorKernel()
+    : _input(nullptr),
+      _output_x(nullptr),
+      _output_y(nullptr),
+      _run_sobel_x(false),
+      _run_sobel_y(false),
+      _border_size(0)
+{
+    // Everything else is set up by configure()
+}
+
+/** Returns the border size stored by the constructor or by the last call to configure(). */
+BorderSize CLSobel7x7HorKernel::border_size() const
+{
+    const BorderSize &stored_border = _border_size;
+    return stored_border;
+}
+
+/** Sets up the "sobel_separable1x7" OpenCL kernel.
+ *
+ * @param[in]  input            Source tensor; checked to be single-channel U8.
+ * @param[out] output_x         Destination for the X gradient (checked to be S32), or nullptr to skip it.
+ * @param[out] output_y         Destination for the Y gradient (checked to be S32), or nullptr to skip it.
+ * @param[in]  border_undefined Forwarded to the window and valid-region computations; it also
+ *                              selects the first BorderSize argument (0 when true, 3 otherwise).
+ *
+ * @note At least one of @p output_x and @p output_y must be non-null.
+ */
+void CLSobel7x7HorKernel::configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    _run_sobel_x = (output_x != nullptr);
+    _run_sobel_y = (output_y != nullptr);
+
+    // Validate every requested output and collect the matching build option
+    std::set<std::string> build_opts;
+
+    if(_run_sobel_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32);
+        build_opts.insert("-DGRAD_X");
+    }
+
+    if(_run_sobel_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32);
+        build_opts.insert("-DGRAD_Y");
+    }
+
+    _input    = input;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    const int first_border_arg = border_undefined ? 0 : 3;
+    _border_size               = BorderSize(first_border_arg, 3);
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable1x7", build_opts));
+
+    // Configure kernel window
+    constexpr unsigned int elems_per_step = 8;
+    constexpr unsigned int elems_read     = 16;
+    constexpr unsigned int elems_written  = 8;
+
+    Window win = calculate_max_window_horizontal(*input->info(), Steps(elems_per_step), border_undefined, border_size());
+
+    // Reads start border_size().left elements before the current position
+    AccessWindowHorizontal input_access(input->info(), -border_size().left, elems_read);
+    AccessWindowHorizontal output_x_access(_run_sobel_x ? output_x->info() : nullptr, 0, elems_written);
+    AccessWindowHorizontal output_y_access(_run_sobel_y ? output_y->info() : nullptr, 0, elems_written);
+
+    update_window_and_padding(win, input_access, output_x_access, output_y_access);
+
+    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueues the kernel once for every 2D slice of @p window.
+ *
+ * The input tensor is bound first, followed by whichever of the X and Y outputs are enabled.
+ *
+ * @param[in]     window Region to execute; checked against the configured window.
+ * @param[in,out] queue  Command queue handed to enqueue().
+ */
+void CLSobel7x7HorKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+
+    // The first slice is always processed; stop once the slice cannot be advanced any further
+    for(;;)
+    {
+        unsigned int arg_idx = 0;
+
+        add_2D_tensor_argument(arg_idx, _input, plane);
+
+        if(_run_sobel_x)
+        {
+            add_2D_tensor_argument(arg_idx, _output_x, plane);
+        }
+
+        if(_run_sobel_y)
+        {
+            add_2D_tensor_argument(arg_idx, _output_y, plane);
+        }
+
+        enqueue(queue, *this, plane);
+
+        if(!window.slide_window_slice_2D(plane))
+        {
+            break;
+        }
+    }
+}
+
+/** Creates an unconfigured kernel: no tensors attached and both gradients disabled. */
+CLSobel7x7VertKernel::CLSobel7x7VertKernel()
+    : _input_x(nullptr),
+      _input_y(nullptr),
+      _output_x(nullptr),
+      _output_y(nullptr),
+      _run_sobel_x(false),
+      _run_sobel_y(false)
+{
+    // Everything else is set up by configure()
+}
+
+/** Returns the fixed border of this kernel, BorderSize(3, 0). */
+BorderSize CLSobel7x7VertKernel::border_size() const
+{
+    const BorderSize border(3, 0);
+    return border;
+}
+
+/** Sets up the "sobel_separable7x1" OpenCL kernel.
+ *
+ * @param[in]  input_x          Source for the X gradient; checked to be S32 when @p output_x is given.
+ * @param[in]  input_y          Source for the Y gradient; checked to be S32 when @p output_y is given.
+ * @param[out] output_x         Destination for the X gradient (checked to be S32), or nullptr to skip it.
+ * @param[out] output_y         Destination for the Y gradient (checked to be S32), or nullptr to skip it.
+ * @param[in]  border_undefined Forwarded to the window and valid-region computations.
+ *
+ * @note At least one of @p output_x and @p output_y must be non-null.
+ */
+void CLSobel7x7VertKernel::configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
+
+    _run_sobel_x = (output_x != nullptr);
+    _run_sobel_y = (output_y != nullptr);
+
+    // Validate every requested gradient and collect the matching build option
+    std::set<std::string> build_opts;
+
+    if(_run_sobel_x)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_x, 1, DataType::S32);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S32);
+        build_opts.insert("-DGRAD_X");
+    }
+
+    if(_run_sobel_y)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_y, 1, DataType::S32);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S32);
+        build_opts.insert("-DGRAD_Y");
+    }
+
+    _input_x  = input_x;
+    _input_y  = input_y;
+    _output_x = output_x;
+    _output_y = output_y;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("sobel_separable7x1", build_opts));
+
+    // The window and the valid region are derived from the X input when it is used, else from the Y input
+    const ICLTensor *ref_input = _run_sobel_x ? _input_x : _input_y;
+
+    // Configure kernel window
+    constexpr unsigned int elems_per_step = 8;
+    constexpr unsigned int elems_written  = 8;
+    constexpr unsigned int elems_read     = 8;
+    constexpr unsigned int rows_read      = 7;
+
+    Window win = calculate_max_window(*ref_input->info(), Steps(elems_per_step), border_undefined, border_size());
+
+    // Reads start border_size().top rows above the current position
+    AccessWindowRectangle  input_x_access(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, elems_read, rows_read);
+    AccessWindowRectangle  input_y_access(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, elems_read, rows_read);
+    AccessWindowHorizontal output_x_access(_run_sobel_x ? output_x->info() : nullptr, 0, elems_written);
+    AccessWindowHorizontal output_y_access(_run_sobel_y ? output_y->info() : nullptr, 0, elems_written);
+
+    update_window_and_padding(win, input_x_access, input_y_access, output_x_access, output_y_access);
+
+    output_x_access.set_valid_region(win, ref_input->info()->valid_region(), border_undefined, border_size());
+    output_y_access.set_valid_region(win, ref_input->info()->valid_region(), border_undefined, border_size());
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueues the kernel once for every 2D slice of @p window.
+ *
+ * For each enabled gradient the matching input/output tensor pair is bound with
+ * add_2D_tensor_argument(); one trailing integer argument is set afterwards.
+ *
+ * @param[in]     window Region to execute; checked against the configured window.
+ * @param[in,out] queue  Command queue handed to enqueue().
+ */
+void CLSobel7x7VertKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+
+    // The first slice is always processed; stop once the slice cannot be advanced any further
+    for(;;)
+    {
+        unsigned int arg_idx = 0;
+
+        if(_run_sobel_x)
+        {
+            add_2D_tensor_argument(arg_idx, _input_x, plane);
+            add_2D_tensor_argument(arg_idx, _output_x, plane);
+        }
+
+        if(_run_sobel_y)
+        {
+            add_2D_tensor_argument(arg_idx, _input_y, plane);
+            add_2D_tensor_argument(arg_idx, _output_y, plane);
+        }
+
+        _kernel.setArg(arg_idx++, 0 /*dummy*/);
+
+        enqueue(queue, *this, plane);
+
+        if(!window.slide_window_slice_2D(plane))
+        {
+            break;
+        }
+    }
+}
diff --git a/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
new file mode 100644
index 0000000000..0470d5243e
--- /dev/null
+++ b/src/core/CL/kernels/CLSoftmaxLayerKernel.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+/** Sets up the "softmax_layer_max" OpenCL kernel.
+ *
+ * @param[in]  input  Source tensor; checked to be single-channel F16 or F32.
+ * @param[out] output Destination tensor; checked to have the same data type as @p input.
+ */
+void CLLogits1DMaxKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+
+    _input  = input;
+    _output = output;
+
+    // The kernel loops over all elements in steps of 16, so one iteration covers the width rounded up to 16
+    const unsigned int elems_per_step = ceil_to_multiple(input->info()->dimension(0), 16);
+
+    // Build options: the data type, plus a flag when the width is not a multiple of the vector width
+    std::set<std::string> build_opts;
+    build_opts.emplace("-DUSE_" + string_from_data_type(input->info()->data_type()));
+
+    const bool has_leftover = (input->info()->dimension(0) % max_cl_vector_width) != 0;
+    if(has_leftover)
+    {
+        build_opts.emplace("-DNON_MULTIPLE_OF_16");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_max", build_opts));
+
+    // The width is passed right after the input and output tensor arguments
+    const unsigned int width_arg_idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg<cl_uint>(width_arg_idx, input->info()->dimension(0));
+
+    // Configure kernel window
+    constexpr unsigned int elems_written = 1;
+
+    Window                 win = calculate_max_window(*input->info(), Steps(elems_per_step));
+    AccessWindowHorizontal input_access(input->info(), 0, elems_per_step);
+    AccessWindowHorizontal output_access(output->info(), 0, elems_written);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    // The valid region is set to the whole output shape
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+/** Creates an unconfigured kernel with no tensors attached. */
+CLLogits1DShiftExpSumKernel::CLLogits1DShiftExpSumKernel()
+    : _input(nullptr),
+      _max(nullptr),
+      _output(nullptr),
+      _sum(nullptr)
+{
+    // Tensors are attached by configure()
+}
+
+/** Sets up the "softmax_layer_shift_exp_sum" OpenCL kernel.
+ *
+ * All four tensors are checked to be single-channel F16 or F32 and to share one data type.
+ *
+ * @param[in]  input  Source tensor.
+ * @param[in]  max    Tensor read with an access width of one element per iteration.
+ * @param[out] output Destination tensor; its valid region is taken from @p input.
+ * @param[out] sum    Tensor written with an access width of one element per iteration.
+ */
+void CLLogits1DShiftExpSumKernel::configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, max, sum);
+
+    _input  = input;
+    _max    = max;
+    _output = output;
+    _sum    = sum;
+
+    // The kernel loops over all elements in steps of 16, so one iteration covers the width rounded up to 16
+    const unsigned int elems_per_step = ceil_to_multiple(input->info()->dimension(0), 16);
+
+    // Build options: the data type, plus a flag when the width is not a multiple of the vector width
+    std::set<std::string> build_opts;
+    build_opts.emplace("-DUSE_" + string_from_data_type(input->info()->data_type()));
+
+    const bool has_leftover = (input->info()->dimension(0) % max_cl_vector_width) != 0;
+    if(has_leftover)
+    {
+        build_opts.emplace("-DNON_MULTIPLE_OF_16");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_shift_exp_sum", build_opts));
+
+    // The width is passed right after the four 2D tensor arguments
+    const unsigned int width_arg_idx = 4 * num_arguments_per_2D_tensor();
+    _kernel.setArg<cl_uint>(width_arg_idx, input->info()->dimension(0));
+
+    // Configure window
+    Window win = calculate_max_window(*input->info(), Steps(elems_per_step));
+
+    AccessWindowHorizontal input_access(input->info(), 0, elems_per_step);
+    AccessWindowHorizontal max_access(max->info(), 0, 1);
+    AccessWindowHorizontal output_access(output->info(), 0, elems_per_step);
+    AccessWindowHorizontal sum_access(sum->info(), 0, 1);
+
+    update_window_and_padding(win, input_access, max_access, output_access, sum_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+    // The valid region of sum is set to its whole shape
+    sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueues the kernel once for every 2D slice of @p window.
+ *
+ * The tensors are bound in the order input, max, output, sum.
+ *
+ * @param[in]     window Region to execute; checked against the configured window.
+ * @param[in,out] queue  Command queue handed to enqueue().
+ */
+void CLLogits1DShiftExpSumKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+
+    // The first slice is always processed; stop once the slice cannot be advanced any further
+    for(;;)
+    {
+        unsigned int arg_idx = 0;
+
+        add_2D_tensor_argument(arg_idx, _input, plane);
+        add_2D_tensor_argument(arg_idx, _max, plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+        add_2D_tensor_argument(arg_idx, _sum, plane);
+
+        enqueue(queue, *this, plane);
+
+        if(!window.slide_window_slice_2D(plane))
+        {
+            break;
+        }
+    }
+}
+
+/** Creates an unconfigured kernel with no tensors attached. */
+CLLogits1DNormKernel::CLLogits1DNormKernel()
+    : _input(nullptr),
+      _sum(nullptr),
+      _output(nullptr)
+{
+    // Tensors are attached by configure()
+}
+
+/** Sets up the "softmax_layer_norm" OpenCL kernel.
+ *
+ * All three tensors are checked to be single-channel F16 or F32 and to share one data type.
+ *
+ * @param[in]  input  Source tensor.
+ * @param[in]  sum    Tensor accessed through a static window from (0, 0) to (1, sum->info()->dimension(1)).
+ * @param[out] output Destination tensor; its valid region is taken from @p input.
+ */
+void CLLogits1DNormKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(sum, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum);
+
+    _input  = input;
+    _sum    = sum;
+    _output = output;
+
+    // The only build option is the data type
+    std::set<std::string> build_opts{ "-DUSE_" + string_from_data_type(input->info()->data_type()) };
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("softmax_layer_norm", build_opts));
+
+    // Configure window
+    constexpr unsigned int elems_per_step = 16;
+
+    Window win = calculate_max_window(*input->info(), Steps(elems_per_step));
+
+    AccessWindowHorizontal input_access(input->info(), 0, elems_per_step);
+    AccessWindowStatic     sum_access(sum->info(), 0, 0, 1, sum->info()->dimension(1));
+    AccessWindowHorizontal output_access(output->info(), 0, elems_per_step);
+
+    update_window_and_padding(win, input_access, sum_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
+
+/** Enqueues the kernel once for every 2D slice of @p window.
+ *
+ * The sum tensor is bound through a copy of the current slice whose X dimension
+ * is replaced by Window::Dimension(0, 1, 1).
+ *
+ * @param[in]     window Region to execute; checked against the configured window.
+ * @param[in,out] queue  Command queue handed to enqueue().
+ */
+void CLLogits1DNormKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window plane = window.first_slice_window_2D();
+
+    // The first slice is always processed; stop once the slice cannot be advanced any further
+    for(;;)
+    {
+        // Same slice as the input/output, except along X
+        Window sum_plane = plane;
+        sum_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+        unsigned int arg_idx = 0;
+        add_2D_tensor_argument(arg_idx, _input, plane);
+        add_2D_tensor_argument(arg_idx, _sum, sum_plane);
+        add_2D_tensor_argument(arg_idx, _output, plane);
+
+        enqueue(queue, *this, plane);
+
+        if(!window.slide_window_slice_2D(plane))
+        {
+            break;
+        }
+    }
+}
diff --git a/src/core/CL/kernels/CLTableLookupKernel.cpp b/src/core/CL/kernels/CLTableLookupKernel.cpp
new file mode 100644
index 0000000000..bbdaa37410
--- /dev/null
+++ b/src/core/CL/kernels/CLTableLookupKernel.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLLut.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstdint>
+#include <string>
+
+using namespace arm_compute;
+
+/** Sets up the table lookup OpenCL kernel ("tablelookup_U8" or "tablelookup_S16").
+ *
+ * @param[in]  input  Source tensor; checked to be single-channel U8 or S16.
+ * @param[in]  lut    Lookup table; must be non-null, of type U8 or S16, and of the same type as @p input.
+ * @param[out] output Destination tensor; checked to have the same data type as @p input.
+ */
+void CLTableLookupKernel::configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON(lut == nullptr);
+    ARM_COMPUTE_ERROR_ON(DataType::U8 != lut->type() && DataType::S16 != lut->type());
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    // The kernel variant below is chosen from the LUT type alone, so a LUT whose type differs
+    // from the tensors would run the wrong variant over the data
+    ARM_COMPUTE_ERROR_ON(lut->type() != input->info()->data_type());
+
+    const bool is_s16_lut = (DataType::S16 == lut->type());
+
+    // Create kernel
+    std::string kernel_name = is_s16_lut ? "tablelookup_S16" : "tablelookup_U8";
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // Set lut argument
+    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+    _kernel.setArg(idx++, lut->cl_buffer());
+    if(is_s16_lut)
+    {
+        // Only the S16 variant receives the index offset and the number of LUT elements
+        _kernel.setArg(idx++, lut->index_offset());
+        _kernel.setArg(idx++, static_cast<uint32_t>(lut->num_elements()));
+    }
+
+    // Configure kernel
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    ICLSimple2DKernel::configure(input, output, num_elems_processed_per_iteration);
+}
diff --git a/src/core/CL/kernels/CLThresholdKernel.cpp b/src/core/CL/kernels/CLThresholdKernel.cpp
new file mode 100644
index 0000000000..6e07cefc77
--- /dev/null
+++ b/src/core/CL/kernels/CLThresholdKernel.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <string>
+
+using namespace arm_compute;
+
+/** Sets up the "threshold_binary" or "threshold_range" OpenCL kernel.
+ *
+ * @param[in]  input       Source tensor; checked to be single-channel U8.
+ * @param[out] output      Destination tensor; checked to be single-channel U8.
+ * @param[in]  threshold   Passed to the kernel as its third scalar argument.
+ * @param[in]  false_value Passed to the kernel as its first scalar argument.
+ * @param[in]  true_value  Passed to the kernel as its second scalar argument.
+ * @param[in]  type        Selects the kernel variant; anything other than BINARY or RANGE raises an error.
+ * @param[in]  upper       Passed as a fourth scalar argument, for ThresholdType::RANGE only.
+ */
+void CLThresholdKernel::configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
+                                  uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+
+    // Construct kernel name from the thresholding type
+    std::string kernel_name = "threshold";
+
+    if(ThresholdType::BINARY == type)
+    {
+        kernel_name += "_binary";
+    }
+    else if(ThresholdType::RANGE == type)
+    {
+        kernel_name += "_range";
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Thresholding type not recognized");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name));
+
+    // The scalar arguments follow the input and output tensor arguments
+    unsigned int arg_idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg(arg_idx++, false_value);
+    _kernel.setArg(arg_idx++, true_value);
+    _kernel.setArg(arg_idx++, threshold);
+
+    if(ThresholdType::RANGE == type)
+    {
+        _kernel.setArg(arg_idx++, upper);
+    }
+
+    // _kernel has to be initialized before the parent's configure is called
+    constexpr unsigned int elems_per_step = 16;
+    ICLSimple2DKernel::configure(input, output, elems_per_step);
+}
diff --git a/src/core/CL/kernels/CLTransposeKernel.cpp b/src/core/CL/kernels/CLTransposeKernel.cpp
new file mode 100644
index 0000000000..2ee6fcb9dc
--- /dev/null
+++ b/src/core/CL/kernels/CLTransposeKernel.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+
+#include "arm_compute/core/AccessWindowTranspose.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+/** Sets up the "transpose" OpenCL kernel.
+ *
+ * @param[in]  input  Source tensor; checked against the list of supported single-channel data types.
+ * @param[out] output Destination tensor; must be non-null. It is passed to auto_init_if_empty() with
+ *                    the input shape whose first two dimensions are swapped, then checked to match
+ *                    @p input in data type and that swapped shape.
+ */
+void CLTransposeKernel::configure(const ICLTensor *input, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Expected output shape: the input shape with dimensions 0 and 1 exchanged
+    TensorShape transposed_shape{ input->info()->tensor_shape() };
+    transposed_shape.set(0, input->info()->dimension(1));
+    transposed_shape.set(1, input->info()->dimension(0));
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), transposed_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), transposed_shape);
+
+    _input    = input;
+    _output   = output;
+    _lws_hint = cl::NDRange(2, 8);
+
+    // The element size in bytes is passed to the program as a build option
+    std::ostringstream element_size_opt;
+    element_size_opt << "-DDATA_TYPE_IN_BYTES=" << input->info()->element_size();
+
+    std::set<std::string> build_opts;
+    build_opts.emplace(element_size_opt.str());
+
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("transpose", build_opts));
+
+    // Configure kernel window: the step is the same along both dimensions
+    const unsigned int elems_per_step = max_cl_vector_width / input->info()->element_size();
+
+    Window win = calculate_max_window(*input->info(), Steps(elems_per_step, elems_per_step));
+
+    AccessWindowRectangle input_access(input->info(), 0, 0, elems_per_step, elems_per_step);
+    AccessWindowTranspose output_access(output->info(), 0, 0, elems_per_step, elems_per_step);
+
+    update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region());
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLWarpAffineKernel.cpp b/src/core/CL/kernels/CLWarpAffineKernel.cpp
new file mode 100644
index 0000000000..e549dbc258
--- /dev/null
+++ b/src/core/CL/kernels/CLWarpAffineKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <cctype>
+#include <cstddef>
+#include <limits>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+/** Adds one "-DMAT<i>=<value> " build option per matrix coefficient.
+ *
+ * @param[in,out] options Set of build options that receives the new entries.
+ * @param[in]     matrix  Pointer to at least @p size coefficients.
+ * @param[in]     size    Number of coefficients to add.
+ */
+void options_add_matrix(std::set<std::string> &options, const float *matrix, size_t size)
+{
+    for(size_t i = 0; i < size; ++i)
+    {
+        std::stringstream mat_str;
+        // The default stream precision (6 significant digits) would truncate the coefficient;
+        // max_digits10 is enough for the printed text to convert back to the same float
+        mat_str.precision(std::numeric_limits<float>::max_digits10);
+        mat_str << "-DMAT" << i << "=" << matrix[i] << " ";
+        options.insert(mat_str.str());
+    }
+}
+} // namespace
+
+/** Returns the fixed border of this kernel, BorderSize(1). */
+BorderSize CLWarpAffineKernel::border_size() const
+{
+    const BorderSize border(1);
+    return border;
+}
+
+/** Sets up the "warp_affine_<policy>" OpenCL kernel.
+ *
+ * @param[in]  input  Source tensor; checked to be single-channel U8.
+ * @param[out] output Destination tensor; checked to be single-channel U8.
+ * @param[in]  matrix Non-null pointer to at least 6 coefficients, which are compiled into the
+ *                    program as the build options -DMAT0 ... -DMAT5.
+ * @param[in]  policy Interpolation policy; InterpolationPolicy::AREA is rejected. Its lower-cased
+ *                    name is appended to "warp_affine_" to form the kernel name.
+ */
+void CLWarpAffineKernel::configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
+    // The coefficients are read below, so a null matrix must be rejected first
+    ARM_COMPUTE_ERROR_ON(matrix == nullptr);
+
+    _input  = input;
+    _output = output;
+
+    // Create build options
+    std::set<std::string> options;
+    options_add_matrix(options, matrix, 6);
+    options.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+
+    // Create kernel
+    std::string interpolation_name = string_from_interpolation_policy(policy);
+    // Go through unsigned char: std::tolower is undefined for negative values other than EOF
+    std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(),
+                   [](unsigned char c)
+    {
+        return static_cast<char>(std::tolower(c));
+    });
+    std::string kernel_name = "warp_affine_" + interpolation_name;
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, options));
+
+    // Set static kernel arguments: the input width and height follow the two tensor arguments
+    unsigned int idx = 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
+    _kernel.setArg<cl_int>(idx++, input->info()->dimension(0));
+    _kernel.setArg<cl_int>(idx++, input->info()->dimension(1));
+
+    // Configure kernel window (computed over the output tensor)
+    const unsigned int num_elems_processed_per_iteration = 4;
+
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
+
+    update_window_and_padding(win, input_access, output_access);
+
+    // The valid region is set to the whole output shape
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
new file mode 100644
index 0000000000..fddb580750
--- /dev/null
+++ b/src/core/CL/kernels/CLWarpPerspectiveKernel.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+
+#include <cstddef>
+#include <limits>
+#include <locale>
+#include <set>
+#include <sstream>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+/** Insert one "-DMAT<i>=<value> " build option per matrix coefficient.
+ *
+ * Each coefficient is written with std::numeric_limits<float>::max_digits10 significant digits,
+ * so the value parsed back from the define is the same float that was passed in (the default
+ * stream precision of 6 digits would truncate it). The stream is imbued with the classic locale
+ * so the decimal separator does not depend on the global locale.
+ *
+ * @param[in,out] options Set of build options the new entries are inserted into.
+ * @param[in]     matrix  Pointer to at least @p size coefficients.
+ * @param[in]     size    Number of coefficients to export.
+ */
+inline void options_add_matrix(std::set<std::string> &options, const float *matrix, size_t size)
+{
+    for(size_t i = 0; i < size; ++i)
+    {
+        std::stringstream mat_str;
+        // Locale-independent formatting with enough digits to round-trip a float
+        mat_str.imbue(std::locale::classic());
+        mat_str.precision(std::numeric_limits<float>::max_digits10);
+        mat_str << "-DMAT" << i << "=" << matrix[i] << " ";
+        options.insert(mat_str.str());
+    }
+}
+} // namespace
+
+/** Border size reported by the perspective warp kernel.
+ *
+ * @return A BorderSize constructed from the single value 1.
+ */
+BorderSize CLWarpPerspectiveKernel::border_size() const
+{
+    constexpr unsigned int border_width = 1;
+    return BorderSize(border_width);
+}
+
+/** Validate the tensors, build the perspective warp OpenCL kernel and set up its execution window.
+ *
+ * @param[in]  input  Source tensor. Must be single-channel U8.
+ * @param[out] output Destination tensor. Must be single-channel U8.
+ * @param[in]  matrix Pointer to the 9 matrix coefficients; each becomes a MATn build option.
+ * @param[in]  policy Interpolation policy. InterpolationPolicy::AREA is rejected.
+ */
+void CLWarpPerspectiveKernel::configure(const ICLTensor *input, ICLTensor *output, const float *matrix, InterpolationPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(InterpolationPolicy::AREA == policy);
+
+    _input  = input;
+    _output = output;
+
+    // Build options: the element type plus one MATn define per matrix coefficient
+    std::set<std::string> build_opts;
+    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    options_add_matrix(build_opts, matrix, 9);
+
+    // The kernel is looked up by name: "warp_perspective_" followed by the lower-cased policy name
+    std::string policy_name = string_from_interpolation_policy(policy);
+    std::transform(policy_name.begin(), policy_name.end(), policy_name.begin(), ::tolower);
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("warp_perspective_" + policy_name, build_opts));
+
+    auto *in_info  = input->info();
+    auto *out_info = output->info();
+
+    // Static arguments are placed after the input and output 2D tensor argument groups
+    unsigned int arg_idx = 2 * num_arguments_per_2D_tensor();
+    _kernel.setArg<cl_int>(arg_idx++, in_info->dimension(0));
+    _kernel.setArg<cl_int>(arg_idx++, in_info->dimension(1));
+
+    // Execution window: steps of 4 elements along X over the output tensor
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+    Window win = calculate_max_window(*out_info, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(in_info, 0, num_elems_processed_per_iteration);
+    AccessWindowStatic     output_access(out_info, 0, 0, out_info->dimension(0), out_info->dimension(1));
+
+    update_window_and_padding(win, input_access, output_access);
+
+    // The whole output tensor shape is declared valid
+    output_access.set_valid_region(win, ValidRegion(Coordinates(), out_info->tensor_shape()));
+
+    ICLKernel::configure(win);
+}
diff --git a/src/core/CL/kernels/CLWeightsReshapeKernel.cpp b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
new file mode 100644
index 0000000000..018f272921
--- /dev/null
+++ b/src/core/CL/kernels/CLWeightsReshapeKernel.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+CLWeightsReshapeKernel::CLWeightsReshapeKernel(bool is_shared)
+ : _is_shared(is_shared), _input(nullptr), _biases(nullptr), _output(nullptr)
+{
+}
+
+/** Validate the tensors, build the "reshape_to_columns" kernel and set up its execution window.
+ *
+ * @param[in]  input  Weights tensor. Single-channel F16 or F32; at most 5D in shared mode, 4D otherwise.
+ * @param[in]  biases Optional biases tensor (may be nullptr). Single-channel F16 or F32 when given.
+ * @param[out] output Reshaped tensor. Single-channel F16 or F32; at most 3D in shared mode, 2D otherwise.
+ */
+void CLWeightsReshapeKernel::configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32);
+    if(_is_shared)
+    {
+        // In shared mode input dimension 4 must equal output dimension 2
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(4) != (output->info()->dimension(2)));
+        ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 5);
+        ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 3);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON(input->info()->num_dimensions() > 4);
+        ARM_COMPUTE_ERROR_ON(output->info()->num_dimensions() > 2);
+    }
+
+    // Check biases
+    const bool has_bias = (biases != nullptr);
+    if(has_bias)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F16, DataType::F32);
+    }
+
+    _biases = biases;
+    _output = output;
+    _input  = input;
+
+    // Create build options
+    std::set<std::string> build_opts;
+    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
+    if(has_bias)
+    {
+        // Only add the define when a bias tensor exists, so no empty option string enters the set
+        build_opts.emplace("-DHAS_BIAS");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("reshape_to_columns", build_opts));
+
+    // Set static arguments: they follow the 3D input, the 2D output and, when present, the 1D biases
+    unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+    if(has_bias)
+    {
+        idx += num_arguments_per_1D_tensor();
+    }
+    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(0));
+    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(1));
+    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(2));
+    _kernel.setArg<cl_uint>(idx++, _input->info()->dimension(3));
+
+    // Configure window
+    Window win = calculate_max_window(*input->info(), Steps());
+    // The CLWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+    ICLKernel::configure(win);
+}
+
+/** Constructor: creates the base reshape kernel with is_shared set to false. */
+CLConvolutionLayerWeightsReshapeKernel::CLConvolutionLayerWeightsReshapeKernel()
+    : CLWeightsReshapeKernel{ false }
+{
+}
+
+/** Bind the tensor arguments and enqueue the kernel once.
+ *
+ * @param[in]     window Execution window; must match the window set at configure time.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLConvolutionLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    // The output slice comes from a window built on the output tensor dimensions
+    Window dst_window;
+    dst_window.use_tensor_dimensions(_output->info());
+    Window dst_slice = dst_window.first_slice_window_2D();
+
+    // The input slice is the first 3D slice of the execution window
+    Window src_slice = window.first_slice_window_3D();
+
+    // Kernel arguments, in order: 3D input, 2D output, optional 1D biases
+    unsigned arg_idx = 0;
+    add_3D_tensor_argument(arg_idx, _input, src_slice);
+    add_2D_tensor_argument(arg_idx, _output, dst_slice);
+    if(_biases != nullptr)
+    {
+        // The biases window covers the full X extent of the biases tensor shape
+        Window bias_slice;
+        bias_slice.set(Window::DimX, Window::Dimension(0, _biases->info()->tensor_shape().x(), 1));
+        add_1D_tensor_argument(arg_idx, _biases, bias_slice);
+    }
+
+    // Single enqueue over the input slice
+    enqueue(queue, *this, src_slice);
+}
+
+/** Constructor: creates the base reshape kernel with is_shared set to true. */
+CLLocallyConnectedLayerWeightsReshapeKernel::CLLocallyConnectedLayerWeightsReshapeKernel()
+    : CLWeightsReshapeKernel{ true }
+{
+}
+
+/** Enqueue the kernel once per slice, sliding the input, output and biases slices together.
+ *
+ * @param[in]     window Execution window; must match the window set at configure time.
+ * @param[in,out] queue  Command queue the kernel is enqueued on.
+ */
+void CLLocallyConnectedLayerWeightsReshapeKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    const bool has_biases = (_biases != nullptr);
+
+    // Output slices come from a window built on the output tensor dimensions
+    Window dst_window;
+    dst_window.use_tensor_dimensions(_output->info());
+
+    Window src_slice = window.first_slice_window_3D();
+    Window dst_slice = dst_window.first_slice_window_2D();
+
+    // The biases window and slice stay default-constructed when no biases tensor was configured
+    Window bias_window;
+    Window bias_slice;
+    if(has_biases)
+    {
+        bias_window.use_tensor_dimensions(_biases->info());
+        bias_slice = bias_window.first_slice_window_1D();
+    }
+
+    // The body always runs at least once
+    bool has_next_slice = true;
+    while(has_next_slice)
+    {
+        // Kernel arguments, in order: 3D input, 2D output, optional 1D biases
+        unsigned arg_idx = 0;
+        add_3D_tensor_argument(arg_idx, _input, src_slice);
+        add_2D_tensor_argument(arg_idx, _output, dst_slice);
+        if(has_biases)
+        {
+            add_1D_tensor_argument(arg_idx, _biases, bias_slice);
+            // Move the biases slice on for the next iteration
+            bias_window.slide_window_slice_1D(bias_slice);
+        }
+
+        enqueue(queue, *this, src_slice);
+
+        // The input slice is advanced first; the output slice is advanced only if that succeeded
+        has_next_slice = window.slide_window_slice_4D(src_slice) && dst_window.slide_window_slice_2D(dst_slice);
+    }
+}