From b63b1196adea8b07dd8db77c2492a212650deba0 Mon Sep 17 00:00:00 2001
From: SiCong Li
Date: Fri, 28 Jan 2022 18:24:39 +0000
Subject: Integrate Dynamic Fusion patches

* Add public interfaces:
    * OperatorGraph: Describe a workload that could contain fused kernels
    * IWorkload: Generic interface for workloads built from OperatorGraph
    * ClWorkload: OpenCL workloads built from OperatorGraph
    * ClCompositeOperator: Runtime async operator to execute a ClWorkload
    * DependencyGraph (will likely be deprecated in later iterations)
* Add example
    * cl_fused_conv2d_elementwise_add.cpp to explain how to use the new interfaces
* Add internal translation layer
    * Refactor ClKernelBuildingAPI
    * Remove non-tile based gemm native kernel component
    * Minor interface changes
* Add integration tests

Resolves COMPMID-5161

Signed-off-by: SiCong Li
Change-Id: Ib987ed79289ab0bcbd3130d54f5793408d9f1240
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7510
Reviewed-by: Gian Marco Iodice
Reviewed-by: Gunes Bayir
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
---
 examples/SConscript                        |  11 +-
 .../cl_fused_conv2d_elementwise_add.cpp    | 386 +++++++++++++++++++++
 .../cl_ref_conv2d_elementwise_add.cpp      | 223 ++++++++++++
 3 files changed, 619 insertions(+), 1 deletion(-)
 create mode 100644 examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
 create mode 100644 examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp
(limited to 'examples')

diff --git a/examples/SConscript b/examples/SConscript
index 8ee688e76d..d456b7246c 100644
--- a/examples/SConscript
+++ b/examples/SConscript
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 Arm Limited.
+# Copyright (c) 2017-2022 Arm Limited.
 #
 # SPDX-License-Identifier: MIT
 #
@@ -95,6 +95,15 @@ if env['opencl']:
             prog = install_bin(prog)
             alias = examples_env.Alias(example, prog)
             Default(alias)
+    if env['experimental_dynamic_fusion']:
+        examples_env.Append(CPPDEFINES = ['ARM_COMPUTE_CL', 'ENABLE_EXPERIMENTAL_DYNAMIC_FUSION'])
+        for file in Glob("./dynamic_fusion/*.cpp"):
+            example = os.path.basename(os.path.splitext(str(file))[0])
+            prog = examples_env.Program(example, ["./dynamic_fusion/{}.cpp".format(example), utils], LIBS = examples_libs + arm_compute_libs)
+            Depends(prog, arm_compute_dependency)
+            prog = install_bin(prog)
+            alias = examples_env.Alias(example, prog)
+            Default(alias)
 
 if env['gemm_tuner'] and env['opencl']:
     gemm_tuner_common_options = examples_env.Object("./gemm_tuner/CommonGemmExampleOptions.cpp")
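The new examples build only when OpenCL support and the experimental flag are both enabled, per the SConscript hunk above. As a rough sketch of a build line (the os/arch values and job count are placeholders for your target; opencl, examples and experimental_dynamic_fusion are the option names the SConscript keys on):

    scons os=linux arch=armv8a opencl=1 examples=1 experimental_dynamic_fusion=1 -j8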
diff --git a/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp b/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
new file mode 100644
index 0000000000..6048024d30
--- /dev/null
+++ b/examples/dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/// @example dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp
+/// @copybrief example_dynamic_fusion_cl_conv2d_elementwise_add
+///
+/// @page example_dynamic_fusion_cl_conv2d_elementwise_add Dynamic Fusion Example: Conv2d + Elementwise Addition (OpenCL target)
+/// This example demonstrates how to fuse a Conv2d with an elementwise Addition using the new OperatorGraph API, and how to run it with the asynchronous composite operator, ClCompositeOperator
+
+#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION
+#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION"
+#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */
+#ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */
+#error "This example needs to be built with -DARM_COMPUTE_CL"
+#endif /* ARM_COMPUTE_CL */
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/ClWorkload.h"
+#include "arm_compute/core/experimental/OperatorGraph.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTuner.h"
+#include "arm_compute/runtime/experimental/ClCompositeOperator.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "utils/TypePrinter.h"
+
+#include "utils/Utils.h"
+
+#include <chrono>
+
+using namespace arm_compute;
+using namespace utils;
+using namespace arm_compute::experimental::dynamic_fusion;
+
+#define TICK(clock_name) \
+    auto clock_name##_tick = std::chrono::high_resolution_clock::now();
+#define TOCK(clock_name, measurement_map)                               \
+    auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
+    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick);
+#define TOCK_AVG(clock_name, measurement_map, num_iterations)           \
+    auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
+    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));
+
+using std::chrono::duration_cast;
+using std::chrono::microseconds;
+
+class ClFusedConv2dEltwiseAddExample : public Example
+{
+public:
+    bool do_setup(int argc, char **argv) override
+    {
+        size_t       ih;
+        size_t       iw;
+        size_t       ifm;
+        size_t       wh;
+        size_t       ww;
+        size_t       ofm;
+        size_t       tuner_choice;
+        unsigned int pad_x;
+        unsigned int pad_y;
+        if(argc < 10)
+        {
+            // Print help
+            std::cout << "Usage: ./cl_fused_conv2d_elementwise_add ih iw ifm wh ww ofm tuner_choice(0=Disable, 1=Rapid, 2=Normal, 3=Exhaustive) pad_x pad_y\n";
+            std::cout << "Too few or no arguments provided. Using shape config = SRGAN_0, tuner_choice=2\n\n";
+            ih           = 512;
+            iw           = 512;
+            ifm          = 64;
+            wh           = 1;
+            ww           = 1;
+            ofm          = 3;
+            tuner_choice = 2;
+            pad_x        = 0;
+            pad_y        = 0;
+        }
+        else
+        {
+            ih           = strtol(argv[1], nullptr, 10);
+            iw           = strtol(argv[2], nullptr, 10);
+            ifm          = strtol(argv[3], nullptr, 10);
+            wh           = strtol(argv[4], nullptr, 10);
+            ww           = strtol(argv[5], nullptr, 10);
+            ofm          = strtol(argv[6], nullptr, 10);
+            tuner_choice = strtol(argv[7], nullptr, 10);
+            pad_x        = strtol(argv[8], nullptr, 10);
+            pad_y        = strtol(argv[9], nullptr, 10);
+        }
+
+        CLTuner *tuner_to_use;
+        switch(tuner_choice)
+        {
+            case 0:
+            {
+                tuner_to_use = nullptr;
+                break;
+            }
+            case 1:
+            {
+                tuner.set_tuner_mode(CLTunerMode::RAPID);
+                tuner_to_use = &tuner;
+                break;
+            }
+            case 3:
+            {
+                tuner.set_tuner_mode(CLTunerMode::EXHAUSTIVE);
+                tuner_to_use = &tuner;
+                break;
+            }
+            case 2:
+            default:
+            {
+                tuner.set_tuner_mode(CLTunerMode::NORMAL);
+                tuner_to_use = &tuner;
+                break;
+            }
+        }
+        CLScheduler::get().default_init(tuner_to_use);
+
+        TICK(startup_time);
+        /* Computation:
+         * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias))
+         */
+        const auto data_type   = DataType::F32;
+        const auto data_layout = DataLayout::NHWC;
+
+        const auto t_input_shape     = TensorShape(ifm, iw, ih);
+        const auto t_weight_shape    = TensorShape(ifm, ww, wh, ofm);
+        const auto t_bias_shape      = TensorShape(ofm);
+        const auto t_l1_addend_shape = TensorShape(ofm, iw);
+
+        std::cout << "input_shape: " << t_input_shape << std::endl;
+        std::cout << "weight_shape: " << t_weight_shape << std::endl;
+        std::cout << "bias_shape: " << t_bias_shape << std::endl;
+        std::cout << "addend_shape: " << t_l1_addend_shape << std::endl;
+
+        /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// @section describe_workload_using_operator_graph Describe the workload to run using OperatorGraph
+        /// OperatorGraph is a graph of Tensors and Operators. Let's first default-construct it
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct OperatorGraph
+        // [Construct OperatorGraph]
+        OperatorGraph op_graph;
+        // [Construct OperatorGraph]
+
+        /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// @subsection add_conv2d Add the first operator (root operator) Conv2d
+        /// The first operator to be added to the graph is called the "root operator" of the entire graph.
+        /// @note As of now, operators need to be inserted in their dependency order, because output tensor auto-initialization occurs at graph construction time.
+        /// Later this might be changed to allow out-of-order insertion.
+
+        /// Before we insert the operator, we need to initialize the required TensorInfo objects.
+        /// We can choose not to initialize an output TensorInfo; in that case it will be auto-initialized during the construction of the OperatorGraph.
+        /// Here t_acc_info is the TensorInfo of the accumulator tensor, which is the output tensor of our first operator, conv2d
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize Conv2d TensorInfo
+        // [Initialize Conv2d TensorInfo]
+        auto t_input_info  = TensorInfo(t_input_shape, 1, data_type, data_layout);
+        auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+        auto t_bias_info   = TensorInfo(t_bias_shape, 1, data_type, data_layout);
+        auto t_acc_info    = TensorInfo();
+        // [Initialize Conv2d TensorInfo]
+
+        /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// Next we associate the TensorInfo objects with the OpTensor s created in the op_graph.
+        /// @note The associated TensorInfo objects must be in scope and remain valid until the ClWorkload building is completed
+
+        /// @note The associated TensorInfo objects must be declared as non-const, since they may be updated during the OperatorGraph construction
+
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add OpTensors
+        // [Add OpTensors]
+        const auto op_t_input  = add_tensor(op_graph, t_input_info);
+        const auto op_t_weight = add_tensor(op_graph, t_weight_info);
+        const auto op_t_bias   = add_tensor(op_graph, t_bias_info);
+        const auto op_t_acc    = add_tensor(op_graph, t_acc_info);
+        // [Add OpTensors]
+
+        /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// Finally we add the Conv2d operator to op_graph. The Conv2dDescriptor contains all the TOSA-compliant attribute parameters.
+        /// The add_op... family of functions accepts the OpTensors created by the add_tensor function, and returns an Operator handle.
+        /// This handle can be used to further query and modify the operator inside the OperatorGraph after its creation.
+        /// For example, here we use the handle to force the ConvolutionMethod to be Direct Convolution.
+        /// @note force_conv2d_method is for debug purposes only for now, as the end user is not expected to decide on the ConvolutionMethod
+
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add Conv2d Operator
+        // [Add Conv2d Operator]
+        Conv2dDescriptor conv2d_desc{ Padding2D{ pad_x, pad_x, pad_y, pad_y } };
+        auto             conv2d = add_op_conv2d(op_graph, conv2d_desc, op_t_input, op_t_weight, op_t_bias, op_t_acc);
+        force_conv2d_method(op_graph, conv2d, ConvolutionMethod::DIRECT); // Only for debug purposes
+        // [Add Conv2d Operator]
+
+        /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// @subsection add_elementwise_add Add the second operator Elementwise Add
+        /// This is similar to adding the first operator to op_graph, except that we link the two operators together through their common tensor:
+        /// the accumulator tensor op_t_acc, which is the output of conv2d and the input (lhs) of the addition
+        /// @note At the moment, it is recommended to always declare a separate TensorInfo (even if empty) for each OpTensor.
+        /// For example, here op_t_dst could share op_t_acc's info as they are the same,
+        /// but we still recommend creating a separate object.
+
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Add Elementwise Add Operator
+        // [Add Elementwise Add Operator]
+        auto          t_l1_addend_info = TensorInfo(t_l1_addend_shape, 1, data_type, data_layout);
+        auto          t_dst_info       = TensorInfo();
+        const auto    op_t_l1_addend   = add_tensor(op_graph, t_l1_addend_info);
+        const auto    op_t_dst         = add_tensor(op_graph, t_dst_info);
+        AddDescriptor add_desc{};
+        add_op_elementwise_add(op_graph, add_desc, op_t_acc, op_t_l1_addend, op_t_dst);
+        // [Add Elementwise Add Operator]
+
+        /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// @section build_clworkload Build ClWorkload
+        /// ClWorkload is an intermediate object which contains all the built kernel code and all the other descriptors of how to schedule it.
+        /// We build the ClWorkload from the op_graph object that we have just described
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Build ClWorkload
+        // [Build ClWorkload]
+        const ClWorkloadContext workload_ctx
+        {
+            GpuInfo{ CLScheduler::get().target() }
+        };
+        ClWorkload workload;
+        build(workload, op_graph, workload_ctx);
+        // [Build ClWorkload]
+
+        /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// @section run_fused_op_with_clcompositeoperator Run the fused operator workload with ClCompositeOperator
+        /// @subsection configure_and_validate_clcompositeoperator Validate ClWorkload and Configure ClCompositeOperator
+        /// After the ClWorkload is built, we configure it with the Compute Library runtime ClCompositeOperator in order to run it.
+        /// Optionally we can explicitly validate the workload to check that it was built successfully.
+        /// Validation is also run automatically inside configure and throws if it fails.
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct ClCompositeOperator
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Validate and configure ClCompositeOperator
+        // [Validate and configure ClCompositeOperator]
+        const auto success = ClCompositeOperator::validate(workload); // Optional
+        op.configure(CLKernelLibrary::get().get_compile_context(), workload);
+        // [Validate and configure ClCompositeOperator]
+
+        /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// @subsection run_clcompositeoperator Run ClCompositeOperator
+        /// Construct the runtime CLTensor s with backing memory
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct CLTensor objects
+
+        /// Initialize, allocate and fill the CLTensor objects
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize, Allocate and Fill CLTensor objects
+        // [Initialize, Allocate and Fill CLTensor objects]
+        t_input.allocator()->init(t_input_info);
+        t_weight.allocator()->init(t_weight_info);
+        t_bias.allocator()->init(t_bias_info);
+        t_l1_addend.allocator()->init(t_l1_addend_info);
+        t_dst.allocator()->init(t_dst_info);
+
+        t_input.allocator()->allocate();
+        t_weight.allocator()->allocate();
+        t_bias.allocator()->allocate();
+        t_l1_addend.allocator()->allocate();
+        t_dst.allocator()->allocate();
+
+        fill_random_tensor(t_input, -1.f, 1.f);
+        fill_random_tensor(t_weight, -1.f, 1.f);
+        fill_random_tensor(t_l1_addend, -1.f, 1.f);
+        // [Initialize, Allocate and Fill CLTensor objects]
+
+        /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// The OpTensorBinding creates a mapping from the OpTensor handles that we created earlier to the real CLTensors
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Create OpTensorBinding
+        // [Create OpTensorBinding]
+        OpTensorBinding op_tensors({ { op_t_input, &t_input },
+                                     { op_t_weight, &t_weight },
+                                     { op_t_bias, &t_bias },
+                                     { op_t_l1_addend, &t_l1_addend },
+                                     { op_t_dst, &t_dst }
+                                   });
+        // [Create OpTensorBinding]
+
+        /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// Bind the CLTensor objects to the prepare_pack_map and run_pack_map, which are used to prepare and run the op.
+        /// This step additionally creates empty auxiliary CLTensor objects (if there are any), and stores them inside the ClAuxTensorData aux_tensor_data
+        /// @note This step associates all the CLTensors contained in op_tensors and aux_tensor_data with prepare_pack_map and run_pack_map.
+        /// Make sure these CLTensors remain valid for as long as the two pack maps are still in use
+
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct ClAuxTensorData
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Construct TensorPackMaps
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Bind Tensors
+        // [Bind Tensors]
+        bind_tensors(aux_tensor_data, prepare_pack_map, run_pack_map, workload, op_tensors);
+        // [Bind Tensors]
+
+        /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// Initialize and allocate the auxiliary CLTensor objects.
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Initialize and Allocate Auxiliary CLTensor objects
+        // [Initialize and Allocate Auxiliary CLTensor objects]
+        for(auto &tensor_data : aux_tensor_data.get_tensors())
+        {
+            tensor_data.tensor->allocator()->init(tensor_data.tensor_info);
+            tensor_data.tensor->allocator()->allocate();
+        }
+        // [Initialize and Allocate Auxiliary CLTensor objects]
+
+        /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// Run the ClCompositeOperator prepare job. This performs any jobs that are required for the first run, such as
+        /// reshaping tensors into a more performant layout.
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Prepare ClCompositeOperator
+        // [Prepare ClCompositeOperator]
+        op.prepare(prepare_pack_map);
+        // [Prepare ClCompositeOperator]
+
+        /// @page example_dynamic_fusion_cl_conv2d_elementwise_add
+        /// Finally, we run our operator
+        /// @snippet dynamic_fusion/cl_fused_conv2d_elementwise_add.cpp Run ClCompositeOperator
+        // [Run ClCompositeOperator]
+        op.run(run_pack_map);
+        // [Run ClCompositeOperator]
+        TOCK(startup_time, measurements);
+        return true;
+    }
+    void do_run() override
+    {
+        // Run the fused op
+        op.run(run_pack_map);
+
+        // Make sure all the OpenCL jobs are done executing:
+        CLScheduler::get().sync();
+    }
+
+    void do_teardown() override
+    {
+        for(auto m : measurements)
+        {
+            std::cout << m.first << ": " << m.second.count() << "us" << std::endl;
+        }
+    }
+
+private:
+    // [Construct CLTensor objects]
+    CLTensor t_input{};
+    CLTensor t_weight{};
+    CLTensor t_bias{};
+    CLTensor t_l1_addend{};
+    CLTensor t_dst{};
+    // [Construct CLTensor objects]
+    // [Construct ClAuxTensorData]
+    ClAuxTensorData aux_tensor_data{};
+    // [Construct ClAuxTensorData]
+    // [Construct TensorPackMaps]
+    TensorPackMap prepare_pack_map{};
+    TensorPackMap run_pack_map{};
+    // [Construct TensorPackMaps]
+    // [Construct ClCompositeOperator]
+    ClCompositeOperator op{};
+    // [Construct ClCompositeOperator]
+    CLTuner tuner{};
+    std::map<std::string, std::chrono::microseconds> measurements{};
+};
+
+/** Main program for the fused Conv2d + elementwise addition example
+ *
+ * @param[in] argc Number of arguments
+ * @param[in] argv Arguments ( [optional] ih, iw, ifm, wh, ww, ofm, tuner_choice, pad_x, pad_y )
+ */
+int main(int argc, char **argv)
+{
+    return utils::run_example<ClFusedConv2dEltwiseAddExample>(argc, argv);
+}
+
+#undef TICK
+#undef TOCK
+#undef TOCK_AVG
\ No newline at end of file
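Once built, the fused example takes nine positional arguments, matching the usage string above. An illustrative invocation that reproduces the built-in defaults (the binary location under build/examples/ is an assumption of this sketch; adjust for your build layout):

    LD_LIBRARY_PATH=build ./build/examples/cl_fused_conv2d_elementwise_add 512 512 64 1 1 3 2 0 0

The reference (unfused) example below accepts the same arguments, so the two binaries can be timed against each other on identical shapes and tuner settings.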
diff --git a/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp b/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp
new file mode 100644
index 0000000000..4f68372b49
--- /dev/null
+++ b/examples/dynamic_fusion/cl_ref_conv2d_elementwise_add.cpp
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */
+#error "This example needs to be built with -DARM_COMPUTE_CL"
+#endif /* ARM_COMPUTE_CL */
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTuner.h"
+#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "utils/TypePrinter.h"
+#include "utils/Utils.h"
+
+#include <chrono>
+
+using namespace arm_compute;
+using namespace utils;
+
+#define TICK(clock_name) \
+    auto clock_name##_tick = std::chrono::high_resolution_clock::now();
+#define TOCK(clock_name, measurement_map)                               \
+    auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
+    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick);
+#define TOCK_AVG(clock_name, measurement_map, num_iterations)           \
+    auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \
+    measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations));
+
+using std::chrono::duration_cast;
+using std::chrono::microseconds;
+
+class ClRefConv2dEltwiseAddExample : public Example
+{
+public:
+    bool do_setup(int argc, char **argv) override
+    {
+        size_t       ih;
+        size_t       iw;
+        size_t       ifm;
+        size_t       wh;
+        size_t       ww;
+        size_t       ofm;
+        size_t       tuner_choice;
+        unsigned int pad_x;
+        unsigned int pad_y;
+        if(argc < 10)
+        {
+            // Print help
+            std::cout << "Usage: ./cl_ref_conv2d_elementwise_add ih iw ifm wh ww ofm tuner_choice(0=Disable, 1=Rapid, 2=Normal, 3=Exhaustive) pad_x pad_y\n";
+            std::cout << "Too few or no arguments provided. Using shape config = SRGAN_0, tuner_choice=2\n\n";
+            ih           = 512;
+            iw           = 512;
+            ifm          = 64;
+            wh           = 1;
+            ww           = 1;
+            ofm          = 3;
+            tuner_choice = 2;
+            pad_x        = 0;
+            pad_y        = 0;
+        }
+        else
+        {
+            ih           = strtol(argv[1], nullptr, 10);
+            iw           = strtol(argv[2], nullptr, 10);
+            ifm          = strtol(argv[3], nullptr, 10);
+            wh           = strtol(argv[4], nullptr, 10);
+            ww           = strtol(argv[5], nullptr, 10);
+            ofm          = strtol(argv[6], nullptr, 10);
+            tuner_choice = strtol(argv[7], nullptr, 10);
+            pad_x        = strtol(argv[8], nullptr, 10);
+            pad_y        = strtol(argv[9], nullptr, 10);
+        }
+
+        CLTuner *tuner_to_use;
+        switch(tuner_choice)
+        {
+            case 0:
+            {
+                tuner_to_use = nullptr;
+                break;
+            }
+            case 1:
+            {
+                tuner.set_tuner_mode(CLTunerMode::RAPID);
+                tuner_to_use = &tuner;
+                break;
+            }
+            case 3:
+            {
+                tuner.set_tuner_mode(CLTunerMode::EXHAUSTIVE);
+                tuner_to_use = &tuner;
+                break;
+            }
+            case 2:
+            default:
+            {
+                tuner.set_tuner_mode(CLTunerMode::NORMAL);
+                tuner_to_use = &tuner;
+                break;
+            }
+        }
+
+        CLScheduler::get().default_init(tuner_to_use);
+
+        TICK(startup_time);
+
+        /* Computation:
+         * out = add_desc(addend, conv2d1x1(direct_conv)(input, weights, bias))
+         */
+        const auto          data_type   = DataType::F32;
+        const auto          data_layout = DataLayout::NHWC;
+        const PadStrideInfo conv_info{ 1, 1, pad_x, pad_y };
+        // const auto t_input_shape  = TensorShape(384, 12, 12);
+        // const auto t_weight_shape = TensorShape(384, 1, 1, 64);
+        // const auto t_dst_shape    = TensorShape(64, 12, 12);
+        const auto t_input_shape  = TensorShape(ifm, iw, ih);
+        const auto t_weight_shape = TensorShape(ifm, ww, wh, ofm);
+        const auto t_dst_shape    = misc::shape_calculator::compute_deep_convolution_shape(t_input_shape, data_layout, t_weight_shape, conv_info);
+        std::cout << "input_shape: " << t_input_shape << std::endl;
+        std::cout << "weight_shape: " << t_weight_shape << std::endl;
+        std::cout << "dst_shape: " << t_dst_shape << std::endl;
+        auto t_input_info     = TensorInfo(t_input_shape, 1, data_type, data_layout);
+        auto t_weight_info    = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+        auto t_l0_dst_info    = TensorInfo(t_dst_shape, 1, data_type, data_layout); // Intermediate tensor between the two operators
+        auto t_l1_addend_info = TensorInfo(t_dst_shape, 1, data_type, data_layout);
+        auto t_dst_info       = TensorInfo(t_dst_shape, 1, data_type, data_layout);
+
+        // Init tensors
+        {
+            t_input.allocator()->init(t_input_info);
+            t_weight.allocator()->init(t_weight_info);
+            t_l1_addend.allocator()->init(t_l1_addend_info);
+            t_l0_dst.allocator()->init(t_l0_dst_info);
+            t_dst.allocator()->init(t_dst_info);
+        }
+
+        op0.configure(&t_input, &t_weight, nullptr, &t_l0_dst, conv_info);
+        op1.configure(&t_l0_dst, &t_l1_addend, &t_dst, ConvertPolicy{});
+
+        // Allocate and fill tensors
+        {
+            t_input.allocator()->allocate();
+            t_weight.allocator()->allocate();
+            t_l1_addend.allocator()->allocate();
+            t_l0_dst.allocator()->allocate();
+            t_dst.allocator()->allocate();
+            fill_random_tensor(t_input, -1.f, 1.f);
+            fill_random_tensor(t_weight, -1.f, 1.f);
+            fill_random_tensor(t_l1_addend, -1.f, 1.f);
+        }
+        // Dummy run for CLTuner
+        op0.run();
+        op1.run();
+        TOCK(startup_time, measurements);
+        return true;
+    }
+    void do_run() override
+    {
+        // Run the reference (unfused) ops
+        op0.run();
+        op1.run();
+
+        // Make sure all the OpenCL jobs are done executing:
+        CLScheduler::get().sync();
+    }
+
+    void do_teardown() override
+    {
+        for(auto m : measurements)
+        {
+            std::cout << m.first << ": " << m.second.count() << "us" << std::endl;
+        }
+    }
+
+private:
+    CLTensor t_input{};
+    CLTensor t_weight{};
+    CLTensor t_l1_addend{};
+    CLTensor t_l0_dst{};
+    CLTensor t_dst{};
+    CLDirectConvolutionLayer op0{};
+    CLArithmeticAddition     op1{};
+    CLTuner tuner{};
+    std::map<std::string, std::chrono::microseconds> measurements{};
+};
+
+/** Main program for the reference (unfused) Conv2d + elementwise addition example
+ *
+ * @param[in] argc Number of arguments
+ * @param[in] argv Arguments ( [optional] ih, iw, ifm, wh, ww, ofm, tuner_choice, pad_x, pad_y )
+ */
+int main(int argc, char **argv)
+{
+    return utils::run_example<ClRefConv2dEltwiseAddExample>(argc, argv);
+}
+
+#undef TICK
+#undef TOCK
+#undef TOCK_AVG
\ No newline at end of file
--
cgit v1.2.1
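For quick reference, the fused example above condenses to the call sequence sketched below. Every type and free function used here (OperatorGraph, add_tensor, add_op_conv2d, add_op_elementwise_add, build, ClCompositeOperator, bind_tensors) appears in the patch; the wrapper function, its parameters and the default-constructed descriptors are assumptions made for brevity, and explicit validation plus user-tensor allocation are elided:

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/experimental/ClWorkload.h"
    #include "arm_compute/core/experimental/OperatorGraph.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/experimental/ClCompositeOperator.h"

    using namespace arm_compute;
    using namespace arm_compute::experimental::dynamic_fusion;

    // Hypothetical helper: builds and runs conv2d + elementwise add as one fused workload.
    // All TensorInfo arguments must stay alive and non-const until build() completes.
    void run_fused_conv2d_add(TensorInfo &input, TensorInfo &weight, TensorInfo &bias, TensorInfo &addend,
                              CLTensor &t_input, CLTensor &t_weight, CLTensor &t_bias, CLTensor &t_addend, CLTensor &t_dst)
    {
        // 1. Describe the workload as a graph of OpTensors and Operators
        OperatorGraph graph;
        TensorInfo    acc_info{}, dst_info{}; // left empty: auto-initialized during graph construction
        const auto    op_input  = add_tensor(graph, input);
        const auto    op_weight = add_tensor(graph, weight);
        const auto    op_bias   = add_tensor(graph, bias);
        const auto    op_acc    = add_tensor(graph, acc_info);
        const auto    op_addend = add_tensor(graph, addend);
        const auto    op_dst    = add_tensor(graph, dst_info);
        Conv2dDescriptor conv2d_desc{ Padding2D{} }; // zero padding assumed
        AddDescriptor    add_desc{};
        add_op_conv2d(graph, conv2d_desc, op_input, op_weight, op_bias, op_acc);
        add_op_elementwise_add(graph, add_desc, op_acc, op_addend, op_dst);

        // 2. Build the OpenCL workload: fused kernel code plus scheduling descriptors
        const ClWorkloadContext ctx{ GpuInfo{ CLScheduler::get().target() } };
        ClWorkload              workload;
        build(workload, graph, ctx);

        // 3. Configure the runtime operator and bind real CLTensors to the OpTensor handles
        ClCompositeOperator op;
        op.configure(CLKernelLibrary::get().get_compile_context(), workload);
        OpTensorBinding binding({ { op_input, &t_input }, { op_weight, &t_weight }, { op_bias, &t_bias }, { op_addend, &t_addend }, { op_dst, &t_dst } });
        ClAuxTensorData aux;
        TensorPackMap   prepare_map, run_map;
        bind_tensors(aux, prepare_map, run_map, workload, binding);
        for(auto &aux_data : aux.get_tensors()) // allocate any auxiliary tensors the workload needs
        {
            aux_data.tensor->allocator()->init(aux_data.tensor_info);
            aux_data.tensor->allocator()->allocate();
        }

        // 4. One-off preparation, then as many runs as needed
        op.prepare(prepare_map);
        op.run(run_map);
    }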