author     SiCong Li <sicong.li@arm.com>  2022-08-29 18:25:51 +0100
committer  SiCong Li <sicong.li@arm.com>  2022-11-01 10:38:21 +0000
commit     f44bbc5c697de841dce97c0f2fa39bae391a8174 (patch)
tree       56468ef833726318e545043f4abcd16ad3775094 /tests
parent     3394f3e3df7fd2d924c41822a8564493fc06473a (diff)
download   ComputeLibrary-f44bbc5c697de841dce97c0f2fa39bae391a8174.tar.gz
Rewrite dynamic fusion
The new version introduces the following major changes:

* Change the public interface to simplify and standardize the user experience
  - Use the term "Workload" uniformly
  - Simplify the operator interface to a set of static methods: validate_op(), create_op()
* Separate the kernel writing into its own component (template_writer). This allows the co-development of GpuKernelWriter, and easy replacement once GpuKernelWriter is mature.
* Optimize the core fusion algorithm used by the component graph. The details can be found in GpuKernelComponentGraph::fuse().
* Use Gpu instead of Cl prefixes for most of the Workload interfaces (except for runtime and kernel components, which have to be language specific). This allows potential extension to other Gpu languages in the future.
* Refactor the runtime memory interface so that auxiliary tensor handling is separate from user tensor passing. This is because the former is less stable and may require extension in the future.
* Hide the source code object from the user, as it is not required at the moment.
* Deprecate the old prototype entirely by disabling it in the SCons build.

Resolves COMPMID-5510, COMPMID-5512, COMPMID-5513

Change-Id: If69d2362856f2de4503546b7b6cf48a525cf3079
Signed-off-by: SiCong Li <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8406
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
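For illustration, below is a minimal sketch of the new workload flow summarised above, assembled from the Integration test added by this patch: describe the workload on a GpuWorkloadSketch, validate and fuse operators through the static validate_op()/create_op() methods, then configure a ClWorkloadRuntime, allocate its auxiliary tensors separately from the user tensors, and run. The wrapper function run_fused_conv2d and the CLTensor include are illustrative only; everything else mirrors the test code added in this change.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

void run_fused_conv2d()
{
    CLScheduler::get().default_reinit();

    // 1. Describe the workload on a sketch
    auto              cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    auto              gpu_ctx        = GpuWorkloadContext{ &cl_compile_ctx };
    GpuWorkloadSketch sketch{ &gpu_ctx };

    Conv2dAttributes conv2d_attr{};
    auto input_info  = sketch.create_tensor_info(TensorShape(384, 12, 12), 1, DataType::F32, DataLayout::NHWC);
    auto weight_info = sketch.create_tensor_info(TensorInfo(TensorShape(384, 1, 1, 16), 1, DataType::F32, DataLayout::NHWC));
    auto dst_info    = sketch.create_tensor_info();

    // 2. Validate the operator, then fuse it into the sketch
    if(bool(GpuConv2d::validate_op(sketch, &input_info, &weight_info, nullptr, &dst_info, conv2d_attr)))
    {
        GpuConv2d::create_op(sketch, &input_info, &weight_info, nullptr, &dst_info, conv2d_attr);
    }

    // 3. Configure a runtime from the sketch and allocate its auxiliary tensors
    ClWorkloadRuntime runtime;
    runtime.configure(sketch);
    for(auto &data : runtime.get_auxiliary_tensors())
    {
        CLTensor     *tensor      = data.first;
        AuxMemoryInfo aux_mem_req = data.second;
        tensor->allocator()->init(*tensor->info(), aux_mem_req.alignment);
        tensor->allocator()->allocate(); // or import user-provided memory instead
    }

    // 4. Bind the user tensors and run the workload
    CLTensor t_input{}, t_weight{}, t_dst{};
    t_input.allocator()->init(input_info);
    t_weight.allocator()->init(weight_info);
    t_dst.allocator()->init(dst_info);
    t_input.allocator()->allocate();
    t_weight.allocator()->allocate();
    t_dst.allocator()->allocate();
    runtime.run({ &t_input, &t_weight, &t_dst });
}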
Diffstat (limited to 'tests')
-rw-r--r--  tests/SConscript                                      15
-rw-r--r--  tests/validation/dynamic_fusion/gpu/Integration.cpp  201
2 files changed, 202 insertions, 14 deletions
diff --git a/tests/SConscript b/tests/SConscript
index b848f27043..95ecd27afa 100644
--- a/tests/SConscript
+++ b/tests/SConscript
@@ -121,7 +121,7 @@ files_validation += Glob('validation/CPP/' + filter_pattern)
if env['opencl']:
if env['experimental_dynamic_fusion']:
test_env.Append(CPPDEFINES = ['ENABLE_EXPERIMENTAL_DYNAMIC_FUSION'])
- files_validation += Glob('validation/CL/UNIT/dynamic_fusion/' + filter_pattern)
+ files_validation += Glob('validation/dynamic_fusion/gpu/' + filter_pattern)
filter_pattern = test_env['test_filter']
@@ -291,19 +291,6 @@ if test_env['benchmark_examples']:
prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"] + ["arm_compute_graph"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined'])
arm_compute_benchmark_examples += [ prog ]
- # Dynamic fusion examples
- if env['opencl']:
- if env['experimental_dynamic_fusion']:
- for file in Glob("%s/dynamic_fusion/*.cpp" % examples_folder):
- example = "benchmark_" + os.path.basename(os.path.splitext(str(file))[0])
- if env['os'] in ['android', 'macos', 'bare_metal'] or env['standalone']:
- prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"], LINKFLAGS=test_env["LINKFLAGS"]+[load_whole_archive, arm_compute_lib, noload_whole_archive] + bm_link_flags + extra_link_flags)
- arm_compute_benchmark_examples += [ prog ]
- else:
- #-Wl,--allow-shlib-undefined: Ignore dependencies of dependencies
- prog = test_env.Program(example, [ test_env.Object(source=file, target=example), graph_utils, graph_params]+ files_benchmark_examples, LIBS = test_env["LIBS"] + ["arm_compute_graph"], LINKFLAGS=test_env["LINKFLAGS"]+['-Wl,--allow-shlib-undefined'])
- arm_compute_benchmark_examples += [ prog ]
-
arm_compute_benchmark_examples = install_bin(arm_compute_benchmark_examples)
Depends(arm_compute_benchmark_examples, arm_compute_test_framework)
Depends(arm_compute_benchmark_examples, arm_compute_lib)
diff --git a/tests/validation/dynamic_fusion/gpu/Integration.cpp b/tests/validation/dynamic_fusion/gpu/Integration.cpp
new file mode 100644
index 0000000000..87720a629d
--- /dev/null
+++ b/tests/validation/dynamic_fusion/gpu/Integration.cpp
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
+#include "arm_compute/dynamic_fusion/sketch/OperatorAttributes.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/gpu/cl/operators/ClAdd.h"
+#include "src/gpu/cl/operators/ClConv2d.h"
+
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/reference/ConvolutionLayer.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
+#include "tests/validation/reference/Permute.h"
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+#include "tests/SimpleTensorPrinter.h"
+#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
+
+using namespace arm_compute::experimental::dynamic_fusion;
+using namespace arm_compute::test::validation::utils;
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(INTEGRATION)
+TEST_SUITE(DYNAMIC_FUSION)
+TEST_CASE(Conv2d, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out = conv2d1x1(direct_conv)(input, weights, bias)
+ */
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+ const auto t_input_shape = TensorShape(384, 12, 12);
+ const auto t_weight_shape = TensorShape(384, 1, 1, 16);
+ const auto t_dst_shape = TensorShape(16, 12, 12);
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &gpu_ctx };
+
+ // Fuse conv2d
+ Conv2dAttributes conv2d_attr{};
+ auto input_info = sketch.create_tensor_info(t_input_shape, 1, data_type, data_layout);
+ auto weight_info = sketch.create_tensor_info(TensorInfo(t_weight_shape, 1, data_type, data_layout));
+ auto dst_info = sketch.create_tensor_info();
+ GpuConv2d::create_op(sketch, &input_info, &weight_info, nullptr, &dst_info, conv2d_attr);
+
+ // Configure runtime
+ ClWorkloadRuntime runtime;
+ runtime.configure(sketch);
+
+ // (Important) Allocate auxiliary tensor memory if there are any
+ // Instead of using ACL allocated memory, the user can choose to import memory into the tensors
+ for(auto &data : runtime.get_auxiliary_tensors())
+ {
+ CLTensor *tensor = data.first;
+ AuxMemoryInfo aux_mem_req = data.second;
+ tensor->allocator()->init(*data.first->info(), aux_mem_req.alignment);
+ tensor->allocator()->allocate(); // Use ACL allocated memory
+ // auto buf = cl::Buffer();
+ // tensor->allocator()->import_memory(buf); // Or, import external memory
+ }
+
+ // Construct user tensors
+ CLTensor t_input{};
+ CLTensor t_weight{};
+ CLTensor t_dst{};
+
+ // Initialize user tensors
+ t_input.allocator()->init(input_info);
+ t_weight.allocator()->init(weight_info);
+ t_dst.allocator()->init(dst_info);
+
+ // Allocate and fill user tensors
+ // Instead of using ACL allocator, the user can choose to import memory into the tensors
+ t_input.allocator()->allocate();
+ t_weight.allocator()->allocate();
+ t_dst.allocator()->allocate();
+ fill<float>(CLAccessor(t_input), 0, library.get());
+ fill<float>(CLAccessor(t_weight), 1, library.get());
+
+ // Run runtime
+ runtime.run({ &t_input, &t_weight, &t_dst });
+
+ // Create reference
+ SimpleTensor<float> ref_t_input{ t_input_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+ SimpleTensor<float> ref_t_weight{ t_weight_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+ SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC };
+
+ // Fill reference
+ fill<float>(ref_t_input, 0, library.get());
+ fill<float>(ref_t_weight, 1, library.get());
+
+ auto ref_t_input_nchw = reference::permute(ref_t_input, PermutationVector(1U, 2U, 0U));
+ auto ref_t_weight_nchw = reference::permute(ref_t_weight, PermutationVector(1U, 2U, 0U));
+ auto ref_t_bias_placeholder_nchw = reference::permute(ref_t_bias_placeholder, PermutationVector(1U, 2U, 0U));
+ auto t_dst_shape_nchw = t_dst_shape;
+ permute(t_dst_shape_nchw, PermutationVector(1U, 2U, 0U));
+
+ PadStrideInfo legacy_pad_stride(conv2d_attr.stride().x(), conv2d_attr.stride().y(), conv2d_attr.pad().left, conv2d_attr.pad().right, conv2d_attr.pad().top, conv2d_attr.pad().bottom,
+ DimensionRoundingType{});
+ auto ref_t_dst_nchw = reference::convolution_layer(ref_t_input_nchw, ref_t_weight_nchw, ref_t_bias_placeholder_nchw, t_dst_shape_nchw, legacy_pad_stride, conv2d_attr.dilation());
+ const auto ref_t_dst = reference::permute(ref_t_dst_nchw, PermutationVector(2U, 0U, 1U));
+
+ RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */
+ validate(CLAccessor(t_dst), ref_t_dst_nchw, tolerance_f32);
+}
+TEST_SUITE(Invalid_Fusion_Should_Fail)
+TEST_CASE(Multiple_Complex_Ops_0, framework::DatasetMode::ALL)
+{
+ /* Computation:
+ * out = conv2d(conv2d(l0_input, l0_weight), l1_weight)
+ */
+ CLScheduler::get().default_reinit();
+
+ const auto data_type = DataType::F32;
+ const auto data_layout = DataLayout::NHWC;
+ const auto t_input_shape = TensorShape(384, 12, 12);
+ const auto t_weight_shape = TensorShape(384, 1, 1, 16);
+ const auto t_dst_shape = TensorShape(16, 12, 12);
+ auto t_input_info = TensorInfo(t_input_shape, 1, data_type, data_layout);
+ auto t_weight_info = TensorInfo(t_weight_shape, 1, data_type, data_layout);
+ auto t_dst_info = TensorInfo();
+
+ Conv2dAttributes conv2d_attr{};
+
+ // Create a new workload sketch
+ auto cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
+ auto gpu_ctx = GpuWorkloadContext{ &cl_compile_ctx };
+ GpuWorkloadSketch sketch{ &gpu_ctx };
+
+ // Create tensor infos
+ auto input_info = sketch.create_tensor_info(t_input_shape, 1, data_type, data_layout);
+ auto weight_info = sketch.create_tensor_info(TensorInfo(t_weight_shape, 1, data_type, data_layout));
+ auto dst_info = sketch.create_tensor_info();
+
+ // Fuse conv2d into the workload
+ {
+ // Validate operator
+ const auto success = GpuConv2d::validate_op(sketch, &input_info, &weight_info, nullptr, &dst_info, conv2d_attr);
+ ARM_COMPUTE_EXPECT(bool(success), framework::LogLevel::ERRORS);
+
+ GpuConv2d::create_op(sketch, &input_info, &weight_info, nullptr, &dst_info, conv2d_attr);
+ }
+
+ // Create tensor infos
+ auto weight_info_2 = sketch.create_tensor_info(t_weight_info);
+ auto dst_info_2 = sketch.create_tensor_info();
+
+ // Fuse conv2d into the workload
+ {
+ // Validate operator, should fail
+ const auto success = GpuConv2d::validate_op(sketch, &dst_info, &weight_info_2, nullptr, &dst_info_2, conv2d_attr);
+ ARM_COMPUTE_EXPECT(!bool(success), framework::LogLevel::ERRORS);
+ }
+}
+TEST_SUITE_END() // Invalid_Fusion_Should_Fail
+TEST_SUITE_END() // DYNAMIC_FUSION
+TEST_SUITE_END() // INTEGRATION
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute