diff options
author | SiCong Li <sicong.li@arm.com> | 2022-01-28 18:24:39 +0000 |
---|---|---|
committer | SiCong Li <sicong.li@arm.com> | 2022-05-06 15:01:45 +0000 |
commit | b63b1196adea8b07dd8db77c2492a212650deba0 (patch) | |
tree | b264035197873f56c69784bec68cad7041b5d423 /tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp | |
parent | 3bb72b69566f18ad5c9446d318d2fc2b5f6dba42 (diff) | |
download | ComputeLibrary-b63b1196adea8b07dd8db77c2492a212650deba0.tar.gz |
Integrate Dynamic Fusion patches
* Add public interfaces:
* OperatorGraph: Describe a workload that could contain fused kernels
* IWorkload: Generic interface for workloads built from OperatorGraph
* ClWorkload: OpenCL workloads built from OperatorGraph
* ClCompositeOperator: Runtime async operator to execute a ClWorkload
* DependencyGraph (will likely be deprecated in later iterations)
* Add example
* cl_fused_conv2d_elementwise_add.cpp to explain how to use the new
interfaces
* Add internal translation layer
* Refactor ClKernelBuildingAPI
* Remove non-tile based gemm native kernel component
* Minor interface changes
* Add integration tests
Resolves COMPMID-5161
Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: Ib987ed79289ab0bcbd3130d54f5793408d9f1240
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7510
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp')
-rw-r--r-- | tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp | 515 |
1 files changed, 52 insertions, 463 deletions
diff --git a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp index 9e1b4d897b..a6b09ccdea 100644 --- a/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp +++ b/tests/validation/CL/UNIT/dynamic_fusion/ClCompositeKernel.cpp @@ -21,9 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#if defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION) +#ifndef ENABLE_EXPERIMENTAL_DYNAMIC_FUSION +#error "This experimental feature must be enabled with -DENABLE_EXPERIMENTAL_DYNAMIC_FUSION" +#endif /* ENABLE_EXPERIMENTAL_DYNAMIC_FUSION */ #include "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.h" +#include "src/core/experimental/dynamic_fusion/ClKernelBuildingAPI.h" #include "src/core/utils/helpers/float_ops.h" #include "src/gpu/cl/kernels/ClElementwiseKernel.h" @@ -42,9 +45,12 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "tests/validation/CL/UNIT/dynamic_fusion/Utils.h" + #include <chrono> using namespace arm_compute::experimental::dynamic_fusion; +using namespace arm_compute::test::validation::utils; namespace arm_compute { @@ -52,149 +58,12 @@ namespace test { namespace validation { -namespace -{ -/** Macros which measures the wall clock time, and records it into a map measurement_map with name clock_name */ -#define TICK(clock_name) \ - auto clock_name##_tick = std::chrono::high_resolution_clock::now(); -#define TOCK(clock_name, measurement_map) \ - auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ - measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>(clock_name##_tock - clock_name##_tick); -#define TOCK_AVG(clock_name, measurement_map, num_iterations) \ - auto clock_name##_tock = std::chrono::high_resolution_clock::now(); \ - measurement_map["\"" #clock_name "\""] = duration_cast<microseconds>((clock_name##_tock - clock_name##_tick) / (num_iterations)); - -template <typename T, typename U> -void fill(U &&tensor, int seed) -{ - static_assert(std::is_floating_point<T>::value || std::is_same<T, half>::value, "Only floating point data types supported."); - using DistributionType = typename std::conditional<std::is_same<T, half>::value, arm_compute::utils::uniform_real_distribution_16bit<T>, std::uniform_real_distribution<T>>::type; - - DistributionType distribution{ T(-1.0f), T(1.0f) }; - library->fill(tensor, distribution, seed); - - // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0) - DistributionType distribution_inf{ T(std::numeric_limits<float>::infinity()), T(std::numeric_limits<float>::infinity()) }; - library->fill_borders_with_garbage(tensor, distribution_inf, seed); -} -} // namespace - TEST_SUITE(CL) TEST_SUITE(UNIT) TEST_SUITE(DYNAMIC_FUSION) TEST_SUITE(ClCompositeKernel) TEST_SUITE(Validate) -TEST_CASE(MoveNet_SubGraph_1_Gemm, framework::DatasetMode::ALL) -{ - /* Computation: - * out = add(addend, gemm_native(lhs, rhs, bias)) (non-broadcast) - */ - const auto data_type = DataType::F32; - const auto m = 5U; - const auto n = 4U; - const auto k = 3U; - const auto t_lhs_shape = TensorShape(k, m); - const auto t_rhs_shape = TensorShape(n, k); - const auto t_dst_shape = TensorShape(n, m); - auto t_lhs_info = TensorInfo(t_lhs_shape, 1, data_type); - auto t_rhs_info = TensorInfo(t_rhs_shape, 1, data_type); - auto t_bias_info = TensorInfo(TensorShape(), 1, DataType::F32); - auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type); - - const ClTensorDescriptor t_lhs_desc{ &t_lhs_info }; - const ClTensorDescriptor t_rhs_desc{ &t_rhs_info }; - const ClTensorDescriptor t_bias_desc{ &t_bias_info }; - const ClTensorDescriptor t_addend_desc{ &t_dst_info }; - const ClTensorDescriptor t_dst_desc{ &t_dst_info }; - - ClKernelBlueprint bp; - ArgumentID tid_lhs; - ArgumentID tid_rhs; - ArgumentID tid_l0_bias = g_arg_placeholder; - ArgumentID tid_l1_addend; - ArgumentID tid_dst; - auto st = add_tensor_argument(bp, t_lhs_desc, tid_lhs); - st = add_tensor_argument(bp, t_rhs_desc, tid_rhs); - st = add_tensor_argument(bp, t_addend_desc, tid_l1_addend); - st = add_tensor_argument(bp, t_dst_desc, tid_dst); - - const auto common_kernel_desc = ClKernelComponentDescriptor{}; - const GemmNativeDescriptor gemm_native_desc{ 1.0, 1.0, m, n, k }; - const GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 }; - const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP }; - const TileDescriptor store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT }; - - ArgumentID tid_acc; - st = add_tensor_intermed(bp, tid_acc); - st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc); - st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc); - st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware); - - ClKernelCode cl_code; - - st = set_tile_info(bp, store_tile_info); - st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp); - - ClExecutionDescriptor exec_desc{}; - st = tune_static(exec_desc, cl_code); - - CLScheduler::get().default_reinit(); - ClCompositeKernel kernel; - kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code); - - // Construct tensors - CLTensor t_lhs{}; - CLTensor t_rhs{}; - CLTensor t_l1_addend{}; - CLTensor t_dst{}; - // Init tensors - { - t_lhs.allocator()->init(t_lhs_info); - t_rhs.allocator()->init(t_rhs_info); - t_l1_addend.allocator()->init(t_dst_info); - t_dst.allocator()->init(t_dst_info); - } - // "Pack" tensors - TensorBinding tensors({ { tid_lhs, &t_lhs }, - { tid_rhs, &t_rhs }, - { tid_l1_addend, &t_l1_addend }, - { tid_dst, &t_dst } - }); - // Allocate and fill tensors - { - t_lhs.allocator()->allocate(); - t_rhs.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - t_dst.allocator()->allocate(); - fill<float>(CLAccessor(t_lhs), 0); - fill<float>(CLAccessor(t_rhs), 1); - fill<float>(CLAccessor(t_l1_addend), 2); - } - - CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true); - - // Create reference - SimpleTensor<float> ref_t_lhs{ t_lhs_shape, data_type, 1 }; - SimpleTensor<float> ref_t_rhs{ t_rhs_shape, data_type, 1 }; - SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1 }; - SimpleTensor<float> ref_t_l1_addend{ t_dst_shape, data_type, 1 }; - - // Fill reference - fill<float>(ref_t_lhs, 0); - fill<float>(ref_t_rhs, 1); - fill<float>(ref_t_l1_addend, 2); - const auto ref_t_dst = reference::arithmetic_operation( - ArithmeticOperation::ADD, - ref_t_l1_addend, - reference::gemm(ref_t_lhs, ref_t_rhs, ref_t_bias_placeholder, gemm_native_desc.alpha, 0.f /* To disable bias */), - data_type, - eltwise_add_desc.convert_policy); - - RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ - validate(CLAccessor(t_dst), ref_t_dst, tolerance_f32); -} - TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) { /* Computation: @@ -208,7 +77,7 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) Status st{}; const auto data_type = DataType::F32; - const auto conv_info = PadStrideInfo(1U, 1U, 1U, 1U); + const auto conv_info = Conv2dDescriptor{ Padding2D{ 1U, 1U, 1U, 1U }, { 1U, 1U } /* stride */ }; const auto width = 7U; const auto height = 6U; @@ -216,47 +85,44 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) const auto OFM = 4U; const auto kernel_sz = 3U; - const auto src_shape = TensorShape(IFM, width, height); - const auto wei_shape = TensorShape(IFM, kernel_sz, kernel_sz, OFM); - const auto bia_shape = TensorShape(OFM); - const auto dst_shape = TensorShape(OFM, width, height); + const auto src_shape = TensorShape(IFM, width, height); + const auto wei_shape = TensorShape(IFM, kernel_sz, kernel_sz, OFM); + const auto bia_shape = TensorShape(OFM); + const auto addend_shape = TensorShape(1, 1); + const auto dst_shape = TensorShape(OFM, width, height); - auto src_info = TensorInfo(src_shape, 1, data_type, DataLayout::NHWC); - auto wei_info = TensorInfo(wei_shape, 1, data_type, DataLayout::NHWC); - auto bia_info = TensorInfo(bia_shape, 1, data_type, DataLayout::NHWC); - auto dst_info = TensorInfo(dst_shape, 1, data_type, DataLayout::NHWC); - - const auto src_desc = ClTensorDescriptor(&src_info); - const auto wei_desc = ClTensorDescriptor(&wei_info); - const auto bia_desc = ClTensorDescriptor(&bia_info); - const auto addend_desc = ClTensorDescriptor(&dst_info); - const auto dst_desc = ClTensorDescriptor(&dst_info); + auto src_info = TensorInfo(src_shape, 1, data_type, DataLayout::NHWC); + auto wei_info = TensorInfo(wei_shape, 1, data_type, DataLayout::NHWC); + auto bia_info = TensorInfo(bia_shape, 1, data_type, DataLayout::NHWC); + auto addend_info = TensorInfo(addend_shape, 1, data_type, DataLayout::NHWC); + auto dst_info = TensorInfo(dst_shape, 1, data_type, DataLayout::NHWC); const auto n0 = std::min(OFM, 4u); const auto m0 = (OFM > 16) ? ((data_type == DataType::F32) ? 2U : 4U) : 1U; - const ClKernelComponentDescriptor common_kernel_desc{}; - const DirectConvolutionDescriptor direct_conv2d_desc{ conv_info }; - const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP }; - const TileDescriptor store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT }; + const ClDirectConv2dKernelDescriptor direct_conv2d_desc{ conv_info }; + const ClEltwiseAddKernelDescriptor eltwise_add_desc{}; + const TileDescriptor store_tile_info{ Size2D(n0, m0), Size2D(width, height), ClippingStrategy::TOP_LEFT }; ArgumentID src_id{ g_arg_placeholder }; ArgumentID wei_id{ g_arg_placeholder }; ArgumentID bia_id{ g_arg_placeholder }; ArgumentID acc_id{ g_arg_placeholder }; + ArgumentID acc_1_id{ g_arg_placeholder }; ArgumentID addend_id{ g_arg_placeholder }; ArgumentID dst_id{ g_arg_placeholder }; - st = add_tensor_argument(bp, src_desc, src_id); - st = add_tensor_argument(bp, wei_desc, wei_id); - st = add_tensor_argument(bp, bia_desc, bia_id); - st = add_tensor_intermed(bp, acc_id); - st = add_tensor_argument(bp, addend_desc, addend_id); - st = add_tensor_argument(bp, dst_desc, dst_id); + st = add_tensor(bp, &src_info, src_id); + st = add_tensor(bp, &wei_info, wei_id); + st = add_tensor(bp, &bia_info, bia_id); + st = add_tensor(bp, &dst_info, acc_id); + st = add_tensor(bp, &dst_info, acc_1_id); + st = add_tensor(bp, &addend_info, addend_id); + st = add_tensor(bp, &dst_info, dst_id); - st = add_kcomp_direct_conv(bp, common_kernel_desc, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id); - st = add_kcomp_eltwise_add(bp, common_kernel_desc, eltwise_add_desc, addend_id, acc_id, acc_id); - st = add_kcomp_store(bp, common_kernel_desc, acc_id, dst_id, StoreType::TStoreIndirectWidthSelect); + st = add_kcomp_direct_conv2d(bp, direct_conv2d_desc, src_id, wei_id, bia_id, acc_id); + st = add_kcomp_eltwise_add(bp, eltwise_add_desc, addend_id, acc_id, acc_1_id); + st = add_kcomp_store(bp, StoreType::TStoreIndirectWidthSelect, acc_1_id, dst_id); exec_desc.skip_sliding_window = true; @@ -282,12 +148,11 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) dst.allocator()->init(dst_info); // "Pack" tensors - TensorBinding tensors({ { src_id, &src }, + ITensorPack tensors{ { src_id, &src }, { wei_id, &wei }, { bia_id, &bia }, { addend_id, &addend }, - { dst_id, &dst } - }); + { dst_id, &dst } }; // Allocate and fill tensors src.allocator()->allocate(); @@ -296,10 +161,10 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) addend.allocator()->allocate(); dst.allocator()->allocate(); - fill<float>(CLAccessor(src), 0); - fill<float>(CLAccessor(wei), 1); - fill<float>(CLAccessor(bia), 2); - fill<float>(CLAccessor(addend), 3); + fill<float>(CLAccessor(src), 0, library.get()); + fill<float>(CLAccessor(wei), 1, library.get()); + fill<float>(CLAccessor(bia), 2, library.get()); + fill<float>(CLAccessor(addend), 3, library.get()); CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true); @@ -310,10 +175,10 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) SimpleTensor<float> ref_addend_nhwc{ dst_shape, data_type, 1, QuantizationInfo(), DataLayout::NHWC }; // Fill reference - fill<float>(ref_src_nhwc, 0); - fill<float>(ref_wei_nhwc, 1); - fill<float>(ref_bia_nhwc, 2); - fill<float>(ref_addend_nhwc, 3); + fill<float>(ref_src_nhwc, 0, library.get()); + fill<float>(ref_wei_nhwc, 1, library.get()); + fill<float>(ref_bia_nhwc, 2, library.get()); + fill<float>(ref_addend_nhwc, 3, library.get()); auto ref_src = reference::permute(ref_src_nhwc, PermutationVector(1U, 2U, 0U)); auto ref_wei = reference::permute(ref_wei_nhwc, PermutationVector(1U, 2U, 0U)); @@ -326,301 +191,25 @@ TEST_CASE(MoveNet_SubGraph_1_DirectConv2d, framework::DatasetMode::ALL) const auto ref_dst = reference::arithmetic_operation( ArithmeticOperation::ADD, ref_addend, - reference::convolution_layer<float>(ref_src, ref_wei, ref_bia, dst_shape_nchw, conv_info), - data_type, - eltwise_add_desc.convert_policy); + reference::convolution_layer<float>(ref_src, ref_wei, ref_bia, dst_shape_nchw, + PadStrideInfo + { + static_cast<unsigned int>(conv_info.stride.x()), + static_cast<unsigned int>(conv_info.stride.y()), + static_cast<unsigned int>(conv_info.pad.left), + static_cast<unsigned int>(conv_info.pad.top) }), + data_type, + ConvertPolicy::SATURATE); RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ validate(CLAccessor(dst), ref_dst, tolerance_f32); } TEST_SUITE_END() // Validate - -TEST_SUITE(Benchmark) -TEST_CASE(MoveNet_SubGraph_1_Gemm, framework::DatasetMode::ALL) -{ - using std::chrono::duration_cast; - using std::chrono::microseconds; - const int num_iterations = 200; - std::map<std::string, std::chrono::microseconds> measurements; - /* Computation: - * out = add(addend, gemm_native(lhs, rhs, bias)) - */ - const auto data_type = DataType::F32; - const auto m = 12U * 12U; - const auto n = 64U; - const auto k = 384U; - const auto t_lhs_shape = TensorShape(k, m); - const auto t_rhs_shape = TensorShape(n, k); - const auto t_dst_shape = TensorShape(n, m); - auto t_lhs_info = TensorInfo(t_lhs_shape, 1, data_type); - auto t_rhs_info = TensorInfo(t_rhs_shape, 1, data_type); - auto t_bias_info = TensorInfo(TensorShape(), 1, data_type); - auto t_l0_dst_info = TensorInfo(t_dst_shape, 1, data_type); // Intermediate tensor for cond3 - auto t_l1_rhs_info = TensorInfo(t_dst_shape, 1, data_type); - auto t_dst_info = TensorInfo(t_dst_shape, 1, data_type); - - const auto common_kernel_desc = ClKernelComponentDescriptor{}; - const GemmNativeDescriptor gemm_native_desc{ 1.0, 0.0, m, n, k }; - const GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 }; - const EltwiseAddDescriptor eltwise_add_desc{ ConvertPolicy::WRAP }; - const TileDescriptor store_tile_info{ Size2D(gemm_info.rhs_info.n0, gemm_info.lhs_info.m0), Size2D(gemm_info.n, gemm_info.m), ClippingStrategy::TOP_LEFT }; - - // Create reference - SimpleTensor<float> ref_t_lhs{ t_lhs_shape, data_type, 1 }; - SimpleTensor<float> ref_t_rhs{ t_rhs_shape, data_type, 1 }; - SimpleTensor<float> ref_t_bias_placeholder{ t_dst_shape, data_type, 1 }; - SimpleTensor<float> ref_t_l1_addend{ t_dst_shape, data_type, 1 }; - - // Fill reference - fill<float>(ref_t_lhs, 0); - fill<float>(ref_t_rhs, 1); - fill<float>(ref_t_l1_addend, 2); - const auto ref_t_dst = reference::arithmetic_operation( - ArithmeticOperation::ADD, - ref_t_l1_addend, - reference::gemm(ref_t_lhs, ref_t_rhs, ref_t_bias_placeholder, gemm_native_desc.alpha, 0.f /* To disable bias */), - data_type, - eltwise_add_desc.convert_policy); - - CLScheduler::get().default_reinit(); - - /* Condition 0: Dynamic Fused Kernel */ - CLTensor cond0_t_dst{}; - { - TICK(cond0_0_startup_time); - - ClKernelBlueprint bp; - ArgumentID tid_lhs; - ArgumentID tid_rhs; - ArgumentID tid_l0_bias = g_arg_placeholder; - ArgumentID tid_l1_addend; - ArgumentID tid_dst; - - const ClTensorDescriptor t_lhs_desc{ &t_lhs_info }; - const ClTensorDescriptor t_rhs_desc{ &t_rhs_info }; - const ClTensorDescriptor t_bias_desc{ &t_bias_info }; - const ClTensorDescriptor t_addend_desc{ &t_dst_info }; - const ClTensorDescriptor t_dst_desc{ &t_dst_info }; - - ClKernelCode cl_code; - TICK(cond0_build_time) - auto st = add_tensor_argument(bp, t_lhs_desc, tid_lhs); - st = add_tensor_argument(bp, t_rhs_desc, tid_rhs); - st = add_tensor_argument(bp, t_addend_desc, tid_l1_addend); - st = add_tensor_argument(bp, t_dst_desc, tid_dst); - - ArgumentID tid_acc; - st = add_tensor_intermed(bp, tid_acc); - st = add_kcomp_gemm_native(bp, common_kernel_desc, gemm_native_desc, tid_lhs, tid_rhs, tid_l0_bias, tid_acc); - - st = add_kcomp_eltwise_add(bp, common_kernel_desc, EltwiseAddDescriptor{}, tid_l1_addend, tid_acc, tid_acc); - - st = add_kcomp_store(bp, common_kernel_desc, tid_acc, tid_dst, StoreType::StoreBlockBoundaryAware); - - st = set_tile_info(bp, store_tile_info); - st = build(cl_code, ClCodeBuilderContext{ GpuInfo{ GPUTarget::G71 } }, bp); - TOCK(cond0_build_time, measurements) - - TICK(cond0_tune_time) - ClExecutionDescriptor exec_desc{}; - st = tune_static(exec_desc, cl_code); - TOCK(cond0_tune_time, measurements) - - TICK(cond0_configure_time) - ClCompositeKernel kernel; - kernel.configure(CLKernelLibrary::get().get_compile_context(), cl_code); - TOCK(cond0_configure_time, measurements) - - // Construct tensors - CLTensor t_lhs{}; - CLTensor t_rhs{}; - CLTensor t_l1_addend{}; - - // Init tensors - { - t_lhs.allocator()->init(t_lhs_info); - t_rhs.allocator()->init(t_rhs_info); - t_l1_addend.allocator()->init(t_dst_info); - cond0_t_dst.allocator()->init(t_dst_info); - } - // Allocate tensors - { - t_lhs.allocator()->allocate(); - t_rhs.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - cond0_t_dst.allocator()->allocate(); - fill<float>(CLAccessor(t_lhs), 0); - fill<float>(CLAccessor(t_rhs), 1); - fill<float>(CLAccessor(t_l1_addend), 2); - } - - // "Pack" tensors - TensorBinding tensors({ { tid_lhs, &t_lhs }, { tid_rhs, &t_rhs }, { tid_l1_addend, &t_l1_addend }, { tid_dst, &cond0_t_dst } }); - - CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true); - CLScheduler::get().sync(); - TOCK(cond0_0_startup_time, measurements) - - TICK(cond0_1_latency) - for(int i = 0; i < num_iterations; ++i) - { - CLScheduler::get().enqueue_op(kernel, tensors, exec_desc, true); - } - CLScheduler::get().sync(); - TOCK_AVG(cond0_1_latency, measurements, num_iterations) - } - /* Condition 1: Dynamic Unfused Kernel */ - /* Condition 2: Static Fused Kernel (current) */ - CLTensor cond2_t_dst{}; - { - TICK(cond2_0_startup_time); - arm_compute::opencl::kernels::ClGemmMatrixMultiplyNativeKernel l0_gemm_mm; - - TICK(cond2_configure_time); - experimental::PostOpList<ITensorInfo *> post_ops; - post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(&t_dst_info, 1, eltwise_add_desc.convert_policy); - GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0, post_ops }; - l0_gemm_mm.configure(CLKernelLibrary::get().get_compile_context(), &t_lhs_info, &t_rhs_info, nullptr, &t_dst_info, gemm_native_desc.alpha, gemm_native_desc.beta, gemm_native_desc.lhs_info, - gemm_native_desc.rhs_info, gemm_info); - TOCK(cond2_configure_time, measurements); - - // Construct tensors - CLTensor t_lhs{}; - CLTensor t_rhs{}; - CLTensor t_l1_addend{}; - - // Init tensors - { - t_lhs.allocator()->init(t_lhs_info); - t_rhs.allocator()->init(t_rhs_info); - t_l1_addend.allocator()->init(t_dst_info); - cond2_t_dst.allocator()->init(t_dst_info); - } - // Allocate tensors - { - t_lhs.allocator()->allocate(); - t_rhs.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - cond2_t_dst.allocator()->allocate(); - fill<float>(CLAccessor(t_lhs), 0); - fill<float>(CLAccessor(t_rhs), 1); - fill<float>(CLAccessor(t_l1_addend), 2); - } - - // "Pack" tensors - ITensorPack tensors - { - { ACL_SRC_0, &t_lhs }, - { ACL_SRC_1, &t_rhs }, - { EXPERIMENTAL_ACL_POST_OP_ARG_FIRST, &t_l1_addend }, - { ACL_DST, &cond2_t_dst }, - }; - CLScheduler::get().enqueue_op(l0_gemm_mm, tensors, true); - CLScheduler::get().sync(); - TOCK(cond2_0_startup_time, measurements); - - TICK(cond2_1_latency); - for(int i = 0; i < num_iterations; ++i) - { - CLScheduler::get().enqueue_op(l0_gemm_mm, tensors, true); - } - CLScheduler::get().sync(); - TOCK_AVG(cond2_1_latency, measurements, num_iterations); - } - /* Condition 3: Static Unfused Kernel (current) */ - CLTensor cond3_t_dst{}; - { - TICK(cond3_0_startup_time); - arm_compute::opencl::kernels::ClGemmMatrixMultiplyNativeKernel l0_gemm_mm; - arm_compute::opencl::kernels::ClSaturatedArithmeticKernel l1_add; - - TICK(cond3_configure_time); - GEMMKernelInfo gemm_info{ m, n, k, 0, false, false, false, false, ActivationLayerInfo{}, 1, 1, gemm_native_desc.lhs_info, gemm_native_desc.rhs_info, 0, 0 }; - l0_gemm_mm.configure(CLKernelLibrary::get().get_compile_context(), &t_lhs_info, &t_rhs_info, nullptr, &t_l0_dst_info, gemm_native_desc.alpha, gemm_native_desc.beta, gemm_native_desc.lhs_info, - gemm_native_desc.rhs_info, gemm_info); - l1_add.configure(CLKernelLibrary::get().get_compile_context(), ArithmeticOperation::ADD, &t_l0_dst_info, &t_l1_rhs_info, &t_dst_info, eltwise_add_desc.convert_policy); - TOCK(cond3_configure_time, measurements); - - // Construct tensors - CLTensor t_lhs{}; - CLTensor t_rhs{}; - CLTensor t_l0_dst{}; - CLTensor t_l1_addend{}; - - // Init tensors - { - t_lhs.allocator()->init(t_lhs_info); - t_rhs.allocator()->init(t_rhs_info); - t_l0_dst.allocator()->init(t_l0_dst_info); - t_l1_addend.allocator()->init(t_dst_info); - cond3_t_dst.allocator()->init(t_dst_info); - } - // Allocate tensors - { - t_lhs.allocator()->allocate(); - t_rhs.allocator()->allocate(); - t_l0_dst.allocator()->allocate(); - t_l1_addend.allocator()->allocate(); - cond3_t_dst.allocator()->allocate(); - fill<float>(CLAccessor(t_lhs), 0); - fill<float>(CLAccessor(t_rhs), 1); - fill<float>(CLAccessor(t_l1_addend), 2); - } - - // "Pack" tensors - ITensorPack tensors_l0 - { - { ACL_SRC_0, &t_lhs }, - { ACL_SRC_1, &t_rhs }, - { ACL_DST, &t_l0_dst }, - }; - ITensorPack tensors_l1 - { - { ACL_SRC_0, &t_l0_dst }, - { ACL_SRC_1, &t_l1_addend }, - { ACL_DST, &cond3_t_dst }, - }; - CLScheduler::get().enqueue_op(l0_gemm_mm, tensors_l0, true); - CLScheduler::get().enqueue_op(l1_add, tensors_l1, true); - CLScheduler::get().sync(); - TOCK(cond3_0_startup_time, measurements); - - TICK(cond3_1_latency); - for(int i = 0; i < num_iterations; ++i) - { - CLScheduler::get().enqueue_op(l0_gemm_mm, tensors_l0, true); - CLScheduler::get().enqueue_op(l1_add, tensors_l1, true); - } - CLScheduler::get().sync(); - TOCK_AVG(cond3_1_latency, measurements, num_iterations); - } - - RelativeTolerance<float> tolerance_f32(0.001f); /**< Tolerance value for comparing reference's output against implementation's output for floating point data types */ - std::cout << "cond0 validation: " << std::endl; - validate(CLAccessor(cond0_t_dst), ref_t_dst, tolerance_f32); - std::cout << "cond2 validation: " << std::endl; - validate(CLAccessor(cond2_t_dst), ref_t_dst, tolerance_f32); - std::cout << "cond3 validation: " << std::endl; - validate(CLAccessor(cond3_t_dst), ref_t_dst, tolerance_f32); - - /* Report */ - std::cout << "Performance comparison (gemm native + add)" << std::endl; - std::cout << "cond0: dynamic fusion module" << std::endl; - std::cout << "cond2: static fused with post ops" << std::endl; - std::cout << "cond3: static unfused" << std::endl; - for(auto m : measurements) - { - std::cout << m.first << ": " << m.second.count() << "us" << std::endl; - } -} -TEST_SUITE_END() // Benchmark TEST_SUITE_END() // ClCompositeKernel TEST_SUITE_END() // DYNAMIC_FUSION TEST_SUITE_END() // UNIT TEST_SUITE_END() // CL } // namespace validation } // namespace test -} // namespace arm_compute - -#endif // defined(ENABLE_EXPERIMENTAL_DYNAMIC_FUSION)
\ No newline at end of file +} // namespace arm_compute
\ No newline at end of file |