From afa19725f7f3feb2c21a6aed02ade49d08e3097b Mon Sep 17 00:00:00 2001
From: SiCongLi
Date: Sun, 24 Oct 2021 19:12:33 +0100
Subject: Add post ops to ClGemmMatrixMultiplyReshapedOnlyRHSKernel and
 ClGemmMatrixMultiplyNativeKernel Part 3

Partially resolves: COMPMID-4435

Change-Id: Ifc5affa3a24a70942ca2d001380205df09b03ad7
Signed-off-by: SiCongLi
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6550
Reviewed-by: Gian Marco Iodice
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
---
 tests/framework/Macros.h                          |  10 +-
 tests/validation/CL/GEMMMatrixMultiplyNative.cpp  | 218 ++++++++++
 .../CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp      | 270 +++++++++++++
 tests/validation/fixtures/GEMMFixture.h           | 444 +++++++++++++++++++++
 4 files changed, 940 insertions(+), 2 deletions(-)

diff --git a/tests/framework/Macros.h b/tests/framework/Macros.h
index a6ba137b59..ac03bb02b6 100644
--- a/tests/framework/Macros.h
+++ b/tests/framework/Macros.h
@@ -49,8 +49,8 @@
 
 #define CONCAT(ARG0, ARG1) ARG0##ARG1
 
-#define VARIADIC_SIZE_IMPL(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, size, ...) size
-#define VARIADIC_SIZE(...) VARIADIC_SIZE_IMPL(__VA_ARGS__, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+#define VARIADIC_SIZE_IMPL(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15, size, ...) size
+#define VARIADIC_SIZE(...) VARIADIC_SIZE_IMPL(__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
 
 #define JOIN_PARAM1(OP, param) OP(0, param)
 #define JOIN_PARAM2(OP, param, ...) \
@@ -92,6 +92,12 @@
 #define JOIN_PARAM14(OP, param, ...) \
     OP(13, param) \
     , JOIN_PARAM13(OP, __VA_ARGS__)
+#define JOIN_PARAM15(OP, param, ...) \
+    OP(14, param) \
+    , JOIN_PARAM14(OP, __VA_ARGS__)
+#define JOIN_PARAM16(OP, param, ...) \
+    OP(15, param) \
+    , JOIN_PARAM15(OP, __VA_ARGS__)
 #define JOIN_PARAM(OP, NUM, ...)
\ CONCAT(JOIN_PARAM, NUM) \ (OP, __VA_ARGS__) diff --git a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp index dc5fbc36ba..e3f151a2ca 100644 --- a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp +++ b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp @@ -53,6 +53,11 @@ using CLGEMMMatrixMultiplyNative = CLSynthetizeOperator using CLGEMMMatrixMultiplyNativeFixture = GEMMMatrixMultiplyNativeValidationFixture; +// Fixture for CLGEMMMatrixMultiplyNative with post ops +template +using CLGEMMMatrixMultiplyNativeWithPostOpsFixture = + GEMMMatrixMultiplyNativeWithPostOpsValidationFixture; + // Fixture for CLGEMMMatrixMultiplyNative3D template using CLGEMMMatrixMultiplyNative3DFixture = GEMMMatrixMultiplyNative3DValidationFixture; @@ -141,6 +146,80 @@ const auto boundary_handling_cases = combine(combine(combine(combine(combine(com broadcast_bias_values), framework::dataset::make("Activation", ActivationLayerInfo())); +/** Post Ops */ +using PostOpArgBroadcast = CLGEMMMatrixMultiplyNativeWithPostOpsFixture::PostOpArgBroadcast; +experimental::PostOpList post_ops_1() +{ + experimental::PostOpList post_ops{}; + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); + post_ops.push_back_op>( + std::make_tuple(true, true, false), // If broadcast in dims 0, 1 and 2 + 0, + ConvertPolicy::SATURATE); + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); + return post_ops; +} +experimental::PostOpList post_ops_2() +{ + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( + std::make_tuple(false, true, true), // If broadcast in dims 0, 1 and 2 + 1, + ConvertPolicy::SATURATE); + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); + return post_ops; +} +experimental::PostOpList post_ops_3() +{ + experimental::PostOpList post_ops{}; + // post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); + post_ops.push_back_op>( + std::make_tuple(false, false, false), // If broadcast in dims 0, 1 and 2 + 1, + ConvertPolicy::SATURATE); + return post_ops; +} + +/** Different Post Op Lists */ +const auto post_op_lists = framework::dataset::make("post_op_lists", { + post_ops_1(), + post_ops_2(), + post_ops_3(), +} ); + +bool is_post_op_list_valid(unsigned int m, unsigned int n, unsigned int k, unsigned int batch, DataType data_type, const experimental::PostOpList& post_ops) +{ + const auto lhs_info = GEMMLHSMatrixInfo(4,4,1,false,true); + const auto rhs_info = GEMMRHSMatrixInfo(4,4,1,true,true,false); + + // Create TensorInfo for post op arguments + TensorInfo input0_info(TensorShape(k, m, batch), 1, data_type); + TensorInfo input1_info(TensorShape(n, k, batch), 1, data_type); + TensorInfo input2_info(TensorShape(n), 1, data_type); + TensorInfo output_info(TensorShape(n, m, batch), 1, data_type); + + GEMMKernelInfo gemm_info(m, n, k, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */, + false /**< reinterpret the input as 3D */, + true /**< Flag used to broadcast the bias addition */, + false /**< wider accumm */, + false /**< has pad y */, + ActivationLayerInfo::ActivationFunction::IDENTITY, + 1 /**< Multiplication factor for the width of the 1xW transposed block */, + 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, + lhs_info, + rhs_info, + 0 /**< Offset to be added to each element of the matrix A */, + 0 /**< Offset 
to be added to each element of the matrix B */, + post_ops); + return bool(ClGemmMatrixMultiplyNativeKernel::validate(&input0_info.clone()->set_is_resizable(true), + &input1_info.clone()->set_is_resizable(true), + &input2_info.clone()->set_is_resizable(true), + &output_info.clone()->set_is_resizable(true),1.f,1.f, + lhs_info, + rhs_info, + gemm_info)); +} + /** Configuration test */ void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, bool broadcast_bias, DataType data_type, const ActivationLayerInfo &act_info) { @@ -191,6 +270,119 @@ void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned TEST_SUITE(CL) TEST_SUITE(GEMMMatrixMultiplyNative) +TEST_SUITE(ValidateFusedPostOpsConfigs) +TEST_SUITE(Invalid) +TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL) +{ + const auto data_type = DataType::F32; + const unsigned int m = 17; + const unsigned int n = 1; + const unsigned int k = 13; + const unsigned int batch = 2; + TensorShape post_op_arg0_shape(n, m, batch); + TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type); + auto post_op_arg1_info = post_op_arg_info.clone(); + + // Unsupported sequence of post ops + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( + &post_op_arg_info, + 1, + ConvertPolicy::SATURATE); + post_ops.push_back_op>( + post_op_arg1_info.get(), + 0, + ConvertPolicy::SATURATE); + + ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); +} +TEST_CASE(OutputWidened, framework::DatasetMode::ALL) +{ + // Invalid broadcast: post op tensors "widen" the output tensor + const auto data_type = DataType::F32; + const unsigned int m = 1; + const unsigned int n = 18; + const unsigned int k = 13; + const unsigned int batch = 2; + TensorShape post_op_arg_shape(n, m + 1, batch); // output's Y dimension (m) is "widened", which is not allowed + TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); + + ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); +} +TEST_CASE(BroadcastInXDimOnly, framework::DatasetMode::ALL) +{ + // Invalid broadcast: post op tensors broadcast in the first dimension (X) only + const auto data_type = DataType::F32; + const unsigned int m = 22; + const unsigned int n = 16; + const unsigned int k = 15; + const unsigned int batch = 3; + TensorShape post_op_arg_shape(1, m, batch); + TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); + + ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); +} +TEST_SUITE_END() // Invalid +TEST_SUITE(Valid) +TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL) +{ + const auto data_type = DataType::F32; + const unsigned int m = 22; + const unsigned int n = 16; + const unsigned int k = 15; + const unsigned int batch = 3; + experimental::PostOpList post_ops{}; + + ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); +} +TEST_CASE(BroadcastInYDimOnly, framework::DatasetMode::ALL) +{ + const auto data_type = DataType::F32; + const unsigned int m = 22; + const 
unsigned int n = 16; + const unsigned int k = 15; + const unsigned int batch = 3; + TensorShape post_op_arg_shape(n, 1, batch); + TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); + + ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); +} +TEST_CASE(BroadcastInBothXandYDims, framework::DatasetMode::ALL) +{ + const auto data_type = DataType::F32; + const unsigned int m = 22; + const unsigned int n = 16; + const unsigned int k = 15; + const unsigned int batch = 3; + TensorShape post_op_arg_shape(1, 1, batch); + TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); + + ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); +} +TEST_CASE(BroadcastInAllDims, framework::DatasetMode::ALL) +{ + const auto data_type = DataType::F32; + const unsigned int m = 22; + const unsigned int n = 16; + const unsigned int k = 15; + const unsigned int batch = 3; + TensorShape post_op_arg_shape(1, 1, 1); + TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); + + ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); +} +TEST_SUITE_END() // Valid +TEST_SUITE_END() // ValidateFusedPostOps TEST_SUITE(Float) TEST_SUITE(FP32) DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine( @@ -323,6 +515,32 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyNative3DFixture, f // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); } + +TEST_SUITE(FusedPostOps) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyNativeWithPostOpsFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + framework::dataset::make("M0", { 4 })), + n0_values_precommit), + k0_values_precommit), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("alpha", {1.0f} )), + framework::dataset::make("beta", {1.0f} )), + framework::dataset::make("broadcast_bias", { false, true } )), + framework::dataset::make("Activation", { ActivationLayerInfo() })), + post_op_lists) + ) +{ + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); +} + +TEST_SUITE_END() // FusedPostOps + TEST_SUITE_END() // FP32 TEST_SUITE_END() // Float TEST_SUITE_END() // GEMMMatrixMulipltyNative diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp index 0f86a70e0f..9e1a185a10 100644 --- a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp +++ b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "src/core/experimental/PostOp.h" #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h" #include 
"src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" #include "tests/CL/CLAccessor.h" @@ -61,6 +62,11 @@ using CLGEMMMatrixMultiplyReshapedOnlyRHSFixture = GEMMMatrixMultiplyReshapedOnl template using CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixture = GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture; +// Fixture for CLGEMMMatrixMultiplyReshapedOnlyRHS with post ops +template +using CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture = + GEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsValidationFixture; + namespace { // *INDENT-OFF* @@ -157,6 +163,81 @@ const auto boundary_handling_cases = combine(combine(combine(combine(combine(com broadcast_bias_values), framework::dataset::make("Activation", ActivationLayerInfo())); +/** Post Ops */ +using PostOpArgBroadcast = CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture::PostOpArgBroadcast; +experimental::PostOpList post_ops_1() +{ + experimental::PostOpList post_ops{}; + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); + post_ops.push_back_op>( + std::make_tuple(true, true, false), // If broadcast in dims 0, 1 and 2 + 0, + ConvertPolicy::SATURATE); + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); + return post_ops; +} +experimental::PostOpList post_ops_2() +{ + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( + std::make_tuple(false, true, true), // If broadcast in dims 0, 1 and 2 + 1, + ConvertPolicy::SATURATE); + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); + return post_ops; +} +experimental::PostOpList post_ops_3() +{ + experimental::PostOpList post_ops{}; + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); + post_ops.push_back_op>( + std::make_tuple(false, false, true), // If broadcast in dims 0, 1 and 2 + 1, + ConvertPolicy::SATURATE); + return post_ops; +} + +/** Different Post Op Lists */ +const auto post_op_lists = framework::dataset::make("post_op_lists", { + post_ops_1(), + post_ops_2(), + post_ops_3(), + } ); + + bool is_post_op_list_valid(unsigned int m, unsigned int n, unsigned int k, unsigned int batch, DataType data_type, const experimental::PostOpList& post_ops) +{ + const auto lhs_info = GEMMLHSMatrixInfo(4,4,1,false,true); + const auto rhs_info = GEMMRHSMatrixInfo(4,4,1,true,true,false); + + // Create TensorInfo for post op arguments + TensorInfo input0_info(TensorShape(k, m, batch), 1, data_type); + TensorInfo input1_info(TensorShape(n, k, batch), 1, data_type); + TensorInfo input2_info(TensorShape(n), 1, data_type); + TensorInfo output_info(TensorShape(n, m, batch), 1, data_type); + + const TensorInfo reshaped_input1_info = input1_info.clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(input1_info, rhs_info)); + + GEMMKernelInfo gemm_info(m, n, k, 0 /**< Depth of the output tensor in case is reinterpreted as 3D */, + false /**< reinterpret the input as 3D */, + true /**< Flag used to broadcast the bias addition */, + false /**< wider accumm */, + false /**< has pad y */, + ActivationLayerInfo::ActivationFunction::IDENTITY, + 1 /**< Multiplication factor for the width of the 1xW transposed block */, + 1 /**< Multiplication factor for the height of the 4x4 interleaved block */, + lhs_info, + rhs_info, + 0 /**< Offset to be added to each element of the matrix A */, + 0 /**< Offset to be added to each element of the matrix B */, + post_ops); + return 
bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(&input0_info.clone()->set_is_resizable(true), + &reshaped_input1_info.clone()->set_is_resizable(true), + &input2_info.clone()->set_is_resizable(true), + &output_info.clone()->set_is_resizable(true),1.f,1.f, + lhs_info, + rhs_info, + gemm_info)); +} /** Configuration test */ bool validate_configuration(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, unsigned int h0_value, @@ -211,6 +292,7 @@ bool validate_configuration(unsigned int m_value, unsigned int n_value, unsigned CLGEMMMatrixMultiplyReshapedOnlyRHS gemm; return bool(gemm.validate(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info)); } + } // namespace TEST_SUITE(CL) @@ -262,6 +344,119 @@ b_value, m0_value, n0_value, k0_value, broadcast_bias, input_as_3d, depth_output ARM_COMPUTE_EXPECT(status == expected_value, framework::LogLevel::ERRORS); } +TEST_SUITE(ValidateFusedPostOpsConfigs) +TEST_SUITE(Invalid) +TEST_CASE(UnsupportedPostOpSequence, framework::DatasetMode::ALL) +{ + const auto data_type = DataType::F32; + const unsigned int m = 17; + const unsigned int n = 1; + const unsigned int k = 13; + const unsigned int batch = 2; + TensorShape post_op_arg0_shape(n, m, batch); + TensorInfo post_op_arg_info(post_op_arg0_shape, 1, data_type); + auto post_op_arg1_info = post_op_arg_info.clone(); + + // Unsupported sequence of post ops + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( + &post_op_arg_info, + 1, + ConvertPolicy::SATURATE); + post_ops.push_back_op>( + post_op_arg1_info.get(), + 0, + ConvertPolicy::SATURATE); + + ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); +} +TEST_CASE(OutputWidened, framework::DatasetMode::ALL) +{ + // Invalid broadcast: post op tensors "widen" the output tensor + const auto data_type = DataType::F32; + const unsigned int m = 17; + const unsigned int n = 1; + const unsigned int k = 1; + const unsigned int batch = 1; + TensorShape post_op_arg_shape(n, m, batch + 4); // output's batch dimension is "widened", which is not allowed + TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); + + ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); +} +TEST_CASE(BroadcastInXDimOnly, framework::DatasetMode::ALL) +{ + // Invalid broadcast: post op tensors broadcast in the first dimension (X) only + const auto data_type = DataType::F32; + const unsigned int m = 22; + const unsigned int n = 16; + const unsigned int k = 15; + const unsigned int batch = 3; + TensorShape post_op_arg_shape(1, m, batch); + TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); + + ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == false, framework::LogLevel::ERRORS); +} +TEST_SUITE_END() // Invalid +TEST_SUITE(Valid) +TEST_CASE(EmptyPostOpList, framework::DatasetMode::ALL) +{ + const auto data_type = DataType::F32; + const unsigned int m = 22; + const unsigned int n = 16; + const unsigned int k = 15; + const unsigned int batch = 3; + experimental::PostOpList post_ops{}; + + ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, 
data_type, post_ops) == true, framework::LogLevel::ERRORS); +} +TEST_CASE(BroadcastInYDimOnly, framework::DatasetMode::ALL) +{ + const auto data_type = DataType::F32; + const unsigned int m = 22; + const unsigned int n = 16; + const unsigned int k = 15; + const unsigned int batch = 3; + TensorShape post_op_arg_shape(n, 1, batch); + TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); + + ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); +} +TEST_CASE(BroadcastInBothXandYDims, framework::DatasetMode::ALL) +{ + const auto data_type = DataType::F32; + const unsigned int m = 22; + const unsigned int n = 16; + const unsigned int k = 15; + const unsigned int batch = 3; + TensorShape post_op_arg_shape(1, 1, batch); + TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); + + ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); +} +TEST_CASE(BroadcastInAllDims, framework::DatasetMode::ALL) +{ + const auto data_type = DataType::F32; + const unsigned int m = 22; + const unsigned int n = 16; + const unsigned int k = 15; + const unsigned int batch = 3; + TensorShape post_op_arg_shape(1, 1, 1); + TensorInfo post_op_arg_info(post_op_arg_shape, 1, data_type); + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( &post_op_arg_info, 0, ConvertPolicy::SATURATE); + + ARM_COMPUTE_EXPECT(is_post_op_list_valid(m, n, k, batch, data_type, post_ops) == true, framework::LogLevel::ERRORS); +} +TEST_SUITE_END() // Valid +TEST_SUITE_END() // ValidateFusedPostOps TEST_SUITE(Float) TEST_SUITE(FP32) @@ -462,6 +657,44 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur framework::ARM_COMPUTE_PRINT_INFO(); } } + +TEST_SUITE(FusedPostOps) + +FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + framework::dataset::make("H0", {1})), + framework::dataset::make("interleave_rhs", { true })), + t_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", false, true)), + framework::dataset::make("DataType", DataType::F32)), + a_values), + beta_values), + framework::dataset::make("broadcast_bias", { false } )), + act_values), + post_op_lists) + ) +{ + // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. 
TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} + +TEST_SUITE_END() // FusedPostOps + TEST_SUITE_END() // FP32 TEST_SUITE(FP16) @@ -590,6 +823,43 @@ FIXTURE_DATA_TEST_CASE(RunNightly3D, CLGEMMMatrixMultiplyReshapedOnlyRHS3DFixtur framework::ARM_COMPUTE_PRINT_INFO(); } } +TEST_SUITE(FusedPostOps) + +FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + framework::dataset::make("H0", {1})), + framework::dataset::make("interleave_rhs", { true })), + t_values_rhs), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("DataType", DataType::F16)), + a_values), + beta_values), + framework::dataset::make("broadcast_bias", { false } )), + act_values), + post_op_lists) + ) +{ + // Validate output only if the target platform supports the OpenCL cl_khr_image2d_from_buffer extension + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} + +TEST_SUITE_END() // FusedPostOps + TEST_SUITE_END() // FP16 TEST_SUITE_END() // Float diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h index e1191587d5..fa273018a4 100644 --- a/tests/validation/fixtures/GEMMFixture.h +++ b/tests/validation/fixtures/GEMMFixture.h @@ -1522,6 +1522,243 @@ protected: SimpleTensor _reference{}; }; +/** (EXPERIMENTAL_POST_OPS)*/ +template +class GEMMMatrixMultiplyReshapedOnlyRHSWithPostOpsValidationFixture : public framework::Fixture +{ +public: + using PostOpArgBroadcast = std::tuple; // Instruct fixture if we need broadcasting in dimension 0, 1, 2 of each PostOp argument + template + void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int h0, + bool interleave_rhs, bool transpose_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, + const experimental::PostOpList &post_ops) + { + GEMMLHSMatrixInfo lhs_info; + lhs_info.m0 = m0; + lhs_info.k0 = k0; + + GEMMRHSMatrixInfo rhs_info; + rhs_info.n0 = n0; + rhs_info.k0 = k0; + rhs_info.h0 = h0; + rhs_info.interleave = interleave_rhs; + rhs_info.transpose = transpose_rhs; + rhs_info.export_to_cl_image = export_to_cl_image; + + // Set the tensor shapes for LHS and RHS matrices + const TensorShape lhs_shape(k, m, batch_size); + const TensorShape rhs_shape(n, k, batch_size); + const TensorShape bias_shape(n, + broadcast_bias ? 1 : m, + broadcast_bias ? 1 : batch_size); + auto post_ops_with_shapes = experimental::transform_post_op_list_arguments(post_ops, + [ = ](auto broadcast) + { + return TensorShape + { + std::get<0>(broadcast) ? 1 : n, + std::get<1>(broadcast) ? 1 : m, + std::get<2>(broadcast) ? 
1 : batch_size, + }; + }); + + _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes); + if(validate_result) + { + _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes); + } + } + +protected: + template + void fill(U &&tensor, int i) + { + static_assert(std::is_floating_point::value || std::is_same::value, "Only floating point data types supported."); + using DistributionType = typename std::conditional::value, arm_compute::utils::uniform_real_distribution_16bit, std::uniform_real_distribution>::type; + + DistributionType distribution{ T(-1.0f), T(1.0f) }; + library->fill(tensor, distribution, i); + + // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0) + DistributionType distribution_inf{ T(std::numeric_limits::infinity()), T(std::numeric_limits::infinity()) }; + library->fill_borders_with_garbage(tensor, distribution_inf, i); + } + + TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, + DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) + { + // Create tensors + TensorType lhs = create_tensor(lhs_shape, data_type, 1); + TensorType rhs = create_tensor(rhs_shape, data_type, 1); + TensorType bias = create_tensor(bias_shape, data_type, 1); + TensorType rhs_reshaped; + TensorType dst; + // Create post op tensors and populate post op with them + std::vector post_op_tensors_holder{}; + auto populated_post_ops = experimental::transform_post_op_list_arguments(post_ops, + [&post_op_tensors_holder, &data_type](auto shape) + { + auto t = create_tensor(shape, data_type, 1); + post_op_tensors_holder.push_back(std::move(t)); + return post_op_tensors_holder.back().info(); + }); + + const unsigned int M = lhs_shape[1]; + const unsigned int N = rhs_shape[0]; + const unsigned int K = lhs_shape[0]; + GEMMKernelInfo kernel_info; + kernel_info.m = M; + kernel_info.n = N; + kernel_info.k = K; + kernel_info.depth_output_gemm3d = 0; + kernel_info.reinterpret_input_as_3d = false; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = act_info; + kernel_info.post_ops = populated_post_ops; + + // The output tensor will be auto-initialized within the function + + // Create and configure function + ReshapeRHSOperatorType reshape_rhs; + GEMMOperatorType gemm; + + validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info)); + validate_result = validate_result || !rhs_info.export_to_cl_image; + if(!validate_result) + { + return nullptr; + } + + reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info); + gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); + + ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); + for(const auto &tensor : post_op_tensors_holder) + { + ARM_COMPUTE_ASSERT(tensor.info()->is_resizable()); + } + + // We do not pad when using image as it needs to comply to strict pitch alignment restrictions + if(!rhs_info.export_to_cl_image) + { + add_padding_x({ &lhs, &rhs, &rhs_reshaped, &bias, &dst }); + for(auto &tensor : post_op_tensors_holder) + { + 
add_padding_x({ &tensor }); + } + } + + // Allocate tensors + lhs.allocator()->allocate(); + rhs.allocator()->allocate(); + rhs_reshaped.allocator()->allocate(); + bias.allocator()->allocate(); + dst.allocator()->allocate(); + for(auto &tensor : post_op_tensors_holder) + { + tensor.allocator()->allocate(); + } + + ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); + for(const auto &tensor : post_op_tensors_holder) + { + ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable()); + } + + // Fill tensors + fill(AccessorType(lhs), 0); + fill(AccessorType(rhs), 1); + fill(AccessorType(bias), 2); + for(size_t i = 0; i < post_op_tensors_holder.size(); ++i) + { + fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i); + } + + // Compute GEMM + ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; + reshape_rhs.run(reshape_rhs_pack); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, + { ACL_SRC_1, &rhs_reshaped }, + { ACL_SRC_2, &bias }, + { ACL_DST, &dst } + }); + for(size_t i = 0; i < post_op_tensors_holder.size(); ++i) + { + gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i)); + } + gemm.run(gemm_pack); + + return dst; + } + + SimpleTensor compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias, + const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) + { + TensorShape dst_shape = lhs_shape; + dst_shape[0] = rhs_shape[0]; + dst_shape[1] = lhs_shape[1]; + + // Create reference + SimpleTensor lhs{ lhs_shape, data_type, 1 }; + SimpleTensor rhs{ rhs_shape, data_type, 1 }; + SimpleTensor bias{ dst_shape, data_type, 1 }; + // Create post op tensors and populate post op with them + auto populated_post_ops = experimental::transform_post_op_list_arguments>(post_ops, [&data_type](auto shape) + { + return SimpleTensor { shape, data_type, 1 }; + }); + + const int n = rhs_shape[0]; + const int m = lhs_shape[1]; + const int batch_size = lhs_shape[2]; + + // Fill reference + int tensor_idx = 0; + fill(lhs, tensor_idx++); + fill(rhs, tensor_idx++); + fill(bias, tensor_idx++); + for(auto &op : populated_post_ops.get_list()) + { + for(auto tensor : op->arguments()) + { + fill(*tensor, tensor_idx++); + } + } + + if(broadcast_bias) + { + // In case of broadcast, we need simply copy the first into the following "M" ones + for(int i = 1; i < m * batch_size; i++) + { + memcpy(bias.data() + i * n, bias.data(), n * sizeof(T)); + } + } + + SimpleTensor out; + out = reference::gemm(lhs, rhs, bias, alpha, beta); + // Ignore activation info if post ops are used instead + if(populated_post_ops.size() > 0) + { + out = reference::post_ops(out, populated_post_ops); + } + else + { + out = reference::activation_layer(out, act_info); + } + return out; + } + + bool validate_result = true; + TensorType _target{}; + SimpleTensor _reference{}; +}; + template class GEMMMatrixMultiplyReshapedOnlyRHS3DValidationFixture : public framework::Fixture { @@ -1829,6 +2066,213 @@ protected: SimpleTensor _reference{}; }; +template +class GEMMMatrixMultiplyNativeWithPostOpsValidationFixture : public framework::Fixture +{ +public: + using PostOpArgBroadcast = std::tuple; // Instruct fixture if we need broadcasting in dimension 0, 1, 2 of each PostOp argument +public: + 
template + void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, DataType data_type, float alpha, float beta, bool broadcast_bias, + const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) + { + GEMMLHSMatrixInfo lhs_info; + lhs_info.m0 = m0; + lhs_info.k0 = k0; + + GEMMRHSMatrixInfo rhs_info; + rhs_info.n0 = n0; + rhs_info.k0 = k0; + + // Set the tensor shapes for LHS and RHS matrices + const TensorShape lhs_shape(k, m, batch_size); + const TensorShape rhs_shape(n, k, batch_size); + const TensorShape bias_shape(n, + broadcast_bias ? 1 : m, + broadcast_bias ? 1 : batch_size); + const auto post_ops_with_shapes = experimental::transform_post_op_list_arguments(post_ops, + [ = ](auto broadcast) + { + return TensorShape + { + std::get<0>(broadcast) ? 1 : n, + std::get<1>(broadcast) ? 1 : m, + std::get<2>(broadcast) ? 1 : batch_size, + }; + }); + + _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes); + _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes); + } + +protected: + template + void fill(U &&tensor, int i) + { + static_assert(std::is_floating_point::value || std::is_same::value, "Only floating point data types supported."); + using DistributionType = typename std::conditional::value, arm_compute::utils::uniform_real_distribution_16bit, std::uniform_real_distribution>::type; + + DistributionType distribution{ T(-1.0f), T(1.0f) }; + library->fill(tensor, distribution, i); + + // Fill border with infinity in order to check the presence of NaN values (i.e. inf * 0) + DistributionType distribution_inf{ T(std::numeric_limits::infinity()), T(std::numeric_limits::infinity()) }; + library->fill_borders_with_garbage(tensor, distribution_inf, i); + } + + TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, + DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) + { + // Create tensors + TensorType lhs = create_tensor(lhs_shape, data_type, 1); + TensorType rhs = create_tensor(rhs_shape, data_type, 1); + TensorType bias = create_tensor(bias_shape, data_type, 1); + TensorType dst; + // Create post op tensors and populate post op with them + std::vector post_op_tensors_holder{}; + auto populated_post_ops = experimental::transform_post_op_list_arguments(post_ops, + [&post_op_tensors_holder, &data_type](auto shape) + { + auto t = create_tensor(shape, data_type, 1); + post_op_tensors_holder.push_back(std::move(t)); + return post_op_tensors_holder.back().info(); + }); + + const unsigned int M = lhs_shape[1]; + const unsigned int N = rhs_shape[0]; + const unsigned int K = lhs_shape[0]; + GEMMKernelInfo kernel_info; + kernel_info.m = M; + kernel_info.n = N; + kernel_info.k = K; + kernel_info.depth_output_gemm3d = 0; + kernel_info.reinterpret_input_as_3d = false; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = act_info; + kernel_info.post_ops = populated_post_ops; + + // Create and configure function + GEMMOperatorType gemm; + gemm.configure(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); + + ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); + 
ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); + for(const auto &tensor : post_op_tensors_holder) + { + ARM_COMPUTE_ASSERT(tensor.info()->is_resizable()); + } + + add_padding_x({ &lhs, &rhs, &bias, &dst }); + for(auto &tensor : post_op_tensors_holder) + { + add_padding_x({ &tensor }); + } + + // Allocate tensors + lhs.allocator()->allocate(); + rhs.allocator()->allocate(); + bias.allocator()->allocate(); + dst.allocator()->allocate(); + for(auto &tensor : post_op_tensors_holder) + { + tensor.allocator()->allocate(); + } + + ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); + for(const auto &tensor : post_op_tensors_holder) + { + ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable()); + } + + // Fill tensors + fill(AccessorType(lhs), 0); + fill(AccessorType(rhs), 1); + fill(AccessorType(bias), 2); + for(size_t i = 0; i < post_op_tensors_holder.size(); ++i) + { + fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i); + } + + // Compute GEMM + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, + { ACL_SRC_1, &rhs }, + { ACL_SRC_2, &bias }, + { ACL_DST, &dst } + }); + for(size_t i = 0; i < post_op_tensors_holder.size(); ++i) + { + gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i)); + } + gemm.run(gemm_pack); + + return dst; + } + + SimpleTensor compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias, + const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) + { + TensorShape dst_shape = lhs_shape; + dst_shape[0] = rhs_shape[0]; + dst_shape[1] = lhs_shape[1]; + + // Create reference + SimpleTensor lhs{ lhs_shape, data_type, 1 }; + SimpleTensor rhs{ rhs_shape, data_type, 1 }; + SimpleTensor bias{ dst_shape, data_type, 1 }; + // Create post op tensors and populate post op with them + auto populated_post_ops = experimental::transform_post_op_list_arguments>(post_ops, [&data_type](auto shape) + { + return SimpleTensor { shape, data_type, 1 }; + }); + + const int n = rhs_shape[0]; + const int m = lhs_shape[1]; + const int batch_size = lhs_shape[2]; + + // Fill reference + int tensor_idx = 0; + fill(lhs, tensor_idx++); + fill(rhs, tensor_idx++); + fill(bias, tensor_idx++); + for(auto &op : populated_post_ops.get_list()) + { + for(auto tensor : op->arguments()) + { + fill(*tensor, tensor_idx++); + } + } + + if(broadcast_bias) + { + // In case of broadcast, we need simply copy the first into the following "M" ones + for(int i = 1; i < m * batch_size; i++) + { + memcpy(bias.data() + i * n, bias.data(), n * sizeof(T)); + } + } + + SimpleTensor out; + out = reference::gemm(lhs, rhs, bias, alpha, beta); + // Ignore activation info if post ops are used instead + if(populated_post_ops.size() > 0) + { + out = reference::post_ops(out, populated_post_ops); + } + else + { + out = reference::activation_layer(out, act_info); + } + return out; + } + + TensorType _target{}; + SimpleTensor _reference{}; +}; + template class GEMMMatrixMultiplyNative3DValidationFixture : public framework::Fixture { -- cgit v1.2.1
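
Note on the listing above: the cgit rendering drops everything between angle brackets, so template arguments (and, most likely, the e-mail addresses in the sign-off lines) are missing from the quoted sources; calls such as post_ops.push_back_op>(...) are the remnants of push_back_op<...>(...) calls. The sketch below shows how one of the helper functions, post_ops_1() in GEMMMatrixMultiplyNative.cpp, plausibly reads with those arguments restored. It is a reconstruction under assumptions, not part of the patch: the PostOpAct and PostOpEltwiseAdd op types are taken from the library's experimental PostOp API (src/core/experimental/PostOp.h, which the patch includes) and are not visible in this rendering.

// Hedged sketch only; assumes the experimental PostOpAct / PostOpEltwiseAdd op types.
// PostOpArgBroadcast is the fixture's std::tuple<bool, bool, bool>, one flag per
// dimension (0, 1, 2) telling the fixture to broadcast that post-op argument.
experimental::PostOpList<PostOpArgBroadcast> post_ops_1()
{
    experimental::PostOpList<PostOpArgBroadcast> post_ops{};
    // First fused op: an activation applied to the GEMM result
    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(
        ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F });
    // Second fused op: element-wise addition of an extra tensor; the tuple flags
    // broadcasting in dims 0, 1 and 2, and the 0 is the position of the previous
    // op's destination among this op's arguments
    post_ops.push_back_op<experimental::PostOpEltwiseAdd<PostOpArgBroadcast>>(
        std::make_tuple(true, true, false),
        0,
        ConvertPolicy::SATURATE);
    // Third fused op: a second activation
    post_ops.push_back_op<experimental::PostOpAct<PostOpArgBroadcast>>(
        ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F });
    return post_ops;
}

The ValidateFusedPostOpsConfigs test cases build the same kind of list, but with pointers to TensorInfo objects as the op arguments, and feed it to is_post_op_list_valid() to check which broadcast shapes the fused kernels accept.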