From 1af5416917268692fcd4b34b1d7ffebd3a2aea8a Mon Sep 17 00:00:00 2001 From: SiCongLi Date: Wed, 6 Oct 2021 15:25:57 +0100 Subject: Add experimental PostOp interface to ClGemmMatrixMultiplyReshapedKernel Part 1 This interface supports the fusion of multiple elementwise operations Partially resolves: COMPMID-4435 Change-Id: If68dd7dd98dcf239fde7cb1f0a4a6d4d1e899a6f Signed-off-by: SiCongLi Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6483 Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice Comments-Addressed: Arm Jenkins --- tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp | 354 ++++++++++++++++++++- tests/validation/fixtures/GEMMFixture.h | 261 +++++++++++++++ tests/validation/reference/PostOps.cpp | 76 +++++ tests/validation/reference/PostOps.h | 47 +++ 4 files changed, 737 insertions(+), 1 deletion(-) create mode 100644 tests/validation/reference/PostOps.cpp create mode 100644 tests/validation/reference/PostOps.h (limited to 'tests') diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp index fd12dea4fe..b13c380470 100644 --- a/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp +++ b/tests/validation/CL/GEMMMatrixMultiplyReshaped.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLTensor.h" #include "arm_compute/runtime/CL/CLTensorAllocator.h" +#include "src/core/experimental/PostOp.h" #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h" #include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" #include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" @@ -61,11 +62,21 @@ using CLGEMMMatrixMultiplyReshaped = CLSynthetizeOperator using CLGEMMMatrixMultiplyReshapedFixture = GEMMMatrixMultiplyReshapedValidationFixture; +// Fixture for CLGEMMMatrixMultiplyReshaped with post ops +template +using CLGEMMMatrixMultiplyReshapedWithPostOpsFixture = + GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture; + // Fixture for CLGEMMMatrixMultiplyReshaped mixed precision template using CLGEMMMatrixMultiplyReshapedMixedPrecisionFixture = GEMMMatrixMultiplyReshapedValidationFixture; +// Fixture for CLGEMMMatrixMultiplyReshaped mixed precision with post ops +template +using CLGEMMMatrixMultiplyReshapedMixedPrecisionWithPostOpsFixture = + GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture; + // Fixture for CLGEMMMatrixMultiplyReshaped3D template using CLGEMMMatrixMultiplyReshaped3DFixture = GEMMMatrixMultiplyReshaped3DValidationFixture; @@ -172,6 +183,65 @@ const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", { /** LHS transposed values */ const auto lhs_transpose_values = framework::dataset::make("lhs_transpose", { false, true } ); +/** Post Ops */ +using PostOpArgBroadcast = CLGEMMMatrixMultiplyReshapedWithPostOpsFixture::PostOpArgBroadcast; +experimental::PostOpList empty_post_ops() +{ + return experimental::PostOpList{}; +} + +experimental::PostOpList post_ops_1() +{ + experimental::PostOpList post_ops{}; + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::LINEAR, 0.5F, 0.0F}); + post_ops.push_back_op>( + std::make_tuple(true, true, false), // If broadcast in dims 0, 1 and 2 + 0, + ConvertPolicy::SATURATE); + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); + return post_ops; +} +experimental::PostOpList post_ops_2() +{ + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( + std::make_tuple(false, true, true), 
// If broadcast in dims 0, 1 and 2 + 1, + ConvertPolicy::SATURATE); + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); + return post_ops; +} +experimental::PostOpList post_ops_3() +{ + experimental::PostOpList post_ops{}; + post_ops.push_back_op>(ActivationLayerInfo{ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F}); + post_ops.push_back_op>( + std::make_tuple(false, false, true), // If broadcast in dims 0, 1 and 2 + 1, + ConvertPolicy::SATURATE); + return post_ops; +} +experimental::PostOpList invalid_post_ops_1() +{ + experimental::PostOpList post_ops{}; + post_ops.push_back_op>( + std::make_tuple(true, true, false), // If broadcast in dims 0, 1 and 2 + 1, + ConvertPolicy::SATURATE); + post_ops.push_back_op>( + std::make_tuple(false, true, false), // If broadcast in dims 0, 1 and 2 + 0, + ConvertPolicy::SATURATE); + return post_ops; +} + +/** Different Post Op Lists */ +const auto post_op_lists = framework::dataset::make("post_op_lists", { + post_ops_1(), + post_ops_2(), + post_ops_3(), + } ); + } // namespace TEST_SUITE(CL) @@ -328,7 +398,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi framework::dataset::make("Expected", { true, true, false, false, false, true, true,true})), input0_info ,input1_info, input2_info, output_info, lhs_info, rhs_info, gemm_info, expected) { - ARM_COMPUTE_EXPECT(bool(ClGemmMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true), + ARM_COMPUTE_EXPECT(bool(ClGemmMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true), &input1_info.clone()->set_is_resizable(true), &input2_info.clone()->set_is_resizable(true), &output_info.clone()->set_is_resizable(true),1.f,1.f, @@ -336,6 +406,116 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zi rhs_info, gemm_info)) == expected, framework::LogLevel::ERRORS); } +DATA_TEST_CASE(ValidateFusedPosOps, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip(zip( + framework::dataset::make("Input0Info", { TensorInfo(TensorShape(64U, 5U, 2U), 1, DataType::F32), // OK. Empty post ops + TensorInfo(TensorShape(64U, 5U, 2U), 1, DataType::F32), // Invalid post op sequences + TensorInfo(TensorShape(64U, 5U, 2U), 1, DataType::F32), // OK. 
Supported post ops + + }), + framework::dataset::make("Input1Info",{ TensorInfo(TensorShape(64U, 6U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(64U, 6U, 2U), 1, DataType::F32), + TensorInfo(TensorShape(64U, 6U, 2U), 1, DataType::F32), + + })), + framework::dataset::make("Input2Info", { TensorInfo(TensorShape(21U), 1, DataType::F32), + TensorInfo(TensorShape(21U), 1, DataType::F32), + TensorInfo(TensorShape(21U), 1, DataType::F32), + + })), + framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(21U,17U,2U), 1, DataType::F32), + TensorInfo(TensorShape(21U,17U,2U), 1, DataType::F32), + TensorInfo(TensorShape(21U,17U,2U), 1, DataType::F32), + + })), + framework::dataset::make("LHSMInfo",{ + GEMMLHSMatrixInfo(4,4,1,false,true), + GEMMLHSMatrixInfo(4,4,1,false,true), + GEMMLHSMatrixInfo(4,4,1,false,true), + + })), + framework::dataset::make("RHSMInfo",{ + GEMMRHSMatrixInfo(4,4,1,true,true,false), + GEMMRHSMatrixInfo(4,4,1,true,true,false), + GEMMRHSMatrixInfo(4,4,1,true,true,false), + + + })), + + + framework::dataset::make("GEMMInfo",{ + GEMMKernelInfo( 17 /** post_op_tensor_infos; + auto populated_post_ops = experimental::transform_post_op_list_arguments(post_ops, + [&output_info, &post_op_tensor_infos](auto broadcast){ + post_op_tensor_infos.emplace_back(TensorShape{ + std::get<0>(broadcast) ? 1 : output_info.dimension(0), + std::get<1>(broadcast) ? 1 : output_info.dimension(1), + std::get<2>(broadcast) ? 1 : output_info.dimension(2) + }, 1, output_info.data_type()); + return &post_op_tensor_infos.back(); + }); + GEMMKernelInfo gemm_info_with_post_ops(std::move(gemm_info)); + gemm_info_with_post_ops.post_ops = populated_post_ops; + ARM_COMPUTE_EXPECT(bool(ClGemmMatrixMultiplyReshapedKernel::validate(&input0_info.clone()->set_is_resizable(true), + &input1_info.clone()->set_is_resizable(true), + &input2_info.clone()->set_is_resizable(true), + &output_info.clone()->set_is_resizable(true),1.f,1.f, + lhs_info, + rhs_info, + gemm_info_with_post_ops)) == expected, framework::LogLevel::ERRORS); +} TEST_SUITE(Float) TEST_SUITE(FP32) @@ -438,6 +618,37 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); } +TEST_SUITE(FusedPostOps) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + v0_values_precommit), + h0_values_precommit), + framework::dataset::make("interleave_lhs", { false })), + framework::dataset::make("interleave_rhs", { false })), + framework::dataset::make("export_to_cl_image_rhs", false)), + framework::dataset::make("DataType", DataType::F32)), + a_values_precommit), + beta_values_precommit), + framework::dataset::make("broadcast_bias", { true } )), + lhs_transpose_values), + act_values), + post_op_lists) + ) +{ + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); +} + +TEST_SUITE_END() // FusedPostOps + TEST_SUITE(ExportToCLImage) DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( framework::dataset::make("Input0Info", { TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F32), // OK or incorrect if cl_khr_image2d_from_buffer not 
supported @@ -704,6 +915,45 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::ARM_COMPUTE_PRINT_INFO(); } } +TEST_SUITE(FusedPostOps) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + v0_values_precommit), + h0_values_precommit), + framework::dataset::make("interleave_lhs", { false })), + framework::dataset::make("interleave_rhs", { false })), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("DataType", DataType::F32)), + a_values_precommit), + beta_values_precommit), + framework::dataset::make("broadcast_bias", { true } )), + lhs_transpose_values), + act_values), + post_op_lists) + ) +{ + // Validate output only if validate() is successful + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} + +TEST_SUITE_END() // FusedPostOps + TEST_SUITE_END() // ExportToCLImage TEST_SUITE_END() // FP32 @@ -809,6 +1059,37 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); } +TEST_SUITE(FusedPostOps) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + v0_values_precommit), + h0_values_precommit), + framework::dataset::make("interleave_lhs", { false })), + framework::dataset::make("interleave_rhs", { false })), + framework::dataset::make("export_to_cl_image_rhs", false)), + framework::dataset::make("DataType", DataType::F16)), + a_values_precommit), + beta_values_precommit), + framework::dataset::make("broadcast_bias", { true } )), + lhs_transpose_values), + act_values), + post_op_lists) + ) +{ + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); +} + +TEST_SUITE_END() // FusedPostOps + TEST_SUITE(ExportToCLImage) DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( framework::dataset::make("Input0Info", { TensorInfo(TensorShape(256U, 16U, 2U), 1, DataType::F16), // OK or incorrect if cl_khr_image2d_from_buffer not supported @@ -1075,6 +1356,45 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DFixture, framework::ARM_COMPUTE_PRINT_INFO(); } } +TEST_SUITE(FusedPostOps) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedWithPostOpsFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + v0_values_precommit), + h0_values_precommit), + framework::dataset::make("interleave_lhs", { false })), + 
framework::dataset::make("interleave_rhs", { false })), + framework::dataset::make("export_to_cl_image_rhs", true)), + framework::dataset::make("DataType", DataType::F16)), + a_values_precommit), + beta_values_precommit), + framework::dataset::make("broadcast_bias", { true } )), + lhs_transpose_values), + act_values), + post_op_lists) + ) +{ + // Validate output only if validate() is successful + if(validate_result) + { + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} + +TEST_SUITE_END() // FusedPostOps + TEST_SUITE_END() // ExportToCLImage TEST_SUITE_END() // FP16 @@ -1179,6 +1499,38 @@ FIXTURE_DATA_TEST_CASE(RunLarge3D, CLGEMMMatrixMultiplyReshaped3DMixedPrecisionF // Validate output validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision); } + +TEST_SUITE(FusedPostOps) + +FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyReshapedMixedPrecisionWithPostOpsFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + m_values, + n_values), + k_values), + b_values), + m0_values_precommit), + n0_values_precommit), + k0_values_precommit), + v0_values_precommit), + h0_values_precommit), + framework::dataset::make("interleave_lhs", { false })), + framework::dataset::make("interleave_rhs", { false })), + framework::dataset::make("export_to_cl_image_rhs", { true, false })), + framework::dataset::make("DataType", DataType::F16)), + a_values_precommit), + beta_values_precommit), + framework::dataset::make("broadcast_bias", { true } )), + lhs_transpose_values), + act_values), + post_op_lists) + ) +{ + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f16_mixed_precision, 0.f, abs_tolerance_f16_mixed_precision); +} + +TEST_SUITE_END() // FusedPostOps + TEST_SUITE_END() // MixedPrecision TEST_SUITE_END() // Float TEST_SUITE_END() // GEMMMatrixMultiplyReshaped diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h index 5f5fa3b653..e1191587d5 100644 --- a/tests/validation/fixtures/GEMMFixture.h +++ b/tests/validation/fixtures/GEMMFixture.h @@ -27,6 +27,8 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/experimental/IPostOp.h" +#include "src/core/experimental/PostOp.h" #include "tests/AssetsLibrary.h" #include "tests/Globals.h" #include "tests/IAccessor.h" @@ -34,7 +36,9 @@ #include "tests/framework/Fixture.h" #include "tests/validation/Helpers.h" #include "tests/validation/reference/ActivationLayer.h" +#include "tests/validation/reference/ElementwiseOperations.h" #include "tests/validation/reference/GEMM.h" +#include "tests/validation/reference/PostOps.h" #include @@ -915,6 +919,263 @@ protected: SimpleTensor _reference{}; }; +/** (EXPERIMENTAL_POST_OPS)*/ +template +class GEMMMatrixMultiplyReshapedWithPostOpsValidationFixture : public framework::Fixture +{ +public: + using PostOpArgBroadcast = std::tuple; // Instruct fixture if we need broadcasting in dimension 0, 1, 2 of each PostOp argument +public: + template + void setup(unsigned int m, unsigned int n, unsigned int k, unsigned int batch_size, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, 
unsigned int h0, bool interleave_lhs, + bool interleave_rhs, bool export_to_cl_image, DataType data_type, float alpha, float beta, bool broadcast_bias, bool lhs_transpose, const ActivationLayerInfo &act_info, + const experimental::PostOpList &post_ops) + { + GEMMLHSMatrixInfo lhs_info; + lhs_info.m0 = m0; + lhs_info.k0 = k0; + lhs_info.v0 = v0; + lhs_info.interleave = interleave_lhs; + lhs_info.transpose = lhs_transpose; + + GEMMRHSMatrixInfo rhs_info; + rhs_info.n0 = n0; + rhs_info.k0 = k0; + rhs_info.h0 = h0; + rhs_info.interleave = interleave_rhs; + rhs_info.transpose = !lhs_transpose; + rhs_info.export_to_cl_image = export_to_cl_image; + + // Set the tensor shapes for LHS and RHS matrices + const TensorShape lhs_shape(k, m, batch_size); + const TensorShape rhs_shape(n, k, batch_size); + const TensorShape bias_shape(n, + broadcast_bias ? 1 : m, + broadcast_bias ? 1 : batch_size); + auto post_ops_with_shapes = experimental::transform_post_op_list_arguments(post_ops, + [ = ](auto broadcast) + { + return TensorShape + { + std::get<0>(broadcast) ? 1 : n, + std::get<1>(broadcast) ? 1 : m, + std::get<2>(broadcast) ? 1 : batch_size, + }; + }); + + _target = compute_target(lhs_shape, rhs_shape, bias_shape, lhs_info, rhs_info, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes); + if(validate_result) + { + _reference = compute_reference(lhs_shape, rhs_shape, data_type, alpha, beta, broadcast_bias, act_info, post_ops_with_shapes); + } + } + +protected: + template + void fill(U &&tensor, int i) + { + static_assert(std::is_floating_point::value || std::is_same::value, "Only floating point data types supported."); + using DistributionType = typename std::conditional::value, arm_compute::utils::uniform_real_distribution_16bit, std::uniform_real_distribution>::type; + + DistributionType distribution{ T(-1.0f), T(1.0f) }; + library->fill(tensor, distribution, i); + + // Fill border with infinity in order to check the presence of NaN values (i.e. 
inf * 0) + DistributionType distribution_inf{ T(std::numeric_limits::infinity()), T(std::numeric_limits::infinity()) }; + library->fill_borders_with_garbage(tensor, distribution_inf, i); + } + + TensorType compute_target(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const TensorShape &bias_shape, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, + DataType data_type, float alpha, float beta, bool broadcast_bias, const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) + { + // Create tensors + TensorType lhs = create_tensor(lhs_shape, data_type, 1); + TensorType rhs = create_tensor(rhs_shape, data_type, 1); + TensorType bias = create_tensor(bias_shape, data_type, 1); + + // Create post op tensors and populate post op with them + std::vector post_op_tensors_holder{}; + auto populated_post_ops = experimental::transform_post_op_list_arguments(post_ops, + [&post_op_tensors_holder, &data_type](auto shape) + { + auto t = create_tensor(shape, data_type, 1); + post_op_tensors_holder.push_back(std::move(t)); + return post_op_tensors_holder.back().info(); + }); + TensorType lhs_reshaped; + TensorType rhs_reshaped; + TensorType dst; + + const unsigned int M = lhs_shape[1]; + const unsigned int N = rhs_shape[0]; + const unsigned int K = lhs_shape[0]; + GEMMKernelInfo kernel_info; + kernel_info.m = M; + kernel_info.n = N; + kernel_info.k = K; + kernel_info.depth_output_gemm3d = 0; + kernel_info.reinterpret_input_as_3d = false; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = act_info; + kernel_info.fp_mixed_precision = fp_mixed_precision; + kernel_info.post_ops = populated_post_ops; + + // The output tensor will be auto-initialized within the function + + // Create and configure function + ReshapeLHSOperatorType reshape_lhs; + ReshapeRHSOperatorType reshape_rhs; + GEMMOperatorType gemm; + + validate_result = bool(reshape_rhs.validate(rhs.info(), rhs_reshaped.info(), rhs_info)); + validate_result = validate_result || !rhs_info.export_to_cl_image; + if(!validate_result) + { + return nullptr; + } + + reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); + reshape_rhs.configure(rhs.info(), rhs_reshaped.info(), rhs_info); + gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); + + ARM_COMPUTE_ASSERT(lhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(rhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); + for(const auto &tensor : post_op_tensors_holder) + { + ARM_COMPUTE_ASSERT(tensor.info()->is_resizable()); + } + + // We do not pad when using image as it needs to comply to strict pitch alignment restrictions + if(!rhs_info.export_to_cl_image) + { + add_padding_x({ &lhs, &rhs, &lhs_reshaped, &rhs_reshaped, &bias, &dst }); + for(auto &tensor : post_op_tensors_holder) + { + add_padding_x({ &tensor }); + } + } + + // Allocate tensors + lhs.allocator()->allocate(); + rhs.allocator()->allocate(); + lhs_reshaped.allocator()->allocate(); + rhs_reshaped.allocator()->allocate(); + bias.allocator()->allocate(); + dst.allocator()->allocate(); + for(auto &tensor : post_op_tensors_holder) + { + tensor.allocator()->allocate(); + } + + ARM_COMPUTE_ASSERT(!lhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!rhs.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!lhs_reshaped.info()->is_resizable()); + ARM_COMPUTE_ASSERT(!rhs_reshaped.info()->is_resizable()); + 
ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); + for(const auto &tensor : post_op_tensors_holder) + { + ARM_COMPUTE_ASSERT(!tensor.info()->is_resizable()); + } + + // Fill tensors + fill(AccessorType(lhs), 0); + fill(AccessorType(rhs), 1); + fill(AccessorType(bias), 2); + for(size_t i = 0; i < post_op_tensors_holder.size(); ++i) + { + fill(AccessorType(post_op_tensors_holder.at(i)), 3 + i); + } + + // Compute GEMM + ITensorPack reshape_lhs_pack = { { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } }; + reshape_lhs.run(reshape_lhs_pack); + ITensorPack reshape_rhs_pack = { { ACL_SRC, &rhs }, { ACL_DST, &rhs_reshaped } }; + reshape_rhs.run(reshape_rhs_pack); + ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, + { ACL_SRC_1, &rhs_reshaped }, + { ACL_SRC_2, &bias }, + { ACL_DST, &dst } + }); + for(size_t i = 0; i < post_op_tensors_holder.size(); ++i) + { + gemm_pack.add_tensor(experimental::get_post_op_arg_type(i), &post_op_tensors_holder.at(i)); + } + gemm.run(gemm_pack); + + return dst; + } + + SimpleTensor compute_reference(const TensorShape &lhs_shape, const TensorShape &rhs_shape, DataType data_type, float alpha, float beta, bool broadcast_bias, + const ActivationLayerInfo &act_info, const experimental::PostOpList &post_ops) + { + TensorShape dst_shape = lhs_shape; + dst_shape[0] = rhs_shape[0]; + dst_shape[1] = lhs_shape[1]; + + // Create reference + SimpleTensor lhs{ lhs_shape, data_type, 1 }; + SimpleTensor rhs{ rhs_shape, data_type, 1 }; + SimpleTensor bias{ dst_shape, data_type, 1 }; + // Create post op tensors and populate post op with them + auto populated_post_ops = experimental::transform_post_op_list_arguments>(post_ops, [&data_type](auto shape) + { + return SimpleTensor { shape, data_type, 1 }; + }); + + const int n = rhs_shape[0]; + const int m = lhs_shape[1]; + const int batch_size = lhs_shape[2]; + + // Fill reference + int tensor_idx = 0; + fill(lhs, tensor_idx++); + fill(rhs, tensor_idx++); + fill(bias, tensor_idx++); + for(auto &op : populated_post_ops.get_list()) + { + for(auto tensor : op->arguments()) + { + fill(*tensor, tensor_idx++); + } + } + + if(broadcast_bias) + { + // In case of broadcast, we need simply copy the first into the following "M" ones + for(int i = 1; i < m * batch_size; i++) + { + memcpy(bias.data() + i * n, bias.data(), n * sizeof(T)); + } + } + + SimpleTensor out; + if(fp_mixed_precision) + { + out = reference::gemm_mixed_precision(lhs, rhs, bias, alpha, beta); + } + else + { + out = reference::gemm(lhs, rhs, bias, alpha, beta); + } + // Ignore activation info if post ops are used instead + if(populated_post_ops.size() > 0) + { + out = reference::post_ops(out, populated_post_ops); + } + else + { + out = reference::activation_layer(out, act_info); + } + return out; + } + + bool validate_result = true; + TensorType _target{}; + SimpleTensor _reference{}; +}; + template class GEMMMatrixMultiplyReshaped3DValidationFixture : public framework::Fixture { diff --git a/tests/validation/reference/PostOps.cpp b/tests/validation/reference/PostOps.cpp new file mode 100644 index 0000000000..1a8fb990c8 --- /dev/null +++ b/tests/validation/reference/PostOps.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "PostOps.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/experimental/PostOp.h"
+#include "tests/validation/reference/ActivationLayer.h"
+#include "tests/validation/reference/ElementwiseOperations.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type>
+SimpleTensor<T> post_ops(const SimpleTensor<T> &a, experimental::PostOpList<SimpleTensor<T>> post_ops)
+{
+    // Create reference
+    SimpleTensor<T> dst{ a };
+
+    for(auto &post_op : post_ops.get_list())
+    {
+        switch(post_op->type())
+        {
+            case experimental::PostOpType::Activation:
+            {
+                const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpAct<SimpleTensor<T>> *>(post_op.get());
+                dst = reference::activation_layer(dst, _post_op->_act_info);
+                break;
+            }
+            case experimental::PostOpType::Eltwise_Add:
+            {
+                const auto _post_op = utils::cast::polymorphic_downcast<const experimental::PostOpEltwiseAdd<SimpleTensor<T>> *>(post_op.get());
+                dst = reference::arithmetic_operation(ArithmeticOperation::ADD, dst, _post_op->_addend, dst, _post_op->_policy);
+                break;
+            }
+            default:
+            {
+                ARM_COMPUTE_ERROR("Unsupported PostOpType");
+            }
+        }
+    }
+    return dst;
+}
+
+template SimpleTensor<float> post_ops(const SimpleTensor<float> &a, experimental::PostOpList<SimpleTensor<float>> post_ops);
+template SimpleTensor<half> post_ops(const SimpleTensor<half> &a, experimental::PostOpList<SimpleTensor<half>> post_ops);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
\ No newline at end of file
diff --git a/tests/validation/reference/PostOps.h b/tests/validation/reference/PostOps.h
new file mode 100644
index 0000000000..5fe0fe71f5
--- /dev/null
+++ b/tests/validation/reference/PostOps.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_POSTOPS_H
+#define ARM_COMPUTE_TEST_POSTOPS_H
+
+#include "arm_compute/core/experimental/IPostOp.h"
+#include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+/** (EXPERIMENTAL_POST_OPS) */
+template <typename T, typename std::enable_if<is_floating_point<T>::value, int>::type = 0>
+SimpleTensor<T> post_ops(const SimpleTensor<T> &a, experimental::PostOpList<SimpleTensor<T>> post_ops);
+
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_POSTOPS_H */
-- 
cgit v1.2.1
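
The commit message states that the new interface supports fusing multiple elementwise operations after the GEMM. Below is a minimal caller-side sketch of building such a chain and attaching it to the kernel descriptor, following the pattern of post_ops_1() and the ValidateFusedPosOps test above. It assumes the experimental PostOpList/PostOpAct/PostOpEltwiseAdd types from src/core/experimental/PostOp.h and uses ITensorInfo* as the argument type; exact template arguments and constructor signatures are assumptions, not verified against the headers.

// Illustrative sketch only; not part of the patch.
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "src/core/experimental/PostOp.h"

using namespace arm_compute;

// Build a post-op chain: dst = relu(gemm_out) + addend
experimental::PostOpList<ITensorInfo *> make_post_ops(ITensorInfo *addend)
{
    experimental::PostOpList<ITensorInfo *> post_ops{};
    // Activation applied to the accumulated GEMM result
    post_ops.push_back_op<experimental::PostOpAct<ITensorInfo *>>(
        ActivationLayerInfo{ ActivationLayerInfo::ActivationFunction::RELU, 2.1F, 1.3F });
    // Elementwise addition with a (possibly broadcast) tensor; the integer mirrors
    // the second argument passed in post_ops_1()/post_ops_3() above (assumed to
    // select where the running GEMM result sits among this op's operands)
    post_ops.push_back_op<experimental::PostOpEltwiseAdd<ITensorInfo *>>(addend, 1, ConvertPolicy::SATURATE);
    return post_ops;
}

// The list is then attached to the kernel descriptor before validate()/configure(),
// as done in the ValidateFusedPosOps test:
//   GEMMKernelInfo gemm_info = ...;
//   gemm_info.post_ops = make_post_ops(addend_info);
//   ClGemmMatrixMultiplyReshapedKernel::validate(lhs, rhs, bias, dst, 1.f, 1.f, lhs_info, rhs_info, gemm_info);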