From 769c638959b8f8a11fe9d7880f3dcdebc181bb91 Mon Sep 17 00:00:00 2001
From: Manuel Bottini
Date: Thu, 22 Aug 2019 13:13:48 +0100
Subject: COMPMID-2314: Implement NEON INSTANCE_NORMALIZATION function

Change-Id: Ibaa574207aedf691953f8af8fa32b6408a1664ec
Signed-off-by: Manuel Bottini
Reviewed-on: https://review.mlplatform.org/c/1905
Comments-Addressed: Arm Jenkins
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
---
 arm_compute/core/NEON/NEKernels.h                  |   1 +
 .../kernels/NEInstanceNormalizationLayerKernel.h   |  98 +++++++++
 arm_compute/runtime/NEON/NEFunctions.h             |   1 +
 .../NEON/functions/NEInstanceNormalizationLayer.h  |  87 ++++++++
 .../kernels/NEInstanceNormalizationLayerKernel.cpp | 230 +++++++++++++++++++++
 .../functions/NEInstanceNormalizationLayer.cpp     |  88 ++++++++
 .../validation/NEON/InstanceNormalizationLayer.cpp | 140 +++++++++++++
 .../fixtures/InstanceNormalizationLayerFixture.h   | 147 +++++++++++++
 .../reference/InstanceNormalizationLayer.cpp       |  96 +++++++++
 .../reference/InstanceNormalizationLayer.h         |  44 ++++
 10 files changed, 932 insertions(+)
 create mode 100644 arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
 create mode 100644 arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
 create mode 100644 src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
 create mode 100644 src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
 create mode 100644 tests/validation/NEON/InstanceNormalizationLayer.cpp
 create mode 100644 tests/validation/fixtures/InstanceNormalizationLayerFixture.h
 create mode 100644 tests/validation/reference/InstanceNormalizationLayer.cpp
 create mode 100644 tests/validation/reference/InstanceNormalizationLayer.h
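[Note: the computation implemented by this patch normalizes each H x W plane of the
input independently, once per channel and per batch:
out = (in - mean_HW) * gamma / sqrt(var_HW + epsilon) + beta, where mean_HW and
var_HW are the mean and variance of that plane. The short sketches interleaved
between the diffs below are editorial illustrations and are not part of the
original commit.]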
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 8d8f7439a5..80bc74e135 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -100,6 +100,7 @@
 #include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
 #include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
 #include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
 #include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
diff --git a/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
new file mode 100644
index 0000000000..9745d266b8
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for performing an instance normalization */
+class NEInstanceNormalizationLayerKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEInstanceNormalizationLayerKernel";
+    }
+    /** Default constructor */
+    NEInstanceNormalizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEInstanceNormalizationLayerKernel(const NEInstanceNormalizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEInstanceNormalizationLayerKernel &operator=(const NEInstanceNormalizationLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEInstanceNormalizationLayerKernel(NEInstanceNormalizationLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEInstanceNormalizationLayerKernel &operator=(NEInstanceNormalizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEInstanceNormalizationLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in, out] input   Source tensor. Data types supported: F16/F32. Data layout supported: NCHW
+     * @param[out]     output  Destination tensor. Data types and data layouts supported: same as @p input.
+     * @param[in]      gamma   (Optional) The scale scalar value applied to the normalized tensor. Defaults to 1.0
+     * @param[in]      beta    (Optional) The offset scalar value applied to the normalized tensor. Defaults to 0.0
+     * @param[in]      epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+     */
+    void configure(ITensor *input, ITensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEInstanceNormalizationLayer.
+     *
+     * @param[in] input   Source tensor info. In case of @p output tensor = nullptr this tensor will store the result of the normalization.
+     *                    Data types supported: F16/F32. Data layout supported: NCHW
+     * @param[in] output  Destination tensor info. Data types and data layouts supported: same as @p input.
+     * @param[in] gamma   (Optional) The scale scalar value applied to the normalized tensor. Defaults to 1.0
+     * @param[in] beta    (Optional) The offset scalar value applied to the normalized tensor. Defaults to 0.0
+     * @param[in] epsilon (Optional) Lower bound value for the normalization.
+     *                    Defaults to 1e-12
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Common signature for all the specialized instance normalization functions
+     *
+     * @param[in, out] input   An input tensor. In case of @p output tensor = nullptr this tensor will store the result of the normalization.
+     * @param[out]     output  The output tensor.
+     * @param[in]      gamma   The scale scalar value applied to the normalized tensor. Defaults to 1.0
+     * @param[in]      beta    The offset scalar value applied to the normalized tensor. Defaults to 0.0
+     * @param[in]      epsilon Lower bound value for the normalization. Defaults to 1e-12
+     */
+    using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+
+    NormalizationFunction *_func;
+    ITensor               *_input;
+    ITensor               *_output;
+    float                  _gamma;
+    float                  _beta;
+    float                  _epsilon;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H__ */
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index 9dd7e5e5e7..09d3c65e25 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -92,6 +92,7 @@
 #include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h"
 #include "arm_compute/runtime/NEON/functions/NEHistogram.h"
 #include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
+#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
 #include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h"
 #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"
diff --git a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
new file mode 100644
index 0000000000..8e2ee37635
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H__
+#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H__
+
+#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to perform an instance normalization.
+ *
+ * This function runs the following kernels:
+ * -# @ref NEInstanceNormalizationLayerKernel
+ */
+class NEInstanceNormalizationLayer : public IFunction
+{
+public:
+    /** Constructor */
+    NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Set the input and output tensors.
+     *
+     * @param[in, out] input   Source tensor. In case of @p output tensor = nullptr this tensor will store the result of the normalization.
+     *                         Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+     * @param[out]     output  Destination tensor. Data types and data layouts supported: same as @p input.
+     * @param[in]      gamma   (Optional) The scale scalar value applied to the normalized tensor. Defaults to 1.0
+     * @param[in]      beta    (Optional) The offset scalar value applied to the normalized tensor. Defaults to 0.0
+     * @param[in]      epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+     */
+    void configure(ITensor *input, ITensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref NEInstanceNormalizationLayer.
+     *
+     * @param[in] input   Source tensor info. Data types supported: F16/F32. Data layout supported: NHWC, NCHW
+     * @param[in] output  Destination tensor info. Data types and data layouts supported: same as @p input.
+     * @param[in] gamma   (Optional) The scale scalar value applied to the normalized tensor. Defaults to 1.0
+     * @param[in] beta    (Optional) The offset scalar value applied to the normalized tensor. Defaults to 0.0
+     * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    MemoryGroup                        _memory_group;
+    NEInstanceNormalizationLayerKernel _normalization_kernel;
+    bool                               _is_nchw;
+    NEPermute                          _permute_input;
+    NEPermute                          _permute_output;
+    Tensor                             _permuted_input;
+    Tensor                             _permuted_output;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H__ */
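[Sketch: a minimal example of driving the new function through the API declared
above. The shape, data type and scalar values are illustrative, and error
handling/tensor filling are omitted.]

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    void run_instance_norm_example()
    {
        using namespace arm_compute;

        // 4D NCHW tensor: W=64, H=64, C=32, N=4
        Tensor src;
        Tensor dst;
        src.allocator()->init(TensorInfo(TensorShape(64U, 64U, 32U, 4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(64U, 64U, 32U, 4U), 1, DataType::F32));

        NEInstanceNormalizationLayer norm;
        norm.configure(&src, &dst, /* gamma */ 1.0f, /* beta */ 0.0f, /* epsilon */ 1e-12f);

        // Allocate backing memory only after configuration
        src.allocator()->allocate();
        dst.allocator()->allocate();

        // ... fill src with data ...
        norm.run();
    }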
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
new file mode 100644
index 0000000000..31d982c4e3
--- /dev/null
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -0,0 +1,230 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEMath.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace
+{
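+// Annotation: the routine below normalizes one H x W plane per (channel, batch)
+// pair. A first pass accumulates the plane's sum and sum of squares (vectorized,
+// with a scalar loop for the leftover elements) to derive the per-plane mean and
+// variance; a second pass writes (x - mean) * gamma / sqrt(var + epsilon) + beta.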
+template <typename T>
+void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
+{
+    /** NEON vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+
+    // Clear X/Y dimensions on execution window as we handle the planes manually
+    Window win = window;
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    win.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    constexpr int      window_step_x  = 16 / sizeof(T);
+    const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
+
+    Iterator input_it(input, win);
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        Window win_plane = window;
+        win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+        win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
+        win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
+
+        Iterator input_plane_it(input, win_plane);
+        Iterator output_plane_it(output, win_plane);
+
+        auto sum_h_w         = static_cast<T>(0.f);
+        auto sum_squares_h_w = static_cast<T>(0.f);
+
+        execute_window_loop(win_plane, [&](const Coordinates &)
+        {
+            const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
+
+            auto vec_sum_h_w         = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+            auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+
+            // Compute S elements per iteration
+            int x = window.x().start();
+            for(; x <= (window.x().end() - window_step_x); x += window_step_x)
+            {
+                auto vec_input_val  = wrapper::vloadq(input_ptr + x);
+                vec_sum_h_w         = wrapper::vadd(vec_sum_h_w, vec_input_val);
+                vec_sum_squares_h_w = wrapper::vadd(vec_sum_squares_h_w, wrapper::vmul(vec_input_val, vec_input_val));
+            }
+
+            auto vec2_sum_h_w         = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
+            auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
+            for(int i = 0; i < window_step_x / 4; ++i)
+            {
+                vec2_sum_h_w         = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+                vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+            }
+            sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
+            sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+
+            // Compute left-over elements
+            for(; x < window.x().end(); ++x)
+            {
+                const auto value = *(input_ptr + x);
+                sum_h_w += value;
+                sum_squares_h_w += value * value;
+            }
+        },
+        input_plane_it, output_plane_it);
+
+        const auto mean_h_w = sum_h_w / elements_plane;
+        const auto var_h_w  = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
+
+        const auto multip_h_w     = gamma / std::sqrt(var_h_w + epsilon);
+        const auto vec_mean_h_w   = wrapper::vdup_n(static_cast<T>(mean_h_w), ExactTagType{});
+        const auto vec_multip_h_w = wrapper::vdup_n(static_cast<T>(multip_h_w), ExactTagType{});
+        const auto vec_beta       = wrapper::vdup_n(static_cast<T>(beta), ExactTagType{});
+
+        execute_window_loop(win_plane, [&](const Coordinates &)
+        {
+            auto input_ptr  = reinterpret_cast<T *>(input_plane_it.ptr());
+            auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
+
+            // Compute S elements per iteration
+            int  x       = window.x().start();
+            auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
+            for(; x <= (window.x().end() - window_step_x); x += window_step_x)
+            {
+                vec_val = wrapper::vloadq(input_ptr + x);
+                vec_val = wrapper::vadd(wrapper::vmul(wrapper::vsub(vec_val, vec_mean_h_w), vec_multip_h_w), vec_beta);
+                wrapper::vstore(output_ptr + x, vec_val);
+            }
+
+            // Compute left-over elements
+            for(; x < window.x().end(); ++x)
+            {
+                *(output_ptr + x) = ((*(input_ptr + x)) - mean_h_w) * multip_h_w + beta;
+            }
+        },
+        input_plane_it, output_plane_it);
+ }, + input_it); +} + +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input); + ARM_COMPUTE_UNUSED(gamma); + ARM_COMPUTE_UNUSED(beta); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0"); + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, "NHWC data layout is not supported by the kernel directly"); + + if(output != nullptr && output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output); + } + + return Status{}; +} + +std::tuple validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + // We handle the planes manually + Window win = calculate_max_window(*input, Steps(1)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*output, input->tensor_shape(), 1, input->data_type()); + + // NEInstanceNormalizationLayerKernel doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output->set_valid_region(ValidRegion(coord, output->tensor_shape())); + return std::make_pair(Status{}, win); +} +} // namespace + +NEInstanceNormalizationLayerKernel::NEInstanceNormalizationLayerKernel() + : _func(nullptr), _input(nullptr), _output(nullptr), _gamma(1), _beta(0), _epsilon(1e-12) +{ +} + +void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *output, float gamma, float beta, float epsilon) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input); + + _input = input; + _output = output == nullptr ? input : output; + _gamma = gamma; + _beta = beta; + _epsilon = epsilon; + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _output->info(), gamma, beta, epsilon)); + + if(_input->info()->data_type() == DataType::F32) + { + _func = &instance_normalization_nchw; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + else if(_input->info()->data_type() == DataType::F16) + { + _func = &instance_normalization_nchw; + } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + else + { + ARM_COMPUTE_ERROR("Unsupported data type"); + } + + // Configure kernel window + auto win_config = validate_and_configure_window(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + + INEKernel::configure(std::get<1>(win_config)); +} + +Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, gamma, beta, epsilon)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? 
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+    return Status{};
+}
+
+void NEInstanceNormalizationLayerKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    (*_func)(_input, _output, _gamma, _beta, _epsilon, window);
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
new file mode 100644
index 0000000000..295f80af95
--- /dev/null
+++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+{
+}
+
+void NEInstanceNormalizationLayer::configure(ITensor *input, ITensor *output, float gamma, float beta, float epsilon)
+{
+    const DataLayout data_layout = input->info()->data_layout();
+
+    // Configure Kernels
+    _is_nchw = data_layout == DataLayout::NCHW;
+
+    if(!_is_nchw)
+    {
+        _memory_group.manage(&_permuted_input);
+        _memory_group.manage(&_permuted_output);
+
+        // Configure the function to transform the input tensor from NHWC -> NCHW
+        _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+        _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+
+        _normalization_kernel.configure(&_permuted_input, &_permuted_output, gamma, beta, epsilon);
+        _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+
+        _permute_output.configure(&_permuted_output, output != nullptr ? output : input, PermutationVector(2U, 0U, 1U));
+        _permuted_input.allocator()->allocate();
+        _permuted_output.allocator()->allocate();
+    }
+    else
+    {
+        _normalization_kernel.configure(input, output, gamma, beta, epsilon);
+    }
+}
+
+Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
+{
+    return NEInstanceNormalizationLayerKernel::validate(input, &output->clone()->set_data_layout(DataLayout::NCHW), gamma, beta, epsilon);
+}
+
+void NEInstanceNormalizationLayer::run()
+{
+    MemoryGroupResourceScope scope_mg(_memory_group);
+
+    // Permute input
+    if(!_is_nchw)
+    {
+        _permute_input.run();
+    }
+
+    NEScheduler::get().schedule(&_normalization_kernel, Window::DimZ);
+
+    // Permute output
+    if(!_is_nchw)
+    {
+        _permute_output.run();
+    }
+}
+} // namespace arm_compute
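[Sketch: the Status-returning validate() above supports a check-before-configure
pattern, which the validation tests below also exercise. Tensor infos are
illustrative.]

    // Query support first; configure only when validation succeeds
    const TensorInfo in_info(TensorShape(64U, 64U, 32U, 4U), 1, DataType::F32);
    const TensorInfo out_info(TensorShape(64U, 64U, 32U, 4U), 1, DataType::F32);
    const Status     status = NEInstanceNormalizationLayer::validate(&in_info, &out_info);
    if(bool(status))
    {
        // Safe to call configure() with tensors matching these infos
    }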
diff --git a/tests/validation/NEON/InstanceNormalizationLayer.cpp b/tests/validation/NEON/InstanceNormalizationLayer.cpp
new file mode 100644
index 0000000000..8356613368
--- /dev/null
+++ b/tests/validation/NEON/InstanceNormalizationLayer.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/InstanceNormalizationLayerFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+/** Tolerance for float operations */
+AbsoluteTolerance<float> tolerance_f32(0.001f);
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+AbsoluteTolerance<float> tolerance_f16(0.2f);
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+} // namespace
+
+TEST_SUITE(NEON)
+TEST_SUITE(InstanceNormalizationLayer)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
+    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(128U, 64U, 32U, 4U), 1, DataType::F32), // Mismatching data type input/output
+                                            TensorInfo(TensorShape(128U, 64U, 32U, 4U), 1, DataType::F32), // Mismatching shape input/output
+                                            TensorInfo(TensorShape(128U, 64U, 32U, 4U), 2, DataType::F32), // Number of Input channels != 1
+                                            TensorInfo(TensorShape(128U, 64U, 32U, 4U), 1, DataType::S16), // DataType != F32
+                                            TensorInfo(TensorShape(128U, 64U, 32U, 4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(128U, 64U, 32U, 4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(128U, 64U, 32U, 4U), 1, DataType::F32),
+                                            TensorInfo(TensorShape(128U, 64U, 32U, 4U), 1, DataType::F32)
+                                          }),
+    framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(128U, 64U, 32U, 4U), 1, DataType::F16),
+                                             TensorInfo(TensorShape(256U, 64U, 32U, 4U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(128U, 64U, 32U, 4U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(128U, 64U, 32U, 4U), 1, DataType::S16),
+                                             TensorInfo(TensorShape(128U, 64U, 32U, 4U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(128U, 64U, 32U, 4U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(128U, 64U, 32U, 4U), 1, DataType::F32),
+                                             TensorInfo(TensorShape(128U, 64U, 32U, 4U), 1, DataType::F32)
+                                           })),
+    framework::dataset::make("Expected", { false, false, false, false, true, true, true, true })),
+    input_info, output_info, expected)
+{
+    bool is_valid = bool(NEInstanceNormalizationLayer::validate(&input_info.clone()->set_is_resizable(false),
+                                                                &output_info.clone()->set_is_resizable(false)
+                                                               ));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using NEInstanceNormalizationLayerFixture = InstanceNormalizationLayerValidationFixture<Tensor, Accessor, NEInstanceNormalizationLayer, T>;
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEInstanceNormalizationLayerFixture<float>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(datasets::Small4DShapes(),
+                                               framework::dataset::make("DataType", DataType::F32)),
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                               framework::dataset::make("InPlace", { false, true })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEInstanceNormalizationLayerFixture<float>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::Large4DShapes(),
+                                               framework::dataset::make("DataType", DataType::F32)),
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW,
+                                                                                DataLayout::NHWC })),
+                               framework::dataset::make("InPlace", { false, true })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f32);
+}
+TEST_SUITE_END() // FP32
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEInstanceNormalizationLayerFixture<half>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(datasets::SmallShapes(),
+                                               framework::dataset::make("DataType", DataType::F16)),
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                               framework::dataset::make("InPlace", { false, true })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEInstanceNormalizationLayerFixture<half>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(datasets::LargeShapes(),
+                                               framework::dataset::make("DataType", DataType::F16)),
+                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                               framework::dataset::make("InPlace", { false, true })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_f16);
+}
+TEST_SUITE_END() // FP16
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+TEST_SUITE_END() // InstanceNormalizationLayer
+TEST_SUITE_END() // NEON
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/fixtures/InstanceNormalizationLayerFixture.h b/tests/validation/fixtures/InstanceNormalizationLayerFixture.h
new file mode 100644
index 0000000000..175ef2fb90
--- /dev/null
+++ b/tests/validation/fixtures/InstanceNormalizationLayerFixture.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_INSTANCENORMALIZATION_FIXTURE
+#define ARM_COMPUTE_TEST_INSTANCENORMALIZATION_FIXTURE
+
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/InstanceNormalizationLayer.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class InstanceNormalizationLayerValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, DataType data_type, DataLayout data_layout, bool in_place)
+    {
+        _target    = compute_target(shape, data_type, data_layout, in_place);
+        _reference = compute_reference(shape, data_type, data_layout);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor)
+    {
+        std::uniform_real_distribution<> distribution(1.f, 2.f);
+        library->fill(tensor, distribution, 0);
+    }
+
+    TensorType compute_target(TensorShape shape, DataType data_type, DataLayout data_layout, bool in_place)
+    {
+        if(data_layout == DataLayout::NHWC)
+        {
+            permute(shape, PermutationVector(2U, 0U, 1U));
+        }
+
+        std::mt19937                          gen(library->seed());
+        std::uniform_real_distribution<float> dist_gamma(1.f, 2.f);
+        std::uniform_real_distribution<float> dist_beta(-2.f, 2.f);
+        std::uniform_real_distribution<float> dist_epsilon(1e-16f, 1e-12f);
+
+        const float gamma   = dist_gamma(gen);
+        const float beta    = dist_beta(gen);
+        const float epsilon = dist_epsilon(gen);
+
+        // Create tensors
+        TensorType src = create_tensor<TensorType>(shape, data_type, 1, QuantizationInfo(), data_layout);
+        TensorType dst = create_tensor<TensorType>(shape, data_type, 1, QuantizationInfo(), data_layout);
+
+        // Create and configure function
+        FunctionType instance_norm_func;
+        instance_norm_func.configure(&src, in_place ? nullptr : &dst, gamma, beta, epsilon);
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        if(!in_place)
+        {
+            ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        }
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        if(!in_place)
+        {
+            dst.allocator()->allocate();
+        }
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        if(!in_place)
+        {
+            ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+        }
+
+        // Fill tensors
+        fill(AccessorType(src));
+
+        // Compute function
+        instance_norm_func.run();
+
+        if(in_place)
+        {
+            return src;
+        }
+        else
+        {
+            return dst;
+        }
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type, DataLayout data_layout)
+    {
+        std::mt19937                          gen(library->seed());
+        std::uniform_real_distribution<float> dist_gamma(1.f, 2.f);
+        std::uniform_real_distribution<float> dist_beta(-2.f, 2.f);
+        std::uniform_real_distribution<float> dist_epsilon(1e-16f, 1e-12f);
+
+        const float gamma   = dist_gamma(gen);
+        const float beta    = dist_beta(gen);
+        const float epsilon = dist_epsilon(gen);
+
+        // Create reference
+        SimpleTensor<T> src{ shape, data_type };
+
+        // Fill reference
+        fill(src);
+
+        return reference::instance_normalization(src, gamma, beta, epsilon);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_INSTANCENORMALIZATION_FIXTURE */
diff --git a/tests/validation/reference/InstanceNormalizationLayer.cpp b/tests/validation/reference/InstanceNormalizationLayer.cpp
new file mode 100644
index 0000000000..0e5c02aa99
--- /dev/null
+++ b/tests/validation/reference/InstanceNormalizationLayer.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "InstanceNormalizationLayer.h"
+
+#include "tests/validation/Helpers.h"
+
+#include <algorithm>
+#include <cmath>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> instance_normalization(const SimpleTensor<T> &src, float gamma, float beta, float epsilon)
+{
+    SimpleTensor<T> dst{ src.shape(), src.data_type() };
+
+    // NCHW
+    const size_t w_size = src.shape()[0];
+    const size_t h_size = src.shape()[1];
+    const size_t c_size = src.shape()[2];
+    const size_t n_size = src.shape()[3];
+
+    for(size_t n_i = 0; n_i < n_size; ++n_i)
+    {
+        for(size_t c_i = 0; c_i < c_size; ++c_i)
+        {
+            float sum_h_w = 0;
+            // Compute mean
+            for(size_t h_i = 0; h_i < h_size; ++h_i)
+            {
+                for(size_t w_i = 0; w_i < w_size; ++w_i)
+                {
+                    sum_h_w += src[coord2index(src.shape(), Coordinates(w_i, h_i, c_i, n_i))];
+                }
+            }
+            const float mean_h_w = sum_h_w / (h_size * w_size);
+
+            // Compute variance
+            float partial_var_h_w = 0;
+            for(size_t h_i = 0; h_i < h_size; ++h_i)
+            {
+                for(size_t w_i = 0; w_i < w_size; ++w_i)
+                {
+                    partial_var_h_w += std::pow(src[coord2index(src.shape(), Coordinates(w_i, h_i, c_i, n_i))] - mean_h_w, 2);
+                }
+            }
+            const float var_h_w = partial_var_h_w / (h_size * w_size);
+
+            // Apply normalization
+            for(size_t h_i = 0; h_i < h_size; ++h_i)
+            {
+                for(size_t w_i = 0; w_i < w_size; ++w_i)
+                {
+                    // Compute output
+                    size_t index = coord2index(src.shape(), Coordinates(w_i, h_i, c_i, n_i));
+                    dst[index]   = (src[index] - mean_h_w) * gamma / std::sqrt(var_h_w + epsilon) + beta;
+                }
+            }
+        }
+    }
+    return dst;
+}
+
+template SimpleTensor<float> instance_normalization(const SimpleTensor<float> &src, float gamma, float beta, float epsilon);
+template SimpleTensor<half> instance_normalization(const SimpleTensor<half> &src, float gamma, float beta, float epsilon);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
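[Sketch: a tiny standalone program mirroring the reference formula above on a
single 2x2 plane with gamma = 1 and beta = 0, to make the per-plane semantics
concrete. All values are illustrative.]

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // One 2x2 H x W plane
        const float plane[4] = { 1.f, 2.f, 3.f, 4.f };
        const float mean     = (1.f + 2.f + 3.f + 4.f) / 4.f; // 2.5
        float       var      = 0.f;
        for(float v : plane)
        {
            var += (v - mean) * (v - mean);
        }
        var /= 4.f; // 1.25
        for(float v : plane)
        {
            // Matches dst[index] = (src[index] - mean) * gamma / sqrt(var + epsilon) + beta
            std::printf("%f\n", (v - mean) / std::sqrt(var + 1e-12f));
        }
        return 0;
    }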
diff --git a/tests/validation/reference/InstanceNormalizationLayer.h b/tests/validation/reference/InstanceNormalizationLayer.h
new file mode 100644
index 0000000000..2926e09f1b
--- /dev/null
+++ b/tests/validation/reference/InstanceNormalizationLayer.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_INSTANCENORMALIZATION_H__
+#define __ARM_COMPUTE_TEST_INSTANCENORMALIZATION_H__
+
+#include "tests/SimpleTensor.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+template <typename T>
+SimpleTensor<T> instance_normalization(const SimpleTensor<T> &src, float gamma, float beta, float epsilon);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_INSTANCENORMALIZATION_H__ */
-- 
cgit v1.2.1
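[Sketch: in-place operation. Per the kernel documentation in this patch, passing
output == nullptr writes the normalization result back into the input tensor, and
the test fixture exercises this via its InPlace dataset. Assuming a configured
and allocated src tensor as in the earlier sketch:]

    // In-place: pass nullptr as the output; src is overwritten with the result
    NEInstanceNormalizationLayer norm_in_place;
    norm_in_place.configure(&src, nullptr, 1.0f, 0.0f, 1e-12f);
    norm_in_place.run();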