 arm_compute/core/PixelValue.h                            |  50
 arm_compute/core/Types.h                                 |   9
 arm_compute/runtime/CL/functions/CLReductionOperation.h  |   8
 src/core/CL/cl_kernels/reduction_operation.cl            |  68
 src/core/CL/kernels/CLFillBorderKernel.cpp               |  28
 src/core/CL/kernels/CLReductionOperationKernel.cpp       |  72
 src/core/NEON/kernels/NEFillBorderKernel.cpp             |   2
 src/runtime/CL/functions/CLReductionOperation.cpp        | 100
 src/runtime/NEON/functions/NEIntegralImage.cpp           |   4
 src/runtime/NEON/functions/NEScale.cpp                   |   4
 tests/datasets/ReductionOperationDataset.h               |  53
 tests/validation/CL/ReductionOperation.cpp               |  20
 tests/validation/NEON/ReductionOperation.cpp             |  12
 tests/validation/reference/ReductionOperation.cpp        |  11
 utils/TypePrinter.h                                      |   5
 15 files changed, 256 insertions(+), 190 deletions(-)
diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h
index 530cc166a2..e86eeba121 100644
--- a/arm_compute/core/PixelValue.h
+++ b/arm_compute/core/PixelValue.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,6 +39,54 @@ public:
: value{ uint64_t(0) }
{
}
+ /** Initialize the union with a pixel value of the chosen data type
+ *
+ * @param[in] v        Integer value to be stored
+ * @param[in] datatype DataType in which @p v has to be stored
+ */
+ PixelValue(uint64_t v, DataType datatype)
+ : PixelValue()
+ {
+ switch(datatype)
+ {
+ case DataType::U8:
+ value.u8 = static_cast<uint8_t>(v);
+ break;
+ case DataType::S8:
+ value.s8 = static_cast<int8_t>(v);
+ break;
+ case DataType::U16:
+ value.u16 = static_cast<uint16_t>(v);
+ break;
+ case DataType::S16:
+ value.s16 = static_cast<int16_t>(v);
+ break;
+ case DataType::U32:
+ value.u32 = static_cast<uint32_t>(v);
+ break;
+ case DataType::S32:
+ value.s32 = static_cast<int32_t>(v);
+ break;
+ case DataType::U64:
+ value.u64 = static_cast<uint64_t>(v);
+ break;
+ case DataType::S64:
+ value.s64 = static_cast<int64_t>(v);
+ break;
+ case DataType::F16:
+ value.f16 = static_cast<half>(v);
+ break;
+ case DataType::F32:
+ value.f32 = static_cast<float>(v);
+ break;
+ case DataType::F64:
+ value.f64 = static_cast<double>(v);
+ break;
+ default:
+ value.u64 = v;
+ break;
+ }
+ }
/** Initialize the union with a U8 pixel value
*
* @param[in] v U8 value.
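For illustration, a minimal usage sketch of the new data-type-aware constructor (not part of the patch; the PixelValue::get overload used here is assumed from the existing class):

    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/core/Types.h"

    int main()
    {
        using namespace arm_compute;

        // Store the multiplicative identity as an F32 pixel; which union
        // member gets written depends on the DataType switch above.
        PixelValue one(1, DataType::F32);

        float v = 0.f;
        one.get(v); // v is now 1.0f
        return v == 1.f ? 0 : 1;
    }

This is exactly the shape of call the runtime makes below when it needs PixelValue(1, input->info()->data_type()) as the border fill for PROD.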
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index dc87617f55..317c8990fa 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -538,11 +538,12 @@ enum class NonLinearFilterFunction : unsigned
/** Available reduction operations */
enum class ReductionOperation
{
- SUM_SQUARE, /**< Sum of squares */
- SUM, /**< Sum */
- MEAN_SUM, /**< Mean of sum */
ARG_IDX_MAX, /**< Index of the max value */
- ARG_IDX_MIN /**< Index of the min value */
+ ARG_IDX_MIN, /**< Index of the min value */
+ MEAN_SUM, /**< Mean of sum */
+ PROD, /**< Product */
+ SUM_SQUARE, /**< Sum of squares */
+ SUM /**< Sum */
};
/** Available element-wise operations */
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index ac35b7bfc6..977562d993 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -76,12 +76,12 @@ public:
private:
CLMemoryGroup _memory_group;
- std::unique_ptr<CLTensor[]> _sums_vector{ nullptr };
+ std::unique_ptr<CLTensor[]> _results_vector{ nullptr };
std::unique_ptr<CLReductionOperationKernel[]> _reduction_kernels_vector{ nullptr };
std::unique_ptr<CLFillBorderKernel[]> _border_handlers_vector{ nullptr };
unsigned int _num_of_stages;
unsigned int _reduction_axis;
- bool _is_quantized;
+ bool _is_serial;
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLREDUCTIONOPERATION_H__ */
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index d1f47beda7..b4ede25296 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -60,12 +60,31 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input)
return (in.s0 + in.s1);
}
+
+/** Calculate product of a vector
+ *
+ * @param[in] input Pointer to the first pixel.
+ *
+ * @return product of vector.
+ */
+inline DATA_TYPE product(__global const DATA_TYPE *input)
+{
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ in = vload16(0, input);
+
+ in.s01234567 *= in.s89ABCDEF;
+ in.s0123 *= in.s4567;
+ in.s01 *= in.s23;
+
+ return (in.s0 * in.s1);
+}
#if defined(OPERATION)
/** This kernel performs parallel reduction given an operation on x-axis.
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
* @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product; otherwise the sum is used
* @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
@@ -74,28 +93,28 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input)
* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] partial_sum_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptt
- * @param[in] partial_sum_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] partial_sum_step_x partial_sum_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] partial_sum_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] partial_sum_step_y partial_sum_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] partial_sum_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] local_sums Local buffer for storing the partial sum
+ * @param[in] partial_res_ptr The local buffer to hold partial result values. Supported data types: same as @p src_ptr
+ * @param[in] partial_res_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] partial_res_step_x partial_res_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] partial_res_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] partial_res_step_y partial_res_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] local_results Local buffer for storing the partial result
*/
__kernel void reduction_operation_x(
IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(partial_sum),
- __local DATA_TYPE *local_sums)
+ IMAGE_DECLARATION(partial_res),
+ __local DATA_TYPE *local_results)
{
Image src = CONVERT_TO_IMAGE_STRUCT(src);
- Image partial_sum = CONVERT_TO_IMAGE_STRUCT(partial_sum);
+ Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
unsigned int lsize = get_local_size(0);
unsigned int lid = get_local_id(0);
for(unsigned int y = 0; y < get_local_size(1); ++y)
{
- local_sums[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
+ local_results[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
barrier(CLK_LOCAL_MEM_FENCE);
// Perform parallel reduction
@@ -103,7 +122,11 @@ __kernel void reduction_operation_x(
{
if(lid < i)
{
- local_sums[lid] += local_sums[lid + i];
+#if defined(PROD)
+ local_results[lid] *= local_results[lid + i];
+#else //!defined(PROD)
+ local_results[lid] += local_results[lid + i];
+#endif //defined(PROD)
}
barrier(CLK_LOCAL_MEM_FENCE);
}
@@ -113,10 +136,10 @@ __kernel void reduction_operation_x(
#if defined(MEAN) && defined(WIDTH)
if(y == get_local_size(1) - 1)
{
- local_sums[0] /= WIDTH;
+ local_results[0] /= WIDTH;
}
#endif /* defined(MEAN) && defined(WIDTH) */
- ((__global DATA_TYPE *)offset(&partial_sum, get_group_id(0), y))[0] = local_sums[0];
+ ((__global DATA_TYPE *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
}
}
}
@@ -127,6 +150,7 @@ __kernel void reduction_operation_x(
*
* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
* @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product; otherwise the sum is used
* @note In case of ARG_MIN and ARG_MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
*
* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 and QASYMM8 for operation MEAN
@@ -230,7 +254,11 @@ __kernel void reduction_operation_y(
#if defined(SUM_SQUARE)
in *= in;
#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+ res *= in;
+#else //!defined(PROD)
res += in;
+#endif //defined(PROD)
#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
@@ -304,7 +332,11 @@ __kernel void reduction_operation_z(
#if defined(SUM_SQUARE)
in *= in;
#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+ res *= in;
+#else //!defined(PROD)
res += in;
+#endif //defined(PROD)
#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
@@ -383,7 +415,11 @@ __kernel void reduction_operation_w(
#if defined(SUM_SQUARE)
in *= in;
#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+ res *= in;
+#else //!defined(PROD)
res += in;
+#endif //defined(PROD)
#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
@@ -397,4 +433,4 @@ __kernel void reduction_operation_w(
vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
#endif // defined(ARG_MAX) || defined(ARG_MIN)
}
-#endif /* defined(BATCH) && defined(DEPTH) */
\ No newline at end of file
+#endif /* defined(BATCH) && defined(DEPTH) */
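As an aside, the pairwise fold that the new product() helper performs on a 16-wide OpenCL vector can be sketched in plain C++ (a stand-in for the vector ops, not part of the patch): halve the active width each step, multiplying the two halves together.

    #include <array>
    #include <cassert>
    #include <cstddef>

    static float product16(const std::array<float, 16> &v)
    {
        std::array<float, 16> in = v;
        for(std::size_t width = 8; width >= 1; width /= 2)
        {
            for(std::size_t i = 0; i < width; ++i)
            {
                in[i] *= in[i + width]; // mirrors in.s01234567 *= in.s89ABCDEF, etc.
            }
        }
        return in[0];
    }

    int main()
    {
        std::array<float, 16> v;
        v.fill(2.f);
        assert(product16(v) == 65536.f); // 2^16
    }

The in-kernel tree reduction over local_results follows the same halving pattern, with the -DPROD branch swapping += for *=.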
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 69206678d0..5fdb826f8b 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -75,25 +75,18 @@ void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, Bo
// Select appropriate kernel
std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
- // Define select type required by replicate border > 1
- const DataType dt = tensor->info()->data_type();
- std::string select_type = get_underlying_cl_type_from_data_type(dt);
- if(is_data_type_float(dt))
- {
- select_type = (DataType::F32 == dt) ? "int" : "short";
- }
+ const DataType dt = tensor->info()->data_type();
// Define build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt)));
- build_opts.emplace(("-DSELECT_TYPE=" + select_type));
- build_opts.emplace(("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top)));
- build_opts.emplace(("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom)));
- build_opts.emplace(("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left)));
- build_opts.emplace(("-DBORDER_SIZE_RIGHT=" + support::cpp11::to_string(border_size.right)));
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(dt));
+ build_opts.add_option("-DBORDER_SIZE_TOP=" + support::cpp11::to_string(border_size.top));
+ build_opts.add_option("-DBORDER_SIZE_BOTTOM=" + support::cpp11::to_string(border_size.bottom));
+ build_opts.add_option("-DBORDER_SIZE_LEFT=" + support::cpp11::to_string(border_size.left));
+ build_opts.add_option("-DBORDER_SIZE_RIGHT=" + support::cpp11::to_string(border_size.right));
// Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
_tensor = tensor;
// Create static kernel arguments
@@ -141,8 +134,9 @@ void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, Bo
set_constant_border<float>(idx, constant_border_value);
break;
case DataType::F16:
+ static_assert(sizeof(cl_half) == sizeof(half), "Half must be same size as cl_half");
static_assert(sizeof(cl_half) == 2, "Half must be 16 bit");
- set_constant_border<cl_half>(idx, constant_border_value);
+ set_constant_border<half>(idx, constant_border_value);
break;
default:
ARM_COMPUTE_ERROR("Not handled");
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 959209edc0..45aa810517 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,13 +80,13 @@ std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITe
const unsigned int num_elems_processed_per_iteration = (is_data_type_quantized(input->data_type()) && (axis == 0)) ? 1 : 16;
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
bool window_changed = false;
- const bool is_arg_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
+ const bool is_serial_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(input->data_type()));
switch(axis)
{
case 0:
{
- if(is_data_type_quantized(input->data_type()) || is_arg_op)
+ if(is_serial_op)
{
AccessWindowHorizontal input_access(input, 0, input->dimension(0));
AccessWindowHorizontal output_access(output, 0, 1);
@@ -153,10 +153,11 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
}
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
- build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE=");
+ build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX");
build_opts.add_option_if(op == ReductionOperation::ARG_IDX_MIN, "-DARG_MIN");
+ build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
switch(op)
{
@@ -170,6 +171,9 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
case ReductionOperation::ARG_IDX_MAX:
case ReductionOperation::ARG_IDX_MIN:
break;
+ case ReductionOperation::PROD:
+ build_opts.add_option(("-DOPERATION=product"));
+ break;
default:
ARM_COMPUTE_ERROR("Unsupported reduction operation");
}
@@ -177,12 +181,18 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
// Create kernel
cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange();
std::string kernel_axis_name;
- const bool is_arg_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
+ const bool is_serial_op = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(input->info()->data_type()));
switch(axis)
{
case 0:
{
- if(!is_data_type_quantized(input->info()->data_type()) && !is_arg_op)
+ if(is_serial_op)
+ {
+ build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
+ build_opts.add_option_if_else(_input->info()->data_type() == DataType::F32, "-DCOND_DATA_TYPE=int", "-DCOND_DATA_TYPE=short");
+ kernel_axis_name = "non_parallel_x";
+ }
+ else
{
build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DWIDTH=" + support::cpp11::to_string(width));
const unsigned int width_leftover = input->info()->dimension(0) % border_val;
@@ -195,12 +205,6 @@ void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *ou
lws_hint = cl::NDRange(std::min(8U, num_of_threads));
_border_size = BorderSize(0, border_width, 0, 0);
}
- else
- {
- build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
- build_opts.add_option_if_else(_input->info()->data_type() == DataType::F32, "-DCOND_DATA_TYPE=int", "-DCOND_DATA_TYPE=short");
- kernel_axis_name = "non_parallel_x";
- }
}
break;
case 1:
@@ -242,13 +246,31 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- const bool is_arg_op = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN);
+ const bool is_serial_op = (_op == ReductionOperation::ARG_IDX_MAX || _op == ReductionOperation::ARG_IDX_MIN || is_data_type_quantized(_input->info()->data_type()));
switch(_reduction_axis)
{
case 0:
{
// We use parallel reduction only in non quantized types
- if(!is_data_type_quantized(_input->info()->data_type()) && !is_arg_op)
+ if(is_serial_op)
+ {
+ // Get first input and output slices
+ Window window_in{ window };
+ window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+
+ Window in_slice = window.first_slice_window_1D();
+ Window out_slice = window.first_slice_window_1D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_1D_tensor_argument(idx, _input, in_slice);
+ add_1D_tensor_argument(idx, _output, out_slice);
+ enqueue(queue, *this, in_slice);
+ }
+ while(window_in.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
+ }
+ else
{
// Set out window
Window out_window(window);
@@ -263,8 +285,8 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), in_slice.x().end() + border_width, in_slice.x().step()));
// Set local sums buffer
- unsigned int local_sum_size = lws_hint()[0] * _input->info()->element_size();
- _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_sum_size, nullptr);
+ unsigned int local_res_size = lws_hint()[0] * _input->info()->element_size();
+ _kernel.setArg(num_arguments_per_2D_tensor() * 2, local_res_size, nullptr);
do
{
@@ -275,24 +297,6 @@ void CLReductionOperationKernel::run(const Window &window, cl::CommandQueue &que
}
while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
}
- else
- {
- // Get first input and output slices
- Window window_in{ window };
- window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
-
- Window in_slice = window.first_slice_window_1D();
- Window out_slice = window.first_slice_window_1D();
-
- do
- {
- unsigned int idx = 0;
- add_1D_tensor_argument(idx, _input, in_slice);
- add_1D_tensor_argument(idx, _output, out_slice);
- enqueue(queue, *this, in_slice);
- }
- while(window_in.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(out_slice));
- }
}
break;
case 1:
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 39bcd996f9..f4046e0851 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index c5447ffd6b..e2dec6b375 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,15 +56,19 @@ unsigned int calculate_number_of_stages(const ITensorInfo *input, unsigned int a
} // namespace
CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _sums_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_quantized()
+ : _memory_group(std::move(memory_manager)), _results_vector(), _reduction_kernels_vector(), _border_handlers_vector(), _num_of_stages(), _reduction_axis(), _is_serial()
{
}
Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
const unsigned int num_of_stages = calculate_number_of_stages(input, axis);
-
- if(axis == 0 && !is_data_type_quantized(input->data_type()))
+ bool is_serial = is_data_type_quantized(input->data_type()) || axis != 0;
+ if(is_serial)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
+ }
+ else
{
// Create temporary tensor infos
auto sums_vector = arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_stages - 1);
@@ -81,17 +85,25 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf
}
ReductionOperation first_kernel_op;
+ ReductionOperation intermediate_kernel_op;
ReductionOperation last_kernel_op;
switch(op)
{
case ReductionOperation::SUM:
case ReductionOperation::MEAN_SUM:
- first_kernel_op = ReductionOperation::SUM;
- last_kernel_op = op;
+ first_kernel_op = ReductionOperation::SUM;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = op;
break;
case ReductionOperation::SUM_SQUARE:
- first_kernel_op = ReductionOperation::SUM_SQUARE;
- last_kernel_op = ReductionOperation::SUM;
+ first_kernel_op = ReductionOperation::SUM_SQUARE;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = ReductionOperation::SUM;
+ break;
+ case ReductionOperation::PROD:
+ first_kernel_op = ReductionOperation::PROD;
+ intermediate_kernel_op = ReductionOperation::PROD;
+ last_kernel_op = ReductionOperation::PROD;
break;
default:
ARM_COMPUTE_ERROR("Not supported");
@@ -103,17 +115,13 @@ Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInf
// Validate ReductionOperation on intermediate stages
for(unsigned int i = 1; i < num_of_stages - 1; ++i)
{
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, ReductionOperation::SUM));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + i - 1, sums_vector.get() + i, axis, intermediate_kernel_op));
}
// Validate ReductionOperation on the last stage
const unsigned int last_stage = num_of_stages - 1;
ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input->dimension(0)));
}
- else
- {
- ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output, axis, op));
- }
return Status{};
}
@@ -122,65 +130,77 @@ void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsign
{
_num_of_stages = calculate_number_of_stages(input->info(), axis);
_reduction_axis = axis;
- _is_quantized = is_data_type_quantized(input->info()->data_type());
+ _is_serial = is_data_type_quantized(input->info()->data_type()) || axis != 0;
// Configure reduction operation kernels
_reduction_kernels_vector = arm_compute::support::cpp14::make_unique<CLReductionOperationKernel[]>(_num_of_stages);
// Create temporary tensors
- if(axis == 0 && !_is_quantized)
+ if(_is_serial)
+ {
+ _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+ }
+ else
{
_border_handlers_vector = arm_compute::support::cpp14::make_unique<CLFillBorderKernel[]>(_num_of_stages);
- _sums_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
+ _results_vector = arm_compute::support::cpp14::make_unique<CLTensor[]>(_num_of_stages - 1);
TensorShape shape{ input->info()->tensor_shape() };
for(unsigned int i = 0; i < _num_of_stages - 1; i++)
{
shape.set(0, ceil(shape.x() / 128.f));
- _sums_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
+ _results_vector[i].allocator()->init(input->info()->clone()->set_tensor_shape(shape));
}
// Apply ReductionOperation only on first kernel
- _memory_group.manage(_sums_vector.get());
+ _memory_group.manage(_results_vector.get());
ReductionOperation first_kernel_op;
+ ReductionOperation intermediate_kernel_op;
ReductionOperation last_kernel_op;
+ PixelValue pixelValue;
switch(op)
{
case ReductionOperation::SUM:
case ReductionOperation::MEAN_SUM:
- first_kernel_op = ReductionOperation::SUM;
- last_kernel_op = op;
+ first_kernel_op = ReductionOperation::SUM;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = op;
+ pixelValue = PixelValue(0);
break;
case ReductionOperation::SUM_SQUARE:
- first_kernel_op = ReductionOperation::SUM_SQUARE;
- last_kernel_op = ReductionOperation::SUM;
+ first_kernel_op = ReductionOperation::SUM_SQUARE;
+ intermediate_kernel_op = ReductionOperation::SUM;
+ last_kernel_op = ReductionOperation::SUM;
+ pixelValue = PixelValue(0);
+ break;
+ case ReductionOperation::PROD:
+ first_kernel_op = ReductionOperation::PROD;
+ intermediate_kernel_op = ReductionOperation::PROD;
+ last_kernel_op = ReductionOperation::PROD;
+ pixelValue = PixelValue(1, input->info()->data_type());
break;
default:
ARM_COMPUTE_ERROR("Not supported");
}
- _reduction_kernels_vector[0].configure(input, _sums_vector.get(), axis, first_kernel_op);
- _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, PixelValue(0));
+ _reduction_kernels_vector[0].configure(input, _results_vector.get(), axis, first_kernel_op);
+ _border_handlers_vector[0].configure(input, _reduction_kernels_vector[0].border_size(), BorderMode::CONSTANT, pixelValue);
// Apply ReductionOperation on intermediate stages
for(unsigned int i = 1; i < _num_of_stages - 1; ++i)
{
- _memory_group.manage(_sums_vector.get() + i);
- _reduction_kernels_vector[i].configure(_sums_vector.get() + i - 1, _sums_vector.get() + i, axis, ReductionOperation::SUM);
- _border_handlers_vector[i].configure(_sums_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, PixelValue(0));
- _sums_vector[i - 1].allocator()->allocate();
+ _memory_group.manage(_results_vector.get() + i);
+ _reduction_kernels_vector[i].configure(_results_vector.get() + i - 1, _results_vector.get() + i, axis, intermediate_kernel_op);
+ _border_handlers_vector[i].configure(_results_vector.get() + i - 1, _reduction_kernels_vector[i].border_size(), BorderMode::CONSTANT, pixelValue);
+ _results_vector[i - 1].allocator()->allocate();
}
// Apply ReductionOperation on the last stage
const unsigned int last_stage = _num_of_stages - 1;
const unsigned int input_width = input->info()->dimension(0);
- _reduction_kernels_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
- _border_handlers_vector[last_stage].configure(_sums_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, PixelValue(0));
- _sums_vector[last_stage - 1].allocator()->allocate();
- }
- else
- {
- _reduction_kernels_vector[0].configure(input, output, axis, op, 0);
+ _reduction_kernels_vector[last_stage].configure(_results_vector.get() + last_stage - 1, output, axis, last_kernel_op, input_width);
+ _border_handlers_vector[last_stage].configure(_results_vector.get() + last_stage - 1, _reduction_kernels_vector[last_stage].border_size(), BorderMode::CONSTANT, pixelValue);
+ _results_vector[last_stage - 1].allocator()->allocate();
}
}
@@ -188,7 +208,11 @@ void CLReductionOperation::run()
{
_memory_group.acquire();
- if(_reduction_axis == 0 && !_is_quantized)
+ if(_is_serial)
+ {
+ CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
+ }
+ else
{
for(unsigned int i = 0; i < _num_of_stages; ++i)
{
@@ -196,10 +220,6 @@ void CLReductionOperation::run()
CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
}
}
- else
- {
- CLScheduler::get().enqueue(_reduction_kernels_vector[0], false);
- }
_memory_group.release();
}
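For intuition, a sketch of how the parallel path shrinks the X dimension stage by stage, assuming only the 128-elements-per-stage factor visible in shape.set(0, ceil(shape.x() / 128.f)) above (illustration only, not part of the patch):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        unsigned int width = 2000; // hypothetical input width
        unsigned int stage = 0;
        while(width > 1)
        {
            width = static_cast<unsigned int>(std::ceil(width / 128.f));
            std::printf("after stage %u: width = %u\n", stage++, width);
        }
        // 2000 -> 16 -> 1: two shrinking stages, then the last kernel applies
        // the final op (e.g. MEAN_SUM scaling, or PROD) on what remains.
    }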
diff --git a/src/runtime/NEON/functions/NEIntegralImage.cpp b/src/runtime/NEON/functions/NEIntegralImage.cpp
index fa8aaeb5dd..43308fa169 100644
--- a/src/runtime/NEON/functions/NEIntegralImage.cpp
+++ b/src/runtime/NEON/functions/NEIntegralImage.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,5 +36,5 @@ void NEIntegralImage::configure(const ITensor *input, ITensor *output)
auto k = arm_compute::support::cpp14::make_unique<NEIntegralImageKernel>();
k->configure(input, output);
_kernel = std::move(k);
- _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, static_cast<float>(0.f));
+ _border_handler.configure(output, _kernel->border_size(), BorderMode::CONSTANT, PixelValue(0));
}
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index a9c85bd726..169b9bbf6a 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -167,7 +167,7 @@ void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy pol
ARM_COMPUTE_ERROR("Unsupported interpolation mode");
}
- _border_handler.configure(input, _scale_kernel.border_size(), border_mode, PixelValue(constant_border_value));
+ _border_handler.configure(input, _scale_kernel.border_size(), border_mode, constant_border_value);
}
Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
diff --git a/tests/datasets/ReductionOperationDataset.h b/tests/datasets/ReductionOperationDataset.h
deleted file mode 100644
index dadc4e9249..0000000000
--- a/tests/datasets/ReductionOperationDataset.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __ARM_COMPUTE_TEST_REDUCTION_OPERATION_DATASET_H__
-#define __ARM_COMPUTE_TEST_REDUCTION_OPERATION_DATASET_H__
-
-#include "arm_compute/core/Types.h"
-#include "tests/framework/datasets/ContainerDataset.h"
-#include "utils/TypePrinter.h"
-
-#include <vector>
-
-namespace arm_compute
-{
-namespace test
-{
-namespace datasets
-{
-class ReductionOperations final : public framework::dataset::ContainerDataset<std::vector<ReductionOperation>>
-{
-public:
- ReductionOperations()
- : ContainerDataset("ReductionOperation",
- {
- ReductionOperation::SUM
- })
- {
- }
-};
-} // namespace datasets
-} // namespace test
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_TEST_REDUCTION_OPERATION_DATASET_H__ */
diff --git a/tests/validation/CL/ReductionOperation.cpp b/tests/validation/CL/ReductionOperation.cpp
index 2adb4e90d6..c8474e97e6 100644
--- a/tests/validation/CL/ReductionOperation.cpp
+++ b/tests/validation/CL/ReductionOperation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,6 @@
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
#include "tests/CL/CLAccessor.h"
#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ReductionOperationDataset.h"
#include "tests/datasets/ShapeDatasets.h"
#include "tests/framework/Asserts.h"
#include "tests/framework/Macros.h"
@@ -44,10 +43,17 @@ namespace validation
namespace
{
/** Tolerance for float operations */
-AbsoluteTolerance<float> tolerance_f32(0.01f);
+AbsoluteTolerance<float> tolerance_f32(0.001f);
RelativeTolerance<float> rel_tolerance_f32(0.00001f);
AbsoluteTolerance<float> tolerance_f16(0.5f);
RelativeTolerance<float> rel_tolerance_f16(0.2f);
+
+const auto ReductionOperations = framework::dataset::make("ReductionOperation",
+{
+ ReductionOperation::SUM,
+ ReductionOperation::PROD
+});
+
} // namespace
TEST_SUITE(CL)
@@ -89,13 +95,13 @@ using CLReductionOperationFixture = ReductionOperationFixture<CLTensor, CLAccess
TEST_SUITE(Float)
TEST_SUITE(FP16)
FIXTURE_DATA_TEST_CASE(RunSmall, CLReductionOperationFixture<half>, framework::DatasetMode::PRECOMMIT,
- combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), datasets::ReductionOperations()))
+ combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), ReductionOperations))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f16);
}
FIXTURE_DATA_TEST_CASE(RunLarge, CLReductionOperationFixture<half>, framework::DatasetMode::NIGHTLY,
- combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), datasets::ReductionOperations()))
+ combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), ReductionOperations))
{
// Validate output
validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0, tolerance_f16);
@@ -103,13 +109,13 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLReductionOperationFixture<half>, framework::D
TEST_SUITE_END() // F16
TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall, CLReductionOperationFixture<float>, framework::DatasetMode::PRECOMMIT,
- combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), datasets::ReductionOperations()))
+ combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), ReductionOperations))
{
// Validate output
validate(CLAccessor(_target), _reference, tolerance_f32);
}
FIXTURE_DATA_TEST_CASE(RunLarge, CLReductionOperationFixture<float>, framework::DatasetMode::NIGHTLY,
- combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), datasets::ReductionOperations()))
+ combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), ReductionOperations))
{
// Validate output
validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0, tolerance_f32);
diff --git a/tests/validation/NEON/ReductionOperation.cpp b/tests/validation/NEON/ReductionOperation.cpp
index d064940366..e322947993 100644
--- a/tests/validation/NEON/ReductionOperation.cpp
+++ b/tests/validation/NEON/ReductionOperation.cpp
@@ -27,7 +27,6 @@
#include "arm_compute/runtime/TensorAllocator.h"
#include "tests/NEON/Accessor.h"
#include "tests/PaddingCalculator.h"
-#include "tests/datasets/ReductionOperationDataset.h"
#include "tests/datasets/ShapeDatasets.h"
#include "tests/framework/Asserts.h"
#include "tests/framework/Macros.h"
@@ -48,6 +47,9 @@ AbsoluteTolerance<float> tolerance_f32(0.0001f);
RelativeTolerance<float> rel_tolerance_f32(0.00001f);
/** Tolerance for quantized operations */
RelativeTolerance<float> tolerance_qasymm8(1);
+
+const auto ReductionOperations = framework::dataset::make("ReductionOperation",
+{ ReductionOperation::SUM });
} // namespace
TEST_SUITE(NEON)
@@ -86,13 +88,13 @@ using NEReductionOperationFixture = ReductionOperationFixture<Tensor, Accessor,
TEST_SUITE(FP32)
FIXTURE_DATA_TEST_CASE(RunSmall, NEReductionOperationFixture<float>, framework::DatasetMode::PRECOMMIT,
- combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), datasets::ReductionOperations()))
+ combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), ReductionOperations))
{
// Validate output
validate(Accessor(_target), _reference, tolerance_f32);
}
FIXTURE_DATA_TEST_CASE(RunLarge, NEReductionOperationFixture<float>, framework::DatasetMode::NIGHTLY,
- combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), datasets::ReductionOperations()))
+ combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), ReductionOperations))
{
// Validate output
validate(Accessor(_target), _reference, rel_tolerance_f32, 0, tolerance_f32);
@@ -105,7 +107,7 @@ using NEReductionOperationQuantizedFixture = ReductionOperationQuantizedFixture<
TEST_SUITE(QASYMM8)
FIXTURE_DATA_TEST_CASE(RunSmall, NEReductionOperationQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
- datasets::ReductionOperations()),
+ ReductionOperations),
framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 0) })))
{
// Validate output
@@ -113,7 +115,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEReductionOperationQuantizedFixture<uint8_t>,
}
FIXTURE_DATA_TEST_CASE(RunLarge, NEReductionOperationQuantizedFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
combine(combine(combine(combine(datasets::Large4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
- datasets::ReductionOperations()),
+ ReductionOperations),
framework::dataset::make("QuantizationInfo", { QuantizationInfo(1.f / 255, 0) })))
{
// Validate output
diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp
index fc12e31d75..8e79c3bfb0 100644
--- a/tests/validation/reference/ReductionOperation.cpp
+++ b/tests/validation/reference/ReductionOperation.cpp
@@ -42,11 +42,11 @@ template <typename T, typename OT>
OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, int stride)
{
using type = typename std::remove_cv<OT>::type;
- auto res = type(0);
+ auto res = (op == ReductionOperation::PROD) ? type(1) : type(0);
if(std::is_integral<type>::value)
{
- uint32_t int_res = 0;
+ auto int_res = static_cast<uint32_t>(res);
for(int i = 0; i < reduce_elements; ++i)
{
auto elem = *(ptr + stride * i);
@@ -72,6 +72,9 @@ OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, in
case ReductionOperation::SUM:
int_res += elem;
break;
+ case ReductionOperation::PROD:
+ int_res *= elem;
+ break;
default:
ARM_COMPUTE_ERROR("Operation not supported");
}
@@ -108,6 +111,9 @@ OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, in
case ReductionOperation::SUM:
res += elem;
break;
+ case ReductionOperation::PROD:
+ res *= elem;
+ break;
default:
ARM_COMPUTE_ERROR("Operation not supported");
}
@@ -117,7 +123,6 @@ OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, in
res /= reduce_elements;
}
}
-
return res;
}
} // namespace
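The identity-element change above can be summarised with a minimal standalone sketch (illustration only; the real reference code also handles SUM_SQUARE, MEAN_SUM, the argmin/argmax paths and integral accumulation):

    #include <cassert>

    enum class ReductionOperation { SUM, PROD }; // trimmed-down stand-in

    float reduce(const float *ptr, int n, ReductionOperation op)
    {
        // Accumulator starts at the identity: 1 for PROD, 0 otherwise.
        float res = (op == ReductionOperation::PROD) ? 1.f : 0.f;
        for(int i = 0; i < n; ++i)
        {
            if(op == ReductionOperation::PROD)
            {
                res *= ptr[i];
            }
            else
            {
                res += ptr[i];
            }
        }
        return res;
    }

    int main()
    {
        const float v[] = { 2.f, 3.f, 4.f };
        assert(reduce(v, 3, ReductionOperation::SUM) == 9.f);
        assert(reduce(v, 3, ReductionOperation::PROD) == 24.f);
    }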
diff --git a/utils/TypePrinter.h b/utils/TypePrinter.h
index 496e49beb1..f2cf606a00 100644
--- a/utils/TypePrinter.h
+++ b/utils/TypePrinter.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -1397,6 +1397,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const ReductionOperation &
case ReductionOperation::ARG_IDX_MIN:
os << "ARG_IDX_MIN";
break;
+ case ReductionOperation::PROD:
+ os << "PROD";
+ break;
default:
ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
}