aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- LICENSE 2
-rw-r--r-- README.md 3
-rw-r--r-- SConscript 4
-rw-r--r-- arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h 3
-rw-r--r-- arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h 3
-rw-r--r-- arm_compute/core/CL/kernels/CLIm2ColKernel.h 1
-rw-r--r-- arm_compute/core/CL/kernels/CLPoolingLayerKernel.h 1
-rw-r--r-- arm_compute/core/CL/kernels/CLScaleKernel.h 1
-rw-r--r-- arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h 1
-rw-r--r-- arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h 1
-rw-r--r-- arm_compute/core/Helpers.h 14
-rw-r--r-- arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h 1
-rw-r--r-- arm_compute/core/NEON/kernels/NEIm2ColKernel.h 3
-rw-r--r-- arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h 1
-rw-r--r-- arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h 1
-rw-r--r-- arm_compute/core/Window.h 17
-rw-r--r-- arm_compute/core/Window.inl 18
-rw-r--r-- arm_compute/core/utils/misc/ShapeCalculator.h 36
-rw-r--r-- arm_compute/core/utils/misc/Utility.h 16
-rw-r--r-- arm_compute/runtime/CL/functions/CLReduceMean.h 2
-rw-r--r-- arm_compute/runtime/NEON/functions/NEReduceMean.h 2
-rw-r--r-- docs/00_introduction.dox 17
-rw-r--r-- docs/Doxyfile 2
m--------- release_repository 0
-rw-r--r-- src/core/CL/ICLKernel.cpp 2
-rw-r--r-- src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp 21
-rw-r--r-- src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp 33
-rw-r--r-- src/core/CL/kernels/CLIm2ColKernel.cpp 13
-rw-r--r-- src/core/CL/kernels/CLPoolingLayerKernel.cpp 28
-rw-r--r-- src/core/CL/kernels/CLScaleKernel.cpp 16
-rw-r--r-- src/core/CL/kernels/CLUpsampleLayerKernel.cpp 12
-rw-r--r-- src/core/CL/kernels/CLWinogradInputTransformKernel.cpp 32
-rw-r--r-- src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp 7
-rw-r--r-- src/core/NEON/kernels/NEIm2ColKernel.cpp 19
-rw-r--r-- src/core/NEON/kernels/NEPoolingLayerKernel.cpp 22
-rw-r--r-- src/core/NEON/kernels/NEReductionOperationKernel.cpp 38
-rw-r--r-- src/core/NEON/kernels/NEScaleKernel.cpp 4
-rw-r--r-- src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp 10
-rw-r--r-- src/runtime/CL/functions/CLReduceMean.cpp 120
-rw-r--r-- src/runtime/NEON/functions/NEReduceMean.cpp 97
-rw-r--r-- tests/datasets/ShapeDatasets.h 12
-rw-r--r-- tests/validation/CL/ReduceMean.cpp 22
-rw-r--r-- tests/validation/NEON/ReduceMean.cpp 22
43 files changed, 414 insertions, 266 deletions
diff --git a/LICENSE b/LICENSE
index 54292789e6..1b316d3292 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2017-2019 ARM Software
+Copyright (c) 2017-2020 ARM Software
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 75fa1b25ec..5f79467c91 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@ Blogs:
Documentation available here:
+- [v19.08.1](https://arm-software.github.io/ComputeLibrary/v19.08.1/)
- [v19.08](https://arm-software.github.io/ComputeLibrary/v19.08/)
- [v19.05](https://arm-software.github.io/ComputeLibrary/v19.05/)
- [v19.02](https://arm-software.github.io/ComputeLibrary/v19.02/)
@@ -50,6 +51,8 @@ Documentation available here:
Binaries available here:
+- [v19.08.1-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.08.1/arm_compute-v19.08.1-bin-linux.tar.gz)
+- [v19.08.1-android](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.08.1/arm_compute-v19.08.1-bin-android.tar.gz)
- [v19.08-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.08/arm_compute-v19.08-bin-linux.tar.gz)
- [v19.08-android](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.08/arm_compute-v19.08-bin-android.tar.gz)
- [v19.05-linux](https://github.com/ARM-software/ComputeLibrary/releases/download/v19.05/arm_compute-v19.05-bin-linux.tar.gz)
diff --git a/SConscript b/SConscript
index 6c9b0bb796..e06262ec72 100644
--- a/SConscript
+++ b/SConscript
@@ -24,8 +24,8 @@ import os.path
import re
import subprocess
-VERSION = "v19.08"
-SONAME_VERSION="16.0.0"
+VERSION = "v19.08.1"
+SONAME_VERSION="16.1.0"
Import('env')
Import('vars')
diff --git a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
index 21d026e0a1..2dd20e9588 100644
--- a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
+++ b/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,6 +72,7 @@ private:
const ICLTensor *_input;
ICLTensor *_output;
PadStrideInfo _info;
+ DataLayout _data_layout;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
index 081b01aad3..faf97e45dc 100644
--- a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -94,6 +94,7 @@ public:
const ICLTensor *_biases;
const ICLTensor *_weights;
ICLTensor *_output;
+ DataLayout _data_layout;
BorderSize _border_size;
int _conv_stride_x;
int _conv_stride_y;
diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
index 0647f5dcec..00cb416e90 100644
--- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h
+++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
@@ -103,6 +103,7 @@ public:
public:
const ICLTensor *_input;
ICLTensor *_output;
+ DataLayout _data_layout;
std::pair<unsigned int, unsigned int> _convolved_dims;
unsigned int _num_elems_processed_per_iteration;
Size2D _kernel_dims;
diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
index db1a756229..68a99039d8 100644
--- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
@@ -75,6 +75,7 @@ public:
const ICLTensor *_input;
ICLTensor *_output;
PoolingLayerInfo _pool_info;
+ DataLayout _data_layout;
BorderSize _border_size;
unsigned int _num_elems_processed_per_iteration;
};
diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/arm_compute/core/CL/kernels/CLScaleKernel.h
index ff72af29fc..1ada3cde85 100644
--- a/arm_compute/core/CL/kernels/CLScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLScaleKernel.h
@@ -75,6 +75,7 @@ public:
public:
InterpolationPolicy _interpolationPolicy = InterpolationPolicy::BILINEAR;
+ DataLayout _data_layout = DataLayout::UNKNOWN;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLSCALEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h b/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h
index dcd4f1bdb4..c8c69002c4 100644
--- a/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h
@@ -73,6 +73,7 @@ private:
const ICLTensor *_input;
ICLTensor *_output;
Size2D _info;
+ DataLayout _data_layout;
unsigned int _num_elems_processed_per_iteration_input_x;
};
} // namespace arm_compute
diff --git a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
index bc05a0ebf1..30bd3abb43 100644
--- a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
+++ b/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
@@ -93,6 +93,7 @@ private:
BorderSize _border_size;
const ICLTensor *_input;
ICLTensor *_output;
+ DataLayout _data_layout;
int _num_tiles_x;
int _num_tiles_y;
unsigned int _step_z;
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index 87b1fdf64c..8d526e96c0 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -766,6 +766,20 @@ inline T wrap_around(T x, T m)
return x >= 0 ? x % m : (x % m + m) % m;
}
+/** Wrap each coordinate into the range [0, max_value), so that negative axes are mapped to their positive equivalents
+ *
+ * @param[in,out] coords Array of coordinates to be converted in place.
+ * @param[in] max_value Exclusive upper bound used when wrapping the values in coords (e.g. the number of input dimensions)
+ */
+inline Coordinates &convert_negative_axis(Coordinates &coords, int max_value)
+{
+ for(unsigned int i = 0; i < coords.num_dimensions(); ++i)
+ {
+ coords[i] = wrap_around(coords[i], max_value);
+ }
+ return coords;
+}
+
/** Given an integer value, this function returns the next power of two
*
* @param[in] x Input value
diff --git a/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
index a62da049a5..b34f6d3ebf 100644
--- a/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
@@ -74,6 +74,7 @@ private:
const ITensor *_input; /**< Source tensor */
ITensor *_output; /**< Destination tensor */
int32_t _block_shape; /**< Block shape */
+ DataLayout _data_layout; /**< Data layout of the operation */
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
index f76521f770..689da857a7 100644
--- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
+++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -131,6 +131,7 @@ private:
unsigned int _kernel_height;
bool _has_bias;
Size2D _dilation;
+ DataLayout _data_layout;
};
} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEIM2COLKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index 5f45a90cef..5b143250e9 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -194,6 +194,7 @@ private:
const ITensor *_input;
ITensor *_output;
PoolingLayerInfo _pool_info;
+ DataLayout _data_layout;
unsigned int _num_elems_processed_per_iteration;
BorderSize _border_size;
bool _is_square;
diff --git a/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h
index c9ecdd26f8..68bc1737c8 100644
--- a/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h
@@ -75,6 +75,7 @@ private:
const ITensor *_input; /**< Source tensor */
ITensor *_output; /**< Destination tensor */
int32_t _block_shape; /**< Block shape */
+ DataLayout _data_layout; /**< Data layout of the operation */
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H__ */
diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h
index a56227996b..be42fe9a87 100644
--- a/arm_compute/core/Window.h
+++ b/arm_compute/core/Window.h
@@ -48,7 +48,7 @@ public:
/** Default constructor: create a window containing a single element. */
constexpr Window()
- : _dims()
+ : _dims(), _is_broadcasted(utility::generate_array<bool, Coordinates::num_max_dimensions, false>::value)
{
}
/** Copy constructor
@@ -170,6 +170,20 @@ public:
*/
void set(size_t dimension, const Dimension &dim);
+ /** Mark the dimension as broadcasted and reset it to Dimension(0, 0, 0)
+ *
+ * @param[in] dimension The dimension to set
+ */
+ void set_broadcasted(size_t dimension);
+
+ /** Return whether a dimension has been broadcasted
+ *
+ * @param[in] dimension The requested dimension
+ *
+ * @return true if the dimension has been broadcasted
+ */
+ bool is_broadcasted(size_t dimension) const;
+
/** Use the tensor's dimensions to fill the window dimensions.
*
* @param[in] shape @ref TensorShape to copy the dimensions from.
@@ -419,6 +433,7 @@ private:
private:
std::array<Dimension, Coordinates::num_max_dimensions> _dims;
+ std::array<bool, Coordinates::num_max_dimensions> _is_broadcasted;
};
} // namespace arm_compute
#include "Window.inl"
diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl
index eeef3df7b0..589d6bfafc 100644
--- a/arm_compute/core/Window.inl
+++ b/arm_compute/core/Window.inl
@@ -24,11 +24,12 @@
namespace arm_compute
{
inline Window::Window(const Window &src)
- : _dims()
+ : _dims(), _is_broadcasted(utility::generate_array<bool, Coordinates::num_max_dimensions, false>::value)
{
for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
{
set(i, src[i]);
+ _is_broadcasted[i] = src.is_broadcasted(i);
}
}
@@ -51,6 +52,19 @@ inline void Window::set(size_t dimension, const Window::Dimension &dim)
_dims[dimension] = dim;
}
+inline void Window::set_broadcasted(size_t dimension)
+{
+ ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+ set(dimension, Dimension(0, 0, 0));
+ _is_broadcasted[dimension] = true;
+}
+
+inline bool Window::is_broadcasted(size_t dimension) const
+{
+ ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
+ return _is_broadcasted[dimension];
+}
+
inline Window Window::collapse_if_possible(const Window &full_window, const size_t first,
const size_t last, bool *has_collapsed) const
{
@@ -110,7 +124,7 @@ inline Window Window::broadcast_if_dimension_le_one(const TensorShape &shape) co
{
if(shape[d] <= 1)
{
- broadcastWin.set(d, Dimension(0, 0, 0));
+ broadcastWin.set_broadcasted(d);
}
}
return broadcastWin;
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 65a2a1edf4..698a2b7a45 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -39,6 +39,42 @@ namespace misc
{
namespace shape_calculator
{
+/** Calculate the output tensor shape for the reduce mean operation
+ *
+ * @param[in] input Input tensor
+ * @param[in] reduction_axis Reduction axis
+ * @param[in] keep_dims Flag to indicate if dimensions are kept
+ *
+ * @return the calculated shape
+ */
+inline TensorShape calculate_reduce_mean_shape(ITensor *input, const Coordinates &reduction_axis, bool keep_dims)
+{
+ const int reduction_ops = reduction_axis.num_dimensions();
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
+ convert_negative_axis(axis_local, input_dims);
+ TensorShape out_shape = input->info()->tensor_shape();
+ // Remove the reduced dimensions from the output shape if we want to drop them
+ if(!keep_dims)
+ {
+ // We have to sort the reduction axis vectors in order for remove_dimension
+ // to work properly
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for(int i = 0; i < reduction_ops; ++i)
+ {
+ out_shape.remove_dimension(axis_local[i] - i);
+ }
+ return out_shape;
+ }
+ else
+ {
+ for(int i = 0; i < reduction_ops; ++i)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ return out_shape;
+ }
+}
/** Calculate the output tensor shape of a vector input given the convolution dimensions
*
* @param[in] input Input tensor shape
diff --git a/arm_compute/core/utils/misc/Utility.h b/arm_compute/core/utils/misc/Utility.h
index 8dd9afd5cd..2325644e72 100644
--- a/arm_compute/core/utils/misc/Utility.h
+++ b/arm_compute/core/utils/misc/Utility.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,6 +53,20 @@ struct index_sequence_generator<0u, S...> : index_sequence<S...>
template <std::size_t N>
using index_sequence_t = typename index_sequence_generator<N>::type;
+
+template <typename T, std::size_t N, T val, T... vals>
+struct generate_array : generate_array < T, N - 1, val, val, vals... >
+{
+};
+
+template <typename T, T val, T... vals>
+struct generate_array<T, 0, val, vals...>
+{
+ static constexpr std::array<T, sizeof...(vals)> value{ vals... };
+};
+
+template <typename T, T val, T... vals>
+constexpr std::array<T, sizeof...(vals)> generate_array<T, 0, val, vals...>::value;
/** @endcond */
namespace detail
diff --git a/arm_compute/runtime/CL/functions/CLReduceMean.h b/arm_compute/runtime/CL/functions/CLReduceMean.h
index 9c087eadf1..6836ba3f58 100644
--- a/arm_compute/runtime/CL/functions/CLReduceMean.h
+++ b/arm_compute/runtime/CL/functions/CLReduceMean.h
@@ -71,7 +71,7 @@ private:
std::vector<CLReductionOperation> _reduction_kernels;
std::vector<CLTensor> _reduced_outs;
CLReshapeLayer _reshape;
- unsigned int _reduction_ops;
+ int _reduction_ops;
bool _keep_dims;
};
} // namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h
index fdd8edfe87..245f7577ce 100644
--- a/arm_compute/runtime/NEON/functions/NEReduceMean.h
+++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h
@@ -72,7 +72,7 @@ private:
std::vector<NEReductionOperation> _reduction_kernels;
std::vector<Tensor> _reduced_outs;
NEReshapeLayer _reshape;
- unsigned int _reduction_ops;
+ int _reduction_ops;
bool _keep_dims;
};
} // namespace arm_compute
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 9c8eaf2733..bcbc818e59 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -236,6 +236,9 @@ If there is more than one release in a month then an extra sequential number is
@subsection S2_2_changelog Changelog
+v19.08.1 Public maintenance release
+ - Various bug fixes.
+
v19.08 Public major release
- Various bug fixes.
- Various optimisations.
@@ -248,9 +251,14 @@ v19.08 Public major release
- CLGEMMTranspose1xWKernel / CLGEMMTranspose1xW
- CLWidthConcatenateLayer
- New NEON kernels / functions:
+ - @ref NEAbsLayer
- @ref NECast
+ - @ref NEElementwisePower
+ - @ref NELogLayer
- @ref NELSTMLayerQuantized
+ - @ref NENegLayer
- @ref NEPReluLayer
+ - @ref NESinLayer
- @ref NEBatchConcatenateLayerKernel
- @ref NEDepthToSpaceLayerKernel / @ref NEDepthToSpaceLayer
- @ref NEDepthwiseConvolutionLayerNativeKernel
@@ -258,8 +266,13 @@ v19.08 Public major release
- @ref NEMeanStdDevNormalizationKernel / @ref NEMeanStdDevNormalizationLayer
- @ref NESpaceToDepthLayerKernel / @ref NESpaceToDepthLayer
- New OpenCL kernels / functions:
+ - @ref CLAbsLayer
+ - @ref CLElementwisePower
+ - @ref CLLogLayer
- @ref CLLSTMLayerQuantized
+ - @ref CLNegLayer
- @ref CLPReluLayer
+ - @ref CLSinLayer
- @ref CLBatchConcatenateLayerKernel
- @ref CLDepthToSpaceLayerKernel / @ref CLDepthToSpaceLayer
- @ref CLGEMMLowpMatrixMultiplyNativeKernel
@@ -271,6 +284,9 @@ v19.08 Public major release
- neon_opticalflow
- cl_cache
- neon_permute
+ - Added support for FP16 in @ref NEDeconvolutionLayer
+ - Added support for FP16 in @ref CLDeconvolutionLayer
+ - Added support for REDUCE_MIN and REDUCE_MAX in @ref ReductionOperation
- Enable the fusion of batch normalization with convolution and depthwise convolution layer for FP32 in the graph API (OpenCL only)
- Added support for fusing activation function and broadcast addition with the matrix multiplication for FP32 (OpenCL only)
- Re-factored the depthwise convolution layer kernel on NEON for generic cases
@@ -280,6 +296,7 @@ v19.08 Public major release
- The @ref NEDepthwiseConvolutionLayer3x3 will be replaced by @ref NEDepthwiseConvolutionLayerOptimized to accommodate for future optimizations.
- Removed inner_border_right and inner_border_top parameters from @ref CLDeconvolutionLayer interface
- Removed inner_border_right and inner_border_top parameters from @ref NEDeconvolutionLayer interface
+ - Optimized the NEON assembly kernel for GEMMLowp. The new implementation fuses the output stage and quantization with the matrix multiplication kernel
v19.05 Public major release
- Various bug fixes.
diff --git a/docs/Doxyfile b/docs/Doxyfile
index e9027c85f1..5f3091c492 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = "Compute Library"
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = 19.08
+PROJECT_NUMBER = 19.08.1
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
diff --git a/release_repository b/release_repository
-Subproject 4ba87dbdc3b22220eba4a792c1f5c87e7a88c7a
+Subproject 975dfe175e3d7c62c27598b1c0e8e77ed90df46
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 2d28a496c9..d81ad46b29 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -98,7 +98,7 @@ void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, cons
for(unsigned int n = 0; n < info->num_dimensions(); ++n)
{
- offset_first_element += window[n].start() * strides[n];
+ offset_first_element += (window.is_broadcasted(n) ? 0 : window[n].start()) * strides[n];
}
unsigned int idx_start = idx;
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index 295fb5c997..177f05f3ca 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,7 @@
using namespace arm_compute;
CLDeconvolutionLayerUpsampleKernel::CLDeconvolutionLayerUpsampleKernel()
- : _input(nullptr), _output(nullptr), _info()
+ : _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN)
{
}
@@ -72,13 +72,14 @@ void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTe
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- _input = input;
- _output = output;
- _info = info;
-
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info));
+ _input = input;
+ _output = output;
+ _info = info;
+ _data_layout = input->info()->data_layout();
+
// Create kernel
CLBuildOptions build_opts;
build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
@@ -99,10 +100,8 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- const DataLayout data_layout = _input->info()->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
const int out_start_x = _info.pad().first;
const int out_end_x = _output->info()->dimension(idx_w) - _info.pad().first + _info.stride().first - 1;
@@ -112,7 +111,7 @@ void CLDeconvolutionLayerUpsampleKernel::run(const Window &window, cl::CommandQu
const int out_end_y = _output->info()->dimension(idx_h) - _info.pad().second + _info.stride().second - 1;
const int out_step_y = _info.stride().second;
- switch(data_layout)
+ switch(_data_layout)
{
case DataLayout::NCHW:
{
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index dc4c431c5d..21685dcf0e 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -377,7 +377,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} // namespace
CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
- : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0)
+ : _input(nullptr), _biases(nullptr), _weights(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _border_size(0), _conv_stride_x(0), _conv_stride_y(0)
{
}
@@ -390,10 +390,10 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- const DataLayout data_layout = input->info()->data_layout();
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ _data_layout = input->info()->data_layout();
+ const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
const unsigned int kernel_size = weights->info()->dimension(width_idx);
const DataType data_type = input->info()->data_type();
@@ -419,11 +419,11 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
_conv_stride_x = std::get<0>(conv_info.stride());
_conv_stride_y = std::get<1>(conv_info.stride());
- if(data_layout == DataLayout::NHWC)
+ if(_data_layout == DataLayout::NHWC)
{
_border_size = BorderSize(conv_info.pad_left(), 0, conv_info.pad_right(), 0);
}
- else if(data_layout == DataLayout::NCHW)
+ else if(_data_layout == DataLayout::NCHW)
{
_border_size = BorderSize(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
}
@@ -441,15 +441,15 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
std::stringstream kernel_name;
kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;
- if(data_layout == DataLayout::NHWC)
+ if(_data_layout == DataLayout::NHWC)
{
- kernel_name << "_" << lower_string(string_from_data_layout(data_layout));
+ kernel_name << "_" << lower_string(string_from_data_layout(_data_layout));
}
CLBuildOptions build_options;
build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));
- const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, data_layout);
+ const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
if(run_optimized_for_bifrost)
{
@@ -466,9 +466,9 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
- if(data_layout == DataLayout::NHWC)
+ if(_data_layout == DataLayout::NHWC)
{
- const bool run_optimized_for_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, data_layout);
+ const bool run_optimized_for_bifrost_nhwc = can_run_optimized_kernel_for_bifrost_nhwc(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);
build_options.add_option(std::string("-DDATA_LAYOUT_NHWC=1"));
build_options.add_option(std::string("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(height_idx))));
build_options.add_option(std::string("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(width_idx))));
@@ -538,7 +538,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(height_idx));
_config_id += "_";
- _config_id += lower_string(string_from_data_layout(data_layout));
+ _config_id += lower_string(string_from_data_layout(_data_layout));
}
Status CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -562,9 +562,8 @@ void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue
win_in.adjust(Window::DimX, -_border_size.left, true);
win_in.adjust(Window::DimY, -_border_size.top, true);
- const DataLayout data_layout = _input->info()->data_layout();
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
win_in.set_dimension_step(width_idx, window[width_idx].step() * _conv_stride_x);
win_in.set_dimension_step(height_idx, window[height_idx].step() * _conv_stride_y);
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 10d6e68cd9..24f22c31a5 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -287,7 +287,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *input, const Size
} // namespace
CLIm2ColKernel::CLIm2ColKernel()
- : _input(nullptr), _output(nullptr), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
+ : _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
{
}
@@ -297,9 +297,10 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
- const DataLayout data_layout = input->info()->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ _data_layout = input->info()->data_layout();
+
+ const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
const unsigned int input_width = input->info()->dimension(width_idx);
const unsigned int input_height = input->info()->dimension(height_idx);
@@ -336,7 +337,7 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(1));
_config_id += "_";
- _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
+ _config_id += lower_string(string_from_data_layout(_data_layout));
}
Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
@@ -369,7 +370,7 @@ void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
Window slice_in = first_slice_3d;
Window slice_out = window_output.first_slice_window_2D();
- if(_input->info()->data_layout() == DataLayout::NHWC)
+ if(_data_layout == DataLayout::NHWC)
{
const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3);
const int num_batches = tmp_win[3].end();
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 8eaf5bf76f..032d451aad 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -172,7 +172,7 @@ std::tuple<Status, Window, CLPoolingConfig> validate_and_configure_window(ITenso
} // namespace
CLPoolingLayerKernel::CLPoolingLayerKernel()
- : _input(nullptr), _output(nullptr), _pool_info(), _border_size(0), _num_elems_processed_per_iteration(1)
+ : _input(nullptr), _output(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _border_size(0), _num_elems_processed_per_iteration(1)
{
}
@@ -185,13 +185,18 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ // Set instance variables
+ _input = input;
+ _output = output;
+ _pool_info = pool_info;
+ _data_layout = input->info()->data_layout();
+
int pool_stride_x = 0;
int pool_stride_y = 0;
const PoolingType pool_type = pool_info.pool_type();
- DataLayout data_layout = input->info()->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
const int pool_size_x = pool_info.is_global_pooling() ? input->info()->dimension(idx_width) : pool_info.pool_size().width;
const int pool_size_y = pool_info.is_global_pooling() ? input->info()->dimension(idx_height) : pool_info.pool_size().height;
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info();
@@ -218,11 +223,6 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
auto_init(input->info(), output->info(), pool_info);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info));
- // Set instance variables
- _input = input;
- _output = output;
- _pool_info = pool_info;
-
const DataType data_type = input->info()->data_type();
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -237,7 +237,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
// Create kernel
- switch(data_layout)
+ switch(_data_layout)
{
case DataLayout::NCHW:
{
@@ -286,7 +286,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
ICLKernel::configure_internal(std::get<1>(win_config));
- if(data_layout == DataLayout::NCHW)
+ if(_data_layout == DataLayout::NCHW)
{
CLPoolingConfig pooling_config = std::get<2>(win_config);
_num_elems_processed_per_iteration = pooling_config.first;
@@ -302,7 +302,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
_config_id = "pooling_layer_";
_config_id += lower_string(string_from_data_type(data_type));
_config_id += "_";
- _config_id += lower_string(string_from_data_layout(data_layout));
+ _config_id += lower_string(string_from_data_layout(_data_layout));
_config_id += "_";
_config_id += support::cpp11::to_string(output->info()->dimension(idx_width));
_config_id += "_";
@@ -333,7 +333,7 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
// Collapse window
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- switch(_input->info()->data_layout())
+ switch(_data_layout)
{
case DataLayout::NCHW:
{
diff --git a/src/core/CL/kernels/CLScaleKernel.cpp b/src/core/CL/kernels/CLScaleKernel.cpp
index 488313fd12..82c5c8a446 100644
--- a/src/core/CL/kernels/CLScaleKernel.cpp
+++ b/src/core/CL/kernels/CLScaleKernel.cpp
@@ -160,11 +160,12 @@ const ICLTensor *CLScaleKernel::output() const
void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy)
{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy));
+
_input = input;
_output = output;
_interpolationPolicy = policy;
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy));
+ _data_layout = input->info()->data_layout();
float wr = 0.f;
float hr = 0.f;
@@ -172,10 +173,9 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, Interpo
const bool call_quantized_kernel = is_data_type_quantized_asymmetric(input->info()->data_type()) && policy == InterpolationPolicy::BILINEAR;
- DataLayout data_layout = input->info()->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const bool is_nhwc = data_layout == DataLayout::NHWC;
+ const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const bool is_nhwc = _data_layout == DataLayout::NHWC;
// Compute the ratio between source width/height and destination width/height
const unsigned int input_width = input->info()->dimension(idx_width);
@@ -215,7 +215,7 @@ void CLScaleKernel::configure(const ICLTensor *input, ICLTensor *output, Interpo
std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
std::string kernel_name = "scale_" + interpolation_name;
kernel_name += call_quantized_kernel ? "_quantized_" : "_";
- kernel_name += lower_string(string_from_data_layout(data_layout));
+ kernel_name += lower_string(string_from_data_layout(_data_layout));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
unsigned int idx = is_nhwc ? 2 * num_arguments_per_4D_tensor() : 2 * num_arguments_per_2D_tensor(); //Skip the input and output parameters
@@ -249,7 +249,7 @@ void CLScaleKernel::run(const Window &window, cl::CommandQueue &queue)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
- switch(_input->info()->data_layout())
+ switch(_data_layout)
{
case DataLayout::NCHW:
{
diff --git a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
index ce5ed86332..2ccd540788 100644
--- a/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLUpsampleLayerKernel.cpp
@@ -37,7 +37,7 @@
namespace arm_compute
{
CLUpsampleLayerKernel::CLUpsampleLayerKernel()
- : _input(nullptr), _output(nullptr), _info(), _num_elems_processed_per_iteration_input_x()
+ : _input(nullptr), _output(nullptr), _info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration_input_x()
{
}
@@ -71,13 +71,12 @@ void CLUpsampleLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
_input = input;
_output = output;
_info = info;
+ _data_layout = input->info()->data_layout();
_num_elems_processed_per_iteration_input_x = 1;
- const DataLayout data_layout = input->info()->data_layout();
-
TensorShape output_shape = misc::shape_calculator::compute_upsample_shape(*input->info(), info);
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
- output->info()->set_data_layout(data_layout);
+ output->info()->set_data_layout(_data_layout);
unsigned int num_elems_processed_per_iteration_x = 16;
const int output_width_x = output->info()->dimension(0);
@@ -88,7 +87,7 @@ void CLUpsampleLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
Window win{};
- switch(data_layout)
+ switch(_data_layout)
{
case DataLayout::NCHW:
{
@@ -140,8 +139,7 @@ void CLUpsampleLayerKernel::run(const Window &window, cl::CommandQueue &queue)
Window slice_out = collapsed_window.first_slice_window_3D();
Window slice_in = collapsed_window.first_slice_window_3D();
- DataLayout data_layout = _input->info()->data_layout();
- switch(data_layout)
+ switch(_data_layout)
{
case DataLayout::NCHW:
slice_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _num_elems_processed_per_iteration_input_x));
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index 1c31ceba99..6125790491 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -99,7 +99,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} // namespace
CLWinogradInputTransformKernel::CLWinogradInputTransformKernel()
- : _border_size(0), _input(nullptr), _output(nullptr), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
+ : _border_size(0), _input(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _num_tiles_x(0), _num_tiles_y(0), _step_z(1)
{
}
@@ -116,16 +116,17 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
const PadStrideInfo conv_info = winograd_info.convolution_info;
const Size2D output_tile_size = winograd_info.output_tile_size;
const Size2D kernel_size = winograd_info.kernel_size;
- const DataLayout data_layout = input->info()->data_layout();
- const size_t idx_w = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+ _data_layout = input->info()->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
// Compute number of elements to process in the X and Y direction
const int num_elements_x = input->info()->dimension(idx_w) - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right();
const int num_elements_y = input->info()->dimension(idx_h) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
- if(data_layout == DataLayout::NCHW)
+ if(_data_layout == DataLayout::NCHW)
{
// Check if we need to extend the right or bottom border
const unsigned int extra_border_right = ((num_elements_x % output_tile_size.width) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.width - 1);
@@ -166,7 +167,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
- if(data_layout == DataLayout::NHWC)
+ if(_data_layout == DataLayout::NHWC)
{
build_opts.add_option_if(total_batches > 1, "-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y));
build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
@@ -184,7 +185,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
// Check optimized kernel if output_dims == 2x2
- if((tile_max_dim == 2) && (data_layout == DataLayout::NCHW))
+ if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW))
{
_step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2;
}
@@ -192,7 +193,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
// Append stepz and data layout
kernel_name += "_stepz";
kernel_name += support::cpp11::to_string(_step_z);
- kernel_name += "_" + lower_string(string_from_data_layout(data_layout));
+ kernel_name += "_" + lower_string(string_from_data_layout(_data_layout));
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
@@ -212,7 +213,7 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor
_config_id += "_";
_config_id += support::cpp11::to_string(conv_info.pad_top());
_config_id += "_";
- _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
+ _config_id += lower_string(string_from_data_layout(_data_layout));
}
Status CLWinogradInputTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
@@ -229,11 +230,10 @@ void CLWinogradInputTransformKernel::run(const Window &window, cl::CommandQueue
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- const DataLayout data_layout = _input->info()->data_layout();
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const size_t total_batches = window.shape().total_size_upper(3);
+ const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+ const size_t total_batches = window.shape().total_size_upper(3);
// Collapse window
Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
@@ -241,7 +241,7 @@ void CLWinogradInputTransformKernel::run(const Window &window, cl::CommandQueue
Window slice = window_collapsed.first_slice_window_3D();
slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1));
slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1));
- if(data_layout == DataLayout::NHWC)
+ if(_data_layout == DataLayout::NHWC)
{
slice.set(idx_h, Window::Dimension(0, _num_tiles_y * total_batches, 1));
}
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index df631c3c03..98b0c106db 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -63,7 +63,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
} // namespace
NEDepthToSpaceLayerKernel::NEDepthToSpaceLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape()
+ : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN)
{
}
@@ -80,6 +80,7 @@ void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output,
_input = input;
_output = output;
_block_shape = block_shape;
+ _data_layout = input->info()->data_layout();
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
@@ -99,7 +100,7 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- const int idx_channel = get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
const int depth_size = _input->info()->dimension(idx_channel);
const int r = (depth_size / (_block_shape * _block_shape));
const int element_size = _input->info()->element_size();
@@ -112,7 +113,7 @@ void NEDepthToSpaceLayerKernel::run(const Window &window, const ThreadInfo &info
slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
// Main loop for NCHW and NHWC
- if(_input->info()->data_layout() == DataLayout::NCHW)
+ if(_data_layout == DataLayout::NCHW)
{
Window slice_in = window.first_slice_window_2D();
do
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 0641d6cfa3..27c3d66b4f 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -265,10 +265,9 @@ void NEIm2ColKernel::run_im2col(const Window &window)
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const DataLayout data_layout = _input->info()->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
const int input_w = _input->info()->dimension(width_idx);
const int input_h = _input->info()->dimension(height_idx);
@@ -344,7 +343,7 @@ void NEIm2ColKernel::run_im2col(const Window &window)
}
NEIm2ColKernel::NEIm2ColKernel()
- : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U)
+ : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U), _data_layout(DataLayout::UNKNOWN)
{
}
@@ -355,9 +354,9 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
ARM_COMPUTE_UNUSED(num_groups);
- const DataLayout data_layout = input->info()->data_layout();
- const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ _data_layout = input->info()->data_layout();
+ const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+ const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
_input = input;
_output = output;
@@ -370,7 +369,7 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
_conv_info, _dilation);
_has_bias = has_bias;
- if(data_layout == DataLayout::NCHW)
+ if(_data_layout == DataLayout::NCHW)
{
switch(_input->info()->data_type())
{
diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
index 62c9ca0d5e..14de4a19d8 100644
--- a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp
@@ -321,7 +321,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
} // namespace
NEPoolingLayerKernel::NEPoolingLayerKernel()
- : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
+ : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _data_layout(DataLayout::UNKNOWN), _num_elems_processed_per_iteration(0), _border_size(0), _is_square(false)
{
}
@@ -364,14 +364,15 @@ void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, cons
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, pooled_w, pooled_h));
// Set instance variables
- _input = input;
- _output = output;
- _pool_info = pool_info;
- _is_square = (pool_size.x() == pool_size.y());
+ _input = input;
+ _output = output;
+ _pool_info = pool_info;
+ _data_layout = input->info()->data_layout();
+ _is_square = (pool_size.x() == pool_size.y());
// Get data type
const DataType data_type = input->info()->data_type();
- const bool is_nchw = data_layout == DataLayout::NCHW;
+ const bool is_nchw = _data_layout == DataLayout::NCHW;
if(data_type == DataType::QASYMM8)
{
@@ -1574,7 +1575,12 @@ void NEPoolingLayerKernel::poolingMxN_f32_nhwc(const Window &window_input, const
// Calculate square-root in case of l2 pooling
if(pooling_type == PoolingType::L2)
{
- vres = vmulq_f32(vres, vinvsqrtq_f32(vres));
+ float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
+ static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
+ };
+ vres = l2_res;
}
// Store result
@@ -1835,7 +1841,7 @@ void NEPoolingLayerKernel::run(const Window &window, const ThreadInfo &info)
const bool exclude_padding = _pool_info.exclude_padding();
Window window_input(window);
- if(_input->info()->data_layout() == DataLayout::NCHW)
+ if(_data_layout == DataLayout::NCHW)
{
// Set step for input in x and y direction for the input
unsigned int window_x_inc = 0;
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index ffa4fa3565..16cd6f77b4 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -742,23 +742,8 @@ struct RedOpYZW
for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
{
- T *in_ptr;
- switch(axis)
- {
- case 1:
- in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, dim)));
- break;
- case 2:
- in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, dim)));
- break;
- case 3:
- in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, dim)));
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
+ const T *in_ptr = reinterpret_cast<T *>(input.ptr() + in_info.strides_in_bytes()[axis] * dim);
const auto vec_elements = wrapper::vloadq(in_ptr);
-
switch(op)
{
case ReductionOperation::SUM:
@@ -907,23 +892,8 @@ struct RedOpYZW_qasymm8
for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
{
- uint8_t *in_ptr;
- switch(axis)
- {
- case 1:
- in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, index_dim));
- break;
- case 2:
- in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, index_dim));
- break;
- case 3:
- in_ptr = input.ptr() + in_info.offset_element_in_bytes(Coordinates(0, 0, 0, index_dim));
- break;
- default:
- ARM_COMPUTE_ERROR("Not supported");
- }
- const auto vec_elements = wrapper::vloadq(in_ptr);
-
+ const uint8_t *in_ptr = input.ptr() + in_info.strides_in_bytes()[axis] * index_dim;
+ const auto vec_elements = wrapper::vloadq(in_ptr);
switch(op)
{
case ReductionOperation::SUM:
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index a2a44fca18..5b8e196a2c 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -909,7 +909,7 @@ void NEScaleKernel::scale_area_nchw(const Window &window)
void NEScaleKernel::scale_nhwc(const Window &window)
{
// Get data layout and width/height indices
- const DataLayout data_layout = _input->info()->data_layout();
+ const DataLayout data_layout = DataLayout::NHWC;
const int idx_channels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
index 4803365013..ffd2dc14bf 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
@@ -66,7 +66,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, i
} // namespace
NESpaceToDepthLayerKernel::NESpaceToDepthLayerKernel()
- : _input(nullptr), _output(nullptr), _block_shape()
+ : _input(nullptr), _output(nullptr), _block_shape(), _data_layout(DataLayout::UNKNOWN)
{
}
@@ -82,6 +82,7 @@ void NESpaceToDepthLayerKernel::configure(const ITensor *input, ITensor *output,
_input = input;
_block_shape = block_shape;
_output = output;
+ _data_layout = input->info()->data_layout();
// Configure kernel window
Window win = calculate_max_window(*output->info(), Steps());
@@ -100,9 +101,8 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
- const DataLayout data_layout = _input->info()->data_layout();
- const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int element_size = _input->info()->element_size();
+ const int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+ const int element_size = _input->info()->element_size();
const size_t channel_size = _input->info()->dimension(channel_idx);
@@ -111,7 +111,7 @@ void NESpaceToDepthLayerKernel::run(const Window &window, const ThreadInfo &info
int batch_id = 0;
// Main loop for NCHW and NHWC
- if(_output->info()->data_layout() == DataLayout::NCHW)
+ if(_data_layout == DataLayout::NCHW)
{
do
{
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index a3634cd46e..c5de43da35 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -26,20 +26,81 @@
#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/ToolchainSupport.h"
namespace arm_compute
{
+namespace
+{
+Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(keep_dims);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
+
+ const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ const int input_dims = input->num_dimensions();
+ Coordinates axis_local = reduction_axis;
+
+ for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
+ {
+ // Each reduction axis must lie in the range [-rank(input_tensor), rank(input_tensor)).
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
+ }
+
+ if(output->tensor_shape().total_size() != 0)
+ {
+ // Only validate if not using auto_init for the output tensor
+ TensorShape out_shape = input->tensor_shape();
+ // Convert negative axes and sort them, so that dimensions can be removed in ascending order
+ convert_negative_axis(axis_local, input_dims);
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for(unsigned int i = 0; i < reduction_ops; ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
+ if(output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if(keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i]));
+ const unsigned int remove_index = axis_local[i] - i;
+ ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions());
+ out_shape.remove_dimension(remove_index);
+ }
+ }
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
+ }
+ return Status{};
+}
+}
CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
{
}
void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
+ // Output auto initialization if not yet initialized
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims);
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
_reduction_ops = reduction_axis.num_dimensions();
_reduction_kernels.resize(_reduction_ops);
@@ -49,14 +110,10 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
Coordinates axis_local = reduction_axis;
const int input_dims = input->info()->num_dimensions();
- // Convert negative axis
- for(unsigned int i = 0; i < _reduction_ops; ++i)
- {
- axis_local[i] = wrap_around(axis_local[i], input_dims);
- }
+ convert_negative_axis(axis_local, input_dims);
// Perform reduction for every axis
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(int i = 0; i < _reduction_ops; ++i)
{
TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
@@ -75,7 +132,7 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
}
// Allocate intermediate tensors
- for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
{
_reduced_outs[i].allocator()->allocate();
}
@@ -88,7 +145,7 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(int i = 0; i < _reduction_ops; ++i)
{
out_shape.remove_dimension(axis_local[i] - i);
}
@@ -99,55 +156,16 @@ void CLReduceMean::configure(ICLTensor *input, const Coordinates &reduction_axis
Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
-
- TensorShape out_shape = input->tensor_shape();
-
- Coordinates axis_sorted = reduction_axis;
- const unsigned int reduction_ops = reduction_axis.num_dimensions();
- const int input_dims = input->num_dimensions();
-
- // Convert negative axis
- for(unsigned int i = 0; i < reduction_ops; ++i)
- {
- axis_sorted[i] = wrap_around(axis_sorted[i], input_dims);
- }
-
- std::sort(axis_sorted.begin(), axis_sorted.begin() + reduction_ops);
- for(unsigned int i = 0; i < reduction_ops; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(axis_sorted[i] > 3);
- ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_sorted[i]) > input->num_dimensions() - 1);
- if(output->total_size() > 0 && keep_dims)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_sorted[i]) != 1);
- }
- if(keep_dims)
- {
- out_shape.set(axis_sorted[i], 1);
- }
- else
- {
- out_shape.remove_dimension(axis_sorted[i] - i);
- }
- }
-
- const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-
- return Status{};
+ return validate_config(input, reduction_axis, keep_dims, output);
}
void CLReduceMean::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(auto &kernel : _reduction_kernels)
{
- _reduction_kernels[i].run();
+ kernel.run();
}
if(!_keep_dims)
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 0b145f034d..96ec8b8587 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -24,80 +24,97 @@
#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-using namespace arm_compute;
+namespace arm_compute
+{
NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
{
}
-Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
{
ARM_COMPUTE_UNUSED(keep_dims);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
- TensorShape out_shape = input->tensor_shape();
const unsigned int reduction_ops = reduction_axis.num_dimensions();
const int input_dims = input->num_dimensions();
Coordinates axis_local = reduction_axis;
- // Convert negative axis
- for(unsigned int i = 0; i < reduction_ops; ++i)
+ for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
{
- axis_local[i] = wrap_around(axis_local[i], input_dims);
+ //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
}
- std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
- for(unsigned int i = 0; i < reduction_ops; ++i)
+ if(output->tensor_shape().total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
- ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
- if(output->total_size() > 0 && keep_dims)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
- }
- if(keep_dims)
- {
- out_shape.set(axis_local[i], 1);
- }
- else
+ // Only validate if not using auto_init for the output tensor
+ TensorShape out_shape = input->tensor_shape();
+ // Validate output_shape only if not using auto_init
+ convert_negative_axis(axis_local, input_dims);
+ std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
+ for(unsigned int i = 0; i < reduction_ops; ++i)
{
- out_shape.remove_dimension(axis_local[i] - i);
+ ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
+ ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
+ if(output->total_size() > 0 && keep_dims)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
+ }
+ if(keep_dims)
+ {
+ out_shape.set(axis_local[i], 1);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(i > static_cast<unsigned int>(axis_local[i]));
+ const unsigned int remove_index = axis_local[i] - i;
+ ARM_COMPUTE_RETURN_ERROR_ON(remove_index >= out_shape.num_dimensions());
+ out_shape.remove_dimension(remove_index);
+ }
}
+ const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
}
- const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-
return Status{};
}
+Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+{
+ return validate_config(input, reduction_axis, keep_dims, output);
+}
+
void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
+ // Output auto initialization if not yet initialized
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input, reduction_axis, keep_dims);
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
_reduction_ops = reduction_axis.num_dimensions();
_reduction_kernels.resize(_reduction_ops);
_reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
_keep_dims = keep_dims;
- Coordinates axis_local = reduction_axis;
- const int input_dims = input->info()->num_dimensions();
- const unsigned int reduction_ops = reduction_axis.num_dimensions();
+ Coordinates axis_local = reduction_axis;
+ const int input_dims = input->info()->num_dimensions();
- // Convert negative axis
- for(unsigned int i = 0; i < reduction_ops; ++i)
- {
- axis_local[i] = wrap_around(axis_local[i], input_dims);
- }
+ convert_negative_axis(axis_local, input_dims);
// Perform reduction for every axis
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(int i = 0; i < _reduction_ops; ++i)
{
TensorShape out_shape = i == 0 ? input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
out_shape.set(axis_local[i], 1);
@@ -116,7 +133,7 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
}
// Allocate intermediate tensors
- for(unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+ for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
{
_reduced_outs[i].allocator()->allocate();
}
@@ -125,11 +142,10 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
if(!keep_dims)
{
TensorShape out_shape = input->info()->tensor_shape();
-
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(int i = 0; i < _reduction_ops; ++i)
{
out_shape.remove_dimension(axis_local[i] - i);
}
@@ -141,10 +157,9 @@ void NEReduceMean::configure(ITensor *input, const Coordinates &reduction_axis,
void NEReduceMean::run()
{
MemoryGroupResourceScope scope_mg(_memory_group);
-
- for(unsigned int i = 0; i < _reduction_ops; ++i)
+ for(auto &kernel : _reduction_kernels)
{
- _reduction_kernels[i].run();
+ kernel.run();
}
if(!_keep_dims)
@@ -152,3 +167,5 @@ void NEReduceMean::run()
_reshape.run();
}
}
+
+} // namespace arm_compute
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index 07ecf45d81..b479eb4953 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -203,7 +203,9 @@ public:
TensorShape{ 128U, 1U, 5U, 3U },
TensorShape{ 9U, 9U, 3U, 4U },
TensorShape{ 27U, 13U, 2U, 4U },
- TensorShape{ 1U, 1U, 1U, 5U }
+ TensorShape{ 1U, 1U, 1U, 5U },
+ TensorShape{ 1U, 16U, 10U, 2U, 128U },
+ TensorShape{ 1U, 16U, 10U, 2U, 128U }
}),
ShapeDataset("Shape1",
{
@@ -212,7 +214,9 @@ public:
TensorShape{ 128U, 64U, 1U, 3U },
TensorShape{ 9U, 1U, 3U },
TensorShape{ 1U },
- TensorShape{ 9U, 9U, 3U, 5U }
+ TensorShape{ 9U, 9U, 3U, 5U },
+ TensorShape{ 1U, 1U, 1U, 1U, 128U },
+ TensorShape{ 128U }
}))
{
}
@@ -686,7 +690,7 @@ public:
: ShapeDataset("InputShape",
{
// Batch size 1
- TensorShape{ 32U, 37U, 3U },
+ TensorShape{ 32U, 37U, 3U },
// Batch size 4
TensorShape{ 32U, 37U, 3U, 4U },
})
@@ -702,7 +706,7 @@ public:
: ShapeDataset("InputShape",
{
// Batch size 1
- TensorShape{ 32U, 37U, 3U },
+ TensorShape{ 32U, 37U, 3U },
// Batch size 4
TensorShape{ 32U, 37U, 3U, 4U },
// Arbitrary batch size
diff --git a/tests/validation/CL/ReduceMean.cpp b/tests/validation/CL/ReduceMean.cpp
index cfd4a2730c..1b7400bf53 100644
--- a/tests/validation/CL/ReduceMean.cpp
+++ b/tests/validation/CL/ReduceMean.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,20 +55,26 @@ TEST_SUITE(ReduceMean)
// *INDENT-OFF*
// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid axis
TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid output shape
- TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32)
+ TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32),// OK
+ TensorInfo(TensorShape{228U, 19U, 2U, 2U}, 1, DataType::F32),// OK
+ TensorInfo(TensorShape{228U, 19U, 2U, 1U}, 1, DataType::F32) // Invalid: axis 3 is not valid for this input
}),
framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::F32)
+ TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(19U), 1, DataType::F32),
+ TensorInfo(TensorShape(19U), 1, DataType::F32)
+
})),
- framework::dataset::make("Axis", { Coordinates(4), Coordinates(0,2), Coordinates(2) })),
- framework::dataset::make("Expected", { false, false, true })),
- input_info, output_info, axis, expected)
+ framework::dataset::make("Axis", { Coordinates(4), Coordinates(0,2), Coordinates(2), Coordinates(3,2,0), Coordinates(3,2,0) })),
+ framework::dataset::make("Keep", { true, true, true, false, false })),
+ framework::dataset::make("Expected", { false, false, true, true, false })),
+ input_info, output_info, axis, keep, expected)
{
- const Status status = CLReduceMean::validate(&input_info.clone()->set_is_resizable(false), axis, true, &output_info.clone()->set_is_resizable(false));
+ const Status status = CLReduceMean::validate(&input_info.clone()->set_is_resizable(false), axis, keep, &output_info.clone()->set_is_resizable(false));
ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
}
// clang-format on
diff --git a/tests/validation/NEON/ReduceMean.cpp b/tests/validation/NEON/ReduceMean.cpp
index 3cd7ce362e..6d0caf7160 100644
--- a/tests/validation/NEON/ReduceMean.cpp
+++ b/tests/validation/NEON/ReduceMean.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,20 +57,26 @@ TEST_SUITE(ReduceMean)
// *INDENT-OFF*
// clang-format off
-DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid axis
TensorInfo(TensorShape(27U, 3U, 16U, 2U), 1, DataType::F32), // Invalid output shape
- TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32)
+ TensorInfo(TensorShape(32U, 16U, 16U, 2U), 1, DataType::F32),// OK
+ TensorInfo(TensorShape{228U, 19U, 2U, 2U}, 1, DataType::F32),// OK
+ TensorInfo(TensorShape{228U, 19U, 2U, 1U}, 1, DataType::F32) // Invalid: axis 3 is not valid for this input
}),
framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32),
TensorInfo(TensorShape(27U, 3U, 1U, 2U), 1, DataType::F32),
- TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::F32)
+ TensorInfo(TensorShape(32U, 16U, 1U, 2U), 1, DataType::F32),
+ TensorInfo(TensorShape(19U), 1, DataType::F32),
+ TensorInfo(TensorShape(19U), 1, DataType::F32)
+
})),
- framework::dataset::make("Axis", { Coordinates(4), Coordinates(0,2), Coordinates(2) })),
- framework::dataset::make("Expected", { false, false, true })),
- input_info, output_info, axis, expected)
+ framework::dataset::make("Axis", { Coordinates(4), Coordinates(0,2), Coordinates(2), Coordinates(3,2,0), Coordinates(3,2,0) })),
+ framework::dataset::make("Keep", { true, true, true, false, false })),
+ framework::dataset::make("Expected", { false, false, true, true, false })),
+ input_info, output_info, axis, keep, expected)
{
- const Status status = NEReduceMean::validate(&input_info.clone()->set_is_resizable(false), axis, true, &output_info.clone()->set_is_resizable(false));
+ const Status status = NEReduceMean::validate(&input_info.clone()->set_is_resizable(false), axis, keep, &output_info.clone()->set_is_resizable(false));
ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
}
// clang-format on