aboutsummaryrefslogtreecommitdiff
path: root/arm_compute/runtime/NEON
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2018-06-05 14:56:06 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:53:09 +0000
commit72219330fd85b1271e714d4ba894d6d8e26340c9 (patch)
tree9ae0510087a1ca77b1695252a8621de3f2ab98af /arm_compute/runtime/NEON
parentc42f28d45e9b990276d54880d2cee9c9ee675a41 (diff)
downloadComputeLibrary-72219330fd85b1271e714d4ba894d6d8e26340c9.tar.gz
COMPMID-1145: (API) Introduce prepare() stage (NEON/CL/GLES)
Change-Id: I5b46764f9c3154ec3e3b9c951cc9e6dfbcb81dfb Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/134255 Reviewed-by: Anthony Barbier <anthony.barbier@arm.com> Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Pablo Tello <pablo.tello@arm.com> Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
Diffstat (limited to 'arm_compute/runtime/NEON')
-rw-r--r--arm_compute/runtime/NEON/AssemblyHelper.h52
-rw-r--r--arm_compute/runtime/NEON/functions/NEConvolutionLayer.h1
-rw-r--r--arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h7
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMM.h15
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h12
-rw-r--r--arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h3
11 files changed, 77 insertions, 26 deletions
diff --git a/arm_compute/runtime/NEON/AssemblyHelper.h b/arm_compute/runtime/NEON/AssemblyHelper.h
index 3aa43ec96e..c4ba1a584e 100644
--- a/arm_compute/runtime/NEON/AssemblyHelper.h
+++ b/arm_compute/runtime/NEON/AssemblyHelper.h
@@ -51,7 +51,7 @@ public:
using TypeResult = TypeOutput;
/** Default constructor. */
AssemblyKernelGlue()
- : _gemm_kernel_asm(nullptr), _optimised_kernel(nullptr), _a(nullptr), _b(nullptr), _d(nullptr), _workspace(nullptr), _pretranspose(nullptr)
+ : _gemm_kernel_asm(nullptr), _optimised_kernel(nullptr), _a(nullptr), _b(nullptr), _d(nullptr), _workspace(nullptr), _pretranspose(nullptr), _is_prepared(false)
{
}
/** Assembly Gemm */
@@ -76,6 +76,31 @@ public:
ITensor *_workspace;
/** Pre-transpose tensor */
ITensor *_pretranspose;
+ /** Prepared flag */
+ bool _is_prepared;
+
+ /** Runs a preparation step, usually for pre-transposing matrix b */
+ void prepare()
+ {
+ // Pretranspose B if required
+ if(_gemm_kernel_asm->B_pretranspose_required())
+ {
+ const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
+ const int multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
+
+ // Forcing 128-byte alignment (required by 32-bit kernels)
+ const unsigned int alignment = 128;
+ void *raw_ptr = reinterpret_cast<void *>(_pretranspose->buffer());
+ size_t space = _pretranspose->info()->total_size();
+ void *aligned_ptr = support::cpp11::align(alignment, _gemm_kernel_asm->get_B_pretransposed_array_size(), raw_ptr, space);
+ ARM_COMPUTE_ERROR_ON(_pretranspose == nullptr || _pretranspose->buffer() == nullptr);
+ _gemm_kernel_asm->pretranspose_B_array(aligned_ptr, in1_ptr, ldb, multi_stride_b);
+ _b->mark_as_unused();
+ }
+
+ _is_prepared = true;
+ }
/** Configures the arrays pointers and strides in the assembly kernel and executes the assembly kernel.
* The call to set_arrays is needed to deal with the input sizes containing batches (dims > 2)
@@ -102,28 +127,25 @@ public:
const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
auto out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer());
- // Set workspace if needed
+ // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
if(_workspace != nullptr)
{
_gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace->buffer()));
+ const unsigned int window_size = _gemm_kernel_asm->get_window_size();
+ unsigned int num_threads = NEScheduler::get().num_threads();
+ if(window_size < num_threads)
+ {
+ num_threads = window_size;
+ _gemm_kernel_asm->set_nthreads(num_threads);
+ }
}
+ // Prepare assembly kernel
+ prepare();
+
// Set gemm parameters
_gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d);
- // Pretranspose B if required
- if(_gemm_kernel_asm->B_pretranspose_required())
- {
- // Forcing 128-byte alignment (required by 32-bit kernels)
- const unsigned int alignment = 128;
- void *raw_ptr = reinterpret_cast<void *>(_pretranspose->buffer());
- size_t space = _pretranspose->info()->total_size();
- void *aligned_ptr = support::cpp11::align(alignment, _gemm_kernel_asm->get_B_pretransposed_array_size(), raw_ptr, space);
- ARM_COMPUTE_ERROR_ON(_pretranspose == nullptr || _pretranspose->buffer() == nullptr);
- _gemm_kernel_asm->pretranspose_B_array(aligned_ptr, in1_ptr, ldb, multi_stride_b);
- _b->mark_as_unused();
- }
-
// Schedule assembly kernel
NEScheduler::get().schedule(_optimised_kernel.get(), Window::DimX);
}
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index ff41f0c985..e143814a4e 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -112,6 +112,7 @@ public:
const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
std::shared_ptr<IMemoryManager> _memory_manager;
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index 66c6d427ba..3e527168c1 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -108,6 +108,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
MemoryGroup _memory_group;
@@ -117,6 +118,7 @@ private:
ITensor *_input;
PadStrideInfo _info;
std::pair<unsigned int, unsigned int> _inner_border;
+ bool _is_prepared;
};
} // arm_compute
#endif /* __ARM_COMPUTE_NEDECONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index b80fb7f2c8..aa4cace7c2 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -122,6 +122,7 @@ public:
// Inherited methods overriden:
void run() override;
+ void prepare() override;
private:
NEDepthwiseIm2ColKernel _im2col_kernel;
@@ -135,7 +136,7 @@ private:
Tensor _weights_reshaped;
Tensor _v2mm_output;
Tensor _output_reshaped;
- bool _is_first_run;
+ bool _is_prepared;
bool _is_quantized;
const ITensor *_original_weights;
};
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
index 0562c07515..99e93ccece 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,6 +70,7 @@ public:
// Inherited methods overriden:
void run() override;
+ void prepare() override;
private:
NEDepthwiseConvolutionLayer _depthwise_conv;
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 071eecc3f7..2739f5ebef 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -127,22 +127,23 @@ public:
//Inherited methods override
void run() override;
+ void prepare() override;
private:
MemoryGroup _memory_group;
NEIm2ColKernel _im2col_kernel;
- NEFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+ NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
NEGEMMInterleave4x4Kernel _interleave4x4_kernel;
NEGEMMMatrixMultiplyKernel _mm_kernel;
NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
Tensor _im2col_output;
Tensor _interleave4x4_output;
Tensor _reshape_weights_output;
- bool _are_weights_reshaped;
+ const ITensor *_original_weights;
bool _is_batched_fc_layer;
bool _linearize_input;
bool _accumulate_biases;
- const ITensor *_original_weights;
+ bool _is_prepared;
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index e2263c2307..5d108b2c14 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -53,7 +53,14 @@ class NEGEMM : public IFunction
public:
/** Constructor */
NEGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMM(const NEGEMM &) = delete;
+ /** Default move constructor */
+ NEGEMM(NEGEMM &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMM &operator=(const NEGEMM &) = delete;
+ /** Default move assignment operator */
+ NEGEMM &operator=(NEGEMM &&) = default;
/** Initialise the kernel's inputs, output
*
* @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
@@ -72,6 +79,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
MemoryGroup _memory_group;
@@ -84,10 +92,11 @@ private:
Tensor _tmp_b;
Tensor _workspace;
Tensor _B_pretransposed;
+ const ITensor *_original_b;
bool _run_vector_matrix_multiplication;
bool _run_addition;
- bool _is_first_run;
bool _reshape_b_only_on_first_run;
+ bool _is_prepared;
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEGEMM_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index d64fd9e771..7075becf75 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -153,6 +153,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
/** Configures the appropriate matrix multiply routine
@@ -197,6 +198,7 @@ private:
bool _is_interleaved;
bool _is_activationlayer_enabled;
bool _skip_im2col;
+ bool _is_prepared;
};
}
#endif /* __ARM_COMPUTE_NECONVOLUTIONGEMMLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index adcddb8263..f32eb3c757 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -56,6 +56,14 @@ class NEGEMMLowpMatrixMultiplyCore : public IFunction
public:
/** Constructor */
NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMLowpMatrixMultiplyCore(const NEGEMMLowpMatrixMultiplyCore &) = delete;
+ /** Default move constructor */
+ NEGEMMLowpMatrixMultiplyCore(NEGEMMLowpMatrixMultiplyCore &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMLowpMatrixMultiplyCore &operator=(const NEGEMMLowpMatrixMultiplyCore &) = delete;
+ /** Default move assignment operator */
+ NEGEMMLowpMatrixMultiplyCore &operator=(NEGEMMLowpMatrixMultiplyCore &&) = default;
/** Initialise the kernel's inputs, output
*
* @note GEMM_LOWP: low precision GEMM kernel
@@ -86,6 +94,7 @@ public:
// Inherited methods overridden
void run() override;
+ void prepare() override;
private:
MemoryGroup _memory_group;
@@ -103,12 +112,13 @@ private:
Tensor _tmp_b;
Tensor _workspace;
Tensor _B_pretranspose;
+ const ITensor *_original_b;
int32_t _a_offset;
int32_t _b_offset;
bool _run_vector_matrix_multiplication;
bool _dot_product_path;
- bool _is_first_run;
bool _reshape_b_only_on_first_run;
+ bool _is_prepared;
};
}
#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
index 18cd27414e..7d1f124bb3 100644
--- a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
@@ -90,6 +90,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
MemoryGroup _memory_group;
@@ -100,7 +101,7 @@ private:
Tensor _input_im2col_reshaped;
Tensor _weights_reshaped;
Tensor _gemm_output;
- bool _is_first_run;
+ bool _is_prepared;
const ITensor *_original_weights;
};
}
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index 55921f78f3..c1260977c0 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -74,6 +74,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer
*
@@ -122,7 +123,7 @@ private:
const ITensor *_input;
const ITensor *_weights;
ITensor *_output;
- bool _reshaped_kernel;
+ bool _is_prepared;
bool _is_activationlayer_enabled;
};
}