author     Georgios Pinitas <georgios.pinitas@arm.com>    2018-06-05 14:56:06 +0100
committer  Anthony Barbier <anthony.barbier@arm.com>      2018-11-02 16:53:09 +0000
commit     72219330fd85b1271e714d4ba894d6d8e26340c9 (patch)
tree       9ae0510087a1ca77b1695252a8621de3f2ab98af
parent     c42f28d45e9b990276d54880d2cee9c9ee675a41 (diff)
download   ComputeLibrary-72219330fd85b1271e714d4ba894d6d8e26340c9.tar.gz
COMPMID-1145: (API) Introduce prepare() stage (NEON/CL/GLES)
Change-Id: I5b46764f9c3154ec3e3b9c951cc9e6dfbcb81dfb
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/134255
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
-rw-r--r--arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h2
-rw-r--r--arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h1
-rw-r--r--arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h14
-rw-r--r--arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h3
-rw-r--r--arm_compute/runtime/CL/functions/CLRNNLayer.h2
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h4
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h10
-rw-r--r--arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h13
-rw-r--r--arm_compute/runtime/NEON/AssemblyHelper.h52
-rw-r--r--arm_compute/runtime/NEON/functions/NEConvolutionLayer.h1
-rw-r--r--arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h7
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMM.h15
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h12
-rw-r--r--arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h3
-rw-r--r--arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h3
-rw-r--r--src/graph/GraphManager.cpp18
-rw-r--r--src/runtime/CL/functions/CLDeconvolutionLayer.cpp18
-rw-r--r--src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp35
-rw-r--r--src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp10
-rw-r--r--src/runtime/CL/functions/CLGEMM.cpp6
-rw-r--r--src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp35
-rw-r--r--src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp68
-rw-r--r--src/runtime/CL/functions/CLLocallyConnectedLayer.cpp33
-rw-r--r--src/runtime/CL/functions/CLRNNLayer.cpp20
-rw-r--r--src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp42
-rw-r--r--src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp35
-rw-r--r--src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp45
-rw-r--r--src/runtime/NEON/functions/NEConvolutionLayer.cpp6
-rw-r--r--src/runtime/NEON/functions/NEDeconvolutionLayer.cpp18
-rw-r--r--src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp34
-rw-r--r--src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp10
-rw-r--r--src/runtime/NEON/functions/NEFullyConnectedLayer.cpp54
-rw-r--r--src/runtime/NEON/functions/NEGEMM.cpp48
-rw-r--r--src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp65
-rw-r--r--src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp88
-rw-r--r--src/runtime/NEON/functions/NELocallyConnectedLayer.cpp32
-rw-r--r--src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp40
43 files changed, 627 insertions, 291 deletions
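
Every function touched by this change follows the same pattern: one-off work that previously ran lazily inside run() (weight reshaping, pre-transposing matrix B, constant reduction kernels) moves into a new prepare() override guarded by an _is_prepared flag, and run() simply calls prepare() before scheduling its kernels. Once prepared, the original weight/B tensors are marked as unused so their backing memory can be reclaimed. The minimal standalone sketch below illustrates the shape of that pattern; the class and member names are illustrative only and are not symbols from this patch.

// Minimal standalone sketch of the prepare()/run() split (illustrative names only).
#include <iostream>

class ExampleFunction
{
public:
    // One-off work (e.g. weight reshaping or pre-transposing B) lives in prepare().
    void prepare()
    {
        if(!_is_prepared)
        {
            reshape_weights(); // runs exactly once per configure()
            _is_prepared = true;
        }
    }

    // run() stays cheap: it guards on the flag and then executes the per-inference kernels.
    void run()
    {
        prepare();
        std::cout << "execute per-inference kernels\n";
    }

private:
    void reshape_weights()
    {
        std::cout << "reshape weights once, then mark the original tensor as unused\n";
    }

    bool _is_prepared{ false };
};

int main()
{
    ExampleFunction f;
    f.prepare(); // a graph runtime can call this ahead of the first run()
    f.run();     // the one-off work is skipped here and on every later run
    f.run();
}

A graph runtime can then prepare all tasks up front (see the GraphManager.cpp change below) instead of relying on a throw-away first execution to trigger the one-off work.
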
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
index 82969301b0..7767b73e10 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
@@ -94,12 +94,14 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
CLMemoryGroup _memory_group;
CLDeconvolutionLayerUpsample _scale_f;
CLConvolutionLayer _conv_f;
CLTensor _scaled_output;
+ bool _is_prepared;
};
}
#endif /* __ARM_COMPUTE_CLDECONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index b1eb4b9e04..229fb24010 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -140,6 +140,7 @@ public:
// Inherited methods overriden:
void run() override;
+ void prepare() override;
private:
CLDepthwiseIm2ColKernel _im2col_kernel;
@@ -153,7 +154,7 @@ private:
CLTensor _weights_reshaped;
CLTensor _v2mm_output;
CLTensor _output_reshaped;
- bool _is_first_run;
+ bool _is_prepared;
bool _is_quantized;
const ICLTensor *_original_weights;
};
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h
index 27cee5ed3b..a43461048a 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,6 +70,7 @@ public:
// Inherited methods overriden:
void run() override;
+ void prepare() override;
private:
CLDepthwiseConvolutionLayer _depthwise_conv;
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index aaa432616d..3dde52989b 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -195,7 +195,6 @@ private:
bool _is_quantized;
bool _is_activationlayer_enabled;
bool _is_prepared;
- bool _retain_internal_weights;
};
}
#endif /* __ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index 3976704907..f404ccdf4c 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,6 +53,14 @@ class CLGEMMLowpMatrixMultiplyCore : public IFunction
public:
/** Constructor */
CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMLowpMatrixMultiplyCore(const CLGEMMLowpMatrixMultiplyCore &) = delete;
+ /** Default move constructor */
+ CLGEMMLowpMatrixMultiplyCore(CLGEMMLowpMatrixMultiplyCore &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMLowpMatrixMultiplyCore &operator=(const CLGEMMLowpMatrixMultiplyCore &) = delete;
+ /** Default move assignment operator */
+ CLGEMMLowpMatrixMultiplyCore &operator=(CLGEMMLowpMatrixMultiplyCore &&) = default;
/** Initialise the kernel's inputs, output
*
* @note GEMM_LOWP: low precision GEMM kernel
@@ -83,6 +91,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
CLMemoryGroup _memory_group;
@@ -96,11 +105,12 @@ private:
CLTensor _vector_sum_row;
CLTensor _tmp_a;
CLTensor _tmp_b;
+ const ICLTensor *_original_b;
int32_t _a_offset;
int32_t _b_offset;
bool _is_interleaved_transposed;
- bool _is_first_run;
bool _reshape_b_only_on_first_run;
+ bool _is_prepared;
};
}
#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
index b7b2587454..c2bb47c550 100644
--- a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
@@ -90,6 +90,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
CLMemoryGroup _memory_group;
@@ -100,7 +101,7 @@ private:
CLTensor _input_im2col_reshaped;
CLTensor _weights_reshaped;
CLTensor _gemm_output;
- bool _is_first_run;
+ bool _is_prepared;
const ICLTensor *_original_weights;
};
}
diff --git a/arm_compute/runtime/CL/functions/CLRNNLayer.h b/arm_compute/runtime/CL/functions/CLRNNLayer.h
index 9f239a9e64..ab7407dbfc 100644
--- a/arm_compute/runtime/CL/functions/CLRNNLayer.h
+++ b/arm_compute/runtime/CL/functions/CLRNNLayer.h
@@ -69,6 +69,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
CLMemoryGroup _memory_group;
@@ -80,6 +81,7 @@ private:
CLTensor _fully_connected_out;
CLTensor _gemm_output;
CLTensor _add_output;
+ bool _is_prepared;
};
}
#endif /* __ARM_COMPUTE_CLRNN_LAYER_H__ */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h
index fa29f447c8..45a883948c 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h
@@ -63,7 +63,6 @@ public:
private:
GCWeightsReshapeKernel _weights_reshape_kernel;
- GCTensor _weights_reshaped;
};
/** Basic function to compute the convolution layer. This function calls the following GLES kernels:
@@ -128,6 +127,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
/** Configures the appropriate matrix multiply routine
@@ -166,8 +166,8 @@ private:
GCTensor _gemm_output;
GCTensor _tmp_output;
- bool _is_first_run;
bool _is_activationlayer_enabled;
+ bool _is_prepared;
};
}
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h
index 1f8dc3e1a0..cd108c3eab 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h
@@ -65,6 +65,14 @@ class GCFullyConnectedLayer : public IFunction
public:
/** Constructor */
GCFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCFullyConnectedLayer(const GCFullyConnectedLayer &) = delete;
+ /** Default move constructor */
+ GCFullyConnectedLayer(GCFullyConnectedLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCFullyConnectedLayer &operator=(const GCFullyConnectedLayer &) = delete;
+ /** Default move assignment operator */
+ GCFullyConnectedLayer &operator=(GCFullyConnectedLayer &&) = default;
/** Set the input and output tensors.
*
* @param[in] input Source tensor. Data type supported: F16/F32.
@@ -81,6 +89,7 @@ public:
//Inherited methods override
void run() override;
+ void prepare() override;
private:
void configure_fc_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output);
@@ -93,6 +102,7 @@ private:
GCGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
GCTensor _im2col_output;
GCTensor _reshape_weights_output;
+ const IGCTensor *_original_weights;
bool _are_weights_reshaped;
bool _is_fc_after_conv;
bool _accumulate_biases;
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h
index a1d6c8a438..2db254527f 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h
@@ -50,7 +50,14 @@ class GCGEMM : public IFunction
public:
/** Default constructor. */
GCGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCGEMM(const GCGEMM &) = delete;
+ /** Default move constructor */
+ GCGEMM(GCGEMM &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ GCGEMM &operator=(const GCGEMM &) = delete;
+ /** Default move assignment operator */
+ GCGEMM &operator=(GCGEMM &&) = default;
/** Initialise the kernel's inputs and output
*
* @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
@@ -86,6 +93,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
GCMemoryGroup _memory_group;
@@ -95,10 +103,11 @@ private:
GCGEMMMatrixAdditionKernel _ma_kernel;
GCTensor _tmp_a;
GCTensor _tmp_b;
+ const IGCTensor *_original_b;
bool _is_interleaved_transposed;
bool _run_addition;
- bool _is_first_run;
bool _reshape_b_only_on_first_run;
+ bool _is_prepared;
};
}
diff --git a/arm_compute/runtime/NEON/AssemblyHelper.h b/arm_compute/runtime/NEON/AssemblyHelper.h
index 3aa43ec96e..c4ba1a584e 100644
--- a/arm_compute/runtime/NEON/AssemblyHelper.h
+++ b/arm_compute/runtime/NEON/AssemblyHelper.h
@@ -51,7 +51,7 @@ public:
using TypeResult = TypeOutput;
/** Default constructor. */
AssemblyKernelGlue()
- : _gemm_kernel_asm(nullptr), _optimised_kernel(nullptr), _a(nullptr), _b(nullptr), _d(nullptr), _workspace(nullptr), _pretranspose(nullptr)
+ : _gemm_kernel_asm(nullptr), _optimised_kernel(nullptr), _a(nullptr), _b(nullptr), _d(nullptr), _workspace(nullptr), _pretranspose(nullptr), _is_prepared(false)
{
}
/** Assembly Gemm */
@@ -76,6 +76,31 @@ public:
ITensor *_workspace;
/** Pre-transpose tensor */
ITensor *_pretranspose;
+ /** Prepared flag */
+ bool _is_prepared;
+
+ /** Runs a preparation step, usually for pre-transposing matrix b */
+ void prepare()
+ {
+ // Pretranspose B if required
+ if(_gemm_kernel_asm->B_pretranspose_required())
+ {
+ const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
+ const int multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
+
+ // Forcing 128-byte alignment (required by 32-bit kernels)
+ const unsigned int alignment = 128;
+ void *raw_ptr = reinterpret_cast<void *>(_pretranspose->buffer());
+ size_t space = _pretranspose->info()->total_size();
+ void *aligned_ptr = support::cpp11::align(alignment, _gemm_kernel_asm->get_B_pretransposed_array_size(), raw_ptr, space);
+ ARM_COMPUTE_ERROR_ON(_pretranspose == nullptr || _pretranspose->buffer() == nullptr);
+ _gemm_kernel_asm->pretranspose_B_array(aligned_ptr, in1_ptr, ldb, multi_stride_b);
+ _b->mark_as_unused();
+ }
+
+ _is_prepared = true;
+ }
/** Configures the arrays pointers and strides in the assembly kernel and executes the assembly kernel.
* The call to set_arrays is needed to deal with the input sizes containing batches (dims > 2)
@@ -102,28 +127,25 @@ public:
const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
auto out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer());
- // Set workspace if needed
+ // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
if(_workspace != nullptr)
{
_gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace->buffer()));
+ const unsigned int window_size = _gemm_kernel_asm->get_window_size();
+ unsigned int num_threads = NEScheduler::get().num_threads();
+ if(window_size < num_threads)
+ {
+ num_threads = window_size;
+ _gemm_kernel_asm->set_nthreads(num_threads);
+ }
}
+ // Prepare assembly kernel
+ prepare();
+
// Set gemm parameters
_gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr, ldd, batch_stride_d, multi_stride_d);
- // Pretranspose B if required
- if(_gemm_kernel_asm->B_pretranspose_required())
- {
- // Forcing 128-byte alignment (required by 32-bit kernels)
- const unsigned int alignment = 128;
- void *raw_ptr = reinterpret_cast<void *>(_pretranspose->buffer());
- size_t space = _pretranspose->info()->total_size();
- void *aligned_ptr = support::cpp11::align(alignment, _gemm_kernel_asm->get_B_pretransposed_array_size(), raw_ptr, space);
- ARM_COMPUTE_ERROR_ON(_pretranspose == nullptr || _pretranspose->buffer() == nullptr);
- _gemm_kernel_asm->pretranspose_B_array(aligned_ptr, in1_ptr, ldb, multi_stride_b);
- _b->mark_as_unused();
- }
-
// Schedule assembly kernel
NEScheduler::get().schedule(_optimised_kernel.get(), Window::DimX);
}
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index ff41f0c985..e143814a4e 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -112,6 +112,7 @@ public:
const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
std::shared_ptr<IMemoryManager> _memory_manager;
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index 66c6d427ba..3e527168c1 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -108,6 +108,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
MemoryGroup _memory_group;
@@ -117,6 +118,7 @@ private:
ITensor *_input;
PadStrideInfo _info;
std::pair<unsigned int, unsigned int> _inner_border;
+ bool _is_prepared;
};
} // arm_compute
#endif /* __ARM_COMPUTE_NEDECONVOLUTIONLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index b80fb7f2c8..aa4cace7c2 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -122,6 +122,7 @@ public:
// Inherited methods overriden:
void run() override;
+ void prepare() override;
private:
NEDepthwiseIm2ColKernel _im2col_kernel;
@@ -135,7 +136,7 @@ private:
Tensor _weights_reshaped;
Tensor _v2mm_output;
Tensor _output_reshaped;
- bool _is_first_run;
+ bool _is_prepared;
bool _is_quantized;
const ITensor *_original_weights;
};
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
index 0562c07515..99e93ccece 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -70,6 +70,7 @@ public:
// Inherited methods overriden:
void run() override;
+ void prepare() override;
private:
NEDepthwiseConvolutionLayer _depthwise_conv;
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index 071eecc3f7..2739f5ebef 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -127,22 +127,23 @@ public:
//Inherited methods override
void run() override;
+ void prepare() override;
private:
MemoryGroup _memory_group;
NEIm2ColKernel _im2col_kernel;
- NEFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
+ NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
NEGEMMInterleave4x4Kernel _interleave4x4_kernel;
NEGEMMMatrixMultiplyKernel _mm_kernel;
NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
Tensor _im2col_output;
Tensor _interleave4x4_output;
Tensor _reshape_weights_output;
- bool _are_weights_reshaped;
+ const ITensor *_original_weights;
bool _is_batched_fc_layer;
bool _linearize_input;
bool _accumulate_biases;
- const ITensor *_original_weights;
+ bool _is_prepared;
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index e2263c2307..5d108b2c14 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -53,7 +53,14 @@ class NEGEMM : public IFunction
public:
/** Constructor */
NEGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMM(const NEGEMM &) = delete;
+ /** Default move constructor */
+ NEGEMM(NEGEMM &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMM &operator=(const NEGEMM &) = delete;
+ /** Default move assignment operator */
+ NEGEMM &operator=(NEGEMM &&) = default;
/** Initialise the kernel's inputs, output
*
* @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
@@ -72,6 +79,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
MemoryGroup _memory_group;
@@ -84,10 +92,11 @@ private:
Tensor _tmp_b;
Tensor _workspace;
Tensor _B_pretransposed;
+ const ITensor *_original_b;
bool _run_vector_matrix_multiplication;
bool _run_addition;
- bool _is_first_run;
bool _reshape_b_only_on_first_run;
+ bool _is_prepared;
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEGEMM_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index d64fd9e771..7075becf75 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -153,6 +153,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
/** Configures the appropriate matrix multiply routine
@@ -197,6 +198,7 @@ private:
bool _is_interleaved;
bool _is_activationlayer_enabled;
bool _skip_im2col;
+ bool _is_prepared;
};
}
#endif /* __ARM_COMPUTE_NECONVOLUTIONGEMMLAYER_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index adcddb8263..f32eb3c757 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -56,6 +56,14 @@ class NEGEMMLowpMatrixMultiplyCore : public IFunction
public:
/** Constructor */
NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMLowpMatrixMultiplyCore(const NEGEMMLowpMatrixMultiplyCore &) = delete;
+ /** Default move constructor */
+ NEGEMMLowpMatrixMultiplyCore(NEGEMMLowpMatrixMultiplyCore &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMLowpMatrixMultiplyCore &operator=(const NEGEMMLowpMatrixMultiplyCore &) = delete;
+ /** Default move assignment operator */
+ NEGEMMLowpMatrixMultiplyCore &operator=(NEGEMMLowpMatrixMultiplyCore &&) = default;
/** Initialise the kernel's inputs, output
*
* @note GEMM_LOWP: low precision GEMM kernel
@@ -86,6 +94,7 @@ public:
// Inherited methods overridden
void run() override;
+ void prepare() override;
private:
MemoryGroup _memory_group;
@@ -103,12 +112,13 @@ private:
Tensor _tmp_b;
Tensor _workspace;
Tensor _B_pretranspose;
+ const ITensor *_original_b;
int32_t _a_offset;
int32_t _b_offset;
bool _run_vector_matrix_multiplication;
bool _dot_product_path;
- bool _is_first_run;
bool _reshape_b_only_on_first_run;
+ bool _is_prepared;
};
}
#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H__ */
diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
index 18cd27414e..7d1f124bb3 100644
--- a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
@@ -90,6 +90,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
private:
MemoryGroup _memory_group;
@@ -100,7 +101,7 @@ private:
Tensor _input_im2col_reshaped;
Tensor _weights_reshaped;
Tensor _gemm_output;
- bool _is_first_run;
+ bool _is_prepared;
const ITensor *_original_weights;
};
}
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index 55921f78f3..c1260977c0 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -74,6 +74,7 @@ public:
// Inherited methods overridden:
void run() override;
+ void prepare() override;
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer
*
@@ -122,7 +123,7 @@ private:
const ITensor *_input;
const ITensor *_weights;
ITensor *_output;
- bool _reshaped_kernel;
+ bool _is_prepared;
bool _is_activationlayer_enabled;
};
}
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp
index 0ea3254faa..10661ea275 100644
--- a/src/graph/GraphManager.cpp
+++ b/src/graph/GraphManager.cpp
@@ -76,12 +76,8 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &
detail::allocate_const_tensors(graph);
detail::call_all_const_node_accessors(graph);
- // TODO (COMPMID-920) : Update prepare for NEON/GC
- if(forced_target == Target::CL)
- {
- // Prepare graph
- detail::prepare_all_tasks(workload);
- }
+ // Prepare graph
+ detail::prepare_all_tasks(workload);
// Setup tensor memory (Allocate all tensors or setup transition manager)
if(ctx.config().use_transition_memory_manager)
@@ -99,16 +95,6 @@ void GraphManager::finalize_graph(Graph &graph, GraphContext &ctx, PassManager &
// Register graph
_workloads.insert(std::make_pair(graph.id(), std::move(workload)));
ARM_COMPUTE_LOG_GRAPH_VERBOSE("Created workload for graph with ID : " << graph.id().get() << std::endl);
-
- // TODO (COMPMID-920) : Update prepare for NEON/GC
- if(forced_target != Target::CL)
- {
- // Make first run
- execute_graph(graph);
-
- // Release all unused const tensors
- detail::release_unused_tensors(graph);
- }
}
void GraphManager::execute_graph(Graph &graph)
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 6c54b18b81..4c1ea5b9a2 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -38,7 +38,8 @@ CLDeconvolutionLayer::CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor
: _memory_group(std::move(memory_manager)),
_scale_f(),
_conv_f(),
- _scaled_output()
+ _scaled_output(),
+ _is_prepared(false)
{
}
@@ -104,6 +105,8 @@ void CLDeconvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights,
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info, inner_border_right, inner_border_top));
+ _is_prepared = false;
+
_memory_group.manage(&_scaled_output);
// configure scale function
@@ -126,8 +129,21 @@ void CLDeconvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights,
void CLDeconvolutionLayer::run()
{
+ prepare();
+
_memory_group.acquire();
+
_scale_f.run();
_conv_f.run();
+
_memory_group.release();
}
+
+void CLDeconvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ _conv_f.prepare();
+ _is_prepared = true;
+ }
+}
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index c2b24e3c20..1815361a72 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -91,7 +91,7 @@ void CLDepthwiseConvolutionLayer3x3::run()
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer()
: _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
- _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
+ _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _original_weights(nullptr)
{
}
@@ -104,7 +104,7 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w
const size_t weights_h = weights->info()->dimension(1);
const size_t weights_z = weights->info()->dimension(2);
- _is_first_run = true;
+ _is_prepared = false;
_original_weights = weights;
_is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
@@ -182,7 +182,6 @@ void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *w
// Allocate intermediate tensors
_input_reshaped.allocator()->allocate();
- _weights_reshaped.allocator()->allocate();
_v2mm_output.allocator()->allocate();
}
@@ -235,18 +234,7 @@ Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITe
void CLDepthwiseConvolutionLayer::run()
{
- // Run weights reshaping (Runs once for every configure)
- if(_is_first_run)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- CLScheduler::get().enqueue(_weights_reshape_kernel);
- CLScheduler::get().enqueue(_v2mm_weights_fill_border);
- _is_first_run = false;
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
- }
+ prepare();
CLScheduler::get().enqueue(_im2col_kernel);
CLScheduler::get().enqueue(_v2mm_input_fill_border);
@@ -257,3 +245,20 @@ void CLDepthwiseConvolutionLayer::run()
CLScheduler::get().enqueue(_output_stage_kernel);
}
}
+
+void CLDepthwiseConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights reshaping and mark original weights tensor as unused
+ _weights_reshaped.allocator()->allocate();
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ CLScheduler::get().enqueue(_v2mm_weights_fill_border);
+ _original_weights->mark_as_unused();
+
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
diff --git a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
index af2c6f0eb8..fa2c3affa3 100644
--- a/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseSeparableConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,6 +45,14 @@ void CLDepthwiseSeparableConvolutionLayer::configure(ICLTensor *input, const ICL
void CLDepthwiseSeparableConvolutionLayer::run()
{
+ prepare();
+
_depthwise_conv.run();
_pointwise_conv.run();
+}
+
+void CLDepthwiseSeparableConvolutionLayer::prepare()
+{
+ _depthwise_conv.prepare();
+ _pointwise_conv.prepare();
}
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index f9713bb586..bb76872700 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -84,12 +84,10 @@ void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info));
- // Store original b matrix
- _original_b = b;
-
// Check if we need to reshape the matrix B only on the first run
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_is_prepared = false;
+ _original_b = b;
const ICLTensor *matrix_a = a;
const ICLTensor *matrix_b = b;
@@ -262,7 +260,7 @@ void CLGEMM::prepare()
{
if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
{
- // Run transpose kernel
+ // Run transpose kernel and mark original weights tensor as unused
_tmp_b.allocator()->allocate();
CLScheduler::get().enqueue(_transpose_kernel, false);
_original_b->mark_as_unused();
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 27bed44098..82710b6461 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -91,8 +91,7 @@ void CLConvolutionLayerReshapeWeights::run()
CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
- _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false),
- _retain_internal_weights(false)
+ _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
{
}
@@ -166,10 +165,9 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
dilation,
act_info));
- _is_prepared = false;
- _original_weights = weights;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _retain_internal_weights = weights_info.retain_internal_weights();
+ _is_prepared = weights_info.retain_internal_weights();
+ _original_weights = weights;
+ _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
const DataType dt = input->info()->data_type();
@@ -408,23 +406,18 @@ void CLGEMMConvolutionLayer::prepare()
{
if(!_is_prepared)
{
- if(!_retain_internal_weights)
- {
- // Run weights reshaping and mark as unused
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
- _weights_reshaped.allocator()->allocate();
- _reshape_weights.run();
- _original_weights->mark_as_unused();
- }
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights reshaping and mark original weights tensor as unused
+ _weights_reshaped.allocator()->allocate();
+ _reshape_weights.run();
+ _original_weights->mark_as_unused();
- // Run GEMM prepare
- if(!_is_quantized)
+ // Prepare GEMM
+ _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
+ if(!_weights_reshaped.is_used())
{
- _mm_gemm.prepare();
- if(!_weights_reshaped.is_used() && !_retain_internal_weights)
- {
- _weights_reshaped.allocator()->free();
- }
+ _weights_reshaped.allocator()->free();
}
CLScheduler::get().queue().finish();
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 711b006ede..94dc0e071c 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -59,8 +59,23 @@ inline bool is_interleaved_transposed(int m, int n, int k, bool reshape_b_only_o
} // namespace
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
- _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _a_offset(0), _b_offset(0), _is_interleaved_transposed(true), _is_first_run(true), _reshape_b_only_on_first_run(false)
+ : _memory_group(std::move(memory_manager)),
+ _mm_kernel(),
+ _mtx_a_reshape_kernel(),
+ _mtx_b_reshape_kernel(),
+ _mtx_a_reduction_kernel(),
+ _mtx_b_reduction_kernel(),
+ _offset_contribution_kernel(),
+ _vector_sum_col(),
+ _vector_sum_row(),
+ _tmp_a(),
+ _tmp_b(),
+ _original_b(nullptr),
+ _a_offset(0),
+ _b_offset(0),
+ _is_interleaved_transposed(true),
+ _reshape_b_only_on_first_run(false),
+ _is_prepared(false)
{
}
@@ -70,6 +85,8 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
ARM_COMPUTE_UNUSED(gemm_info);
ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
+ _is_prepared = false;
+ _original_b = b;
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_a_offset = a->info()->quantization_info().offset;
_b_offset = b->info()->quantization_info().offset;
@@ -149,10 +166,13 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
if(_is_interleaved_transposed)
{
_tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
+ if(!_reshape_b_only_on_first_run)
+ {
+ _tmp_b.allocator()->allocate();
+ }
}
- if(_a_offset != 0)
+ if(_a_offset != 0 && !_reshape_b_only_on_first_run)
{
_vector_sum_col.allocator()->allocate();
}
@@ -234,6 +254,8 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
void CLGEMMLowpMatrixMultiplyCore::run()
{
+ prepare();
+
_memory_group.acquire();
if(_is_interleaved_transposed)
@@ -241,21 +263,17 @@ void CLGEMMLowpMatrixMultiplyCore::run()
// Run reshape matrix A
CLScheduler::get().enqueue(_mtx_a_reshape_kernel, false);
- if(_is_first_run || !_reshape_b_only_on_first_run)
+ if(!_reshape_b_only_on_first_run)
{
// Run reshape matrix B
CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
}
}
- // Note: if _reshape_b_only_on_first_run = true, the reduction kernel can be executed only once
- if(_is_first_run || !_reshape_b_only_on_first_run)
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0 && !_reshape_b_only_on_first_run)
{
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0)
- {
- CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
- }
+ CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
}
// Run matrix multiply
@@ -271,6 +289,30 @@ void CLGEMMLowpMatrixMultiplyCore::run()
CLScheduler::get().enqueue(_offset_contribution_kernel, true);
_memory_group.release();
+}
+
+void CLGEMMLowpMatrixMultiplyCore::prepare()
+{
+ if(!_is_prepared)
+ {
+ if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+ // Run reshape kernel and mark original weights tensor as unused
+ _tmp_b.allocator()->allocate();
+ CLScheduler::get().enqueue(_mtx_b_reshape_kernel, false);
+ _original_b->mark_as_unused();
+ }
- _is_first_run = false;
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if(_a_offset != 0 && _reshape_b_only_on_first_run)
+ {
+ _vector_sum_col.allocator()->allocate();
+ CLScheduler::get().enqueue(_mtx_b_reduction_kernel, false);
+ }
+
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
}
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 31d5cd5a7e..d15e5dfa3d 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -73,7 +73,7 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons
CLLocallyConnectedLayer::CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
- _is_first_run(false), _original_weights(nullptr)
+ _is_prepared(false), _original_weights(nullptr)
{
}
@@ -128,7 +128,7 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor
bool _has_bias = (biases != nullptr);
_original_weights = weights;
- _is_first_run = true;
+ _is_prepared = false;
const unsigned int kernel_width = weights->info()->dimension(0);
const unsigned int kernel_height = weights->info()->dimension(1);
@@ -160,7 +160,6 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor
_output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
// Allocate intermediate tensors
- _weights_reshaped.allocator()->allocate();
_input_im2col_reshaped.allocator()->allocate();
_gemm_output.allocator()->allocate();
@@ -169,17 +168,7 @@ void CLLocallyConnectedLayer::configure(const ICLTensor *input, const ICLTensor
void CLLocallyConnectedLayer::run()
{
- // Run weights reshaping (Runs once for every configure)
- if(_is_first_run)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- _is_first_run = false;
- CLScheduler::get().enqueue(_weights_reshape_kernel);
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
- }
+ prepare();
_memory_group.acquire();
@@ -194,3 +183,19 @@ void CLLocallyConnectedLayer::run()
_memory_group.release();
}
+
+void CLLocallyConnectedLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights reshaping and mark original weights tensor as unused
+ _weights_reshaped.allocator()->allocate();
+ CLScheduler::get().enqueue(_weights_reshape_kernel);
+ _original_weights->mark_as_unused();
+
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 4843ba6364..0e1b9d5b58 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -36,7 +36,8 @@ using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output()
+ : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
+ _is_prepared(false)
{
}
@@ -74,6 +75,8 @@ void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, con
const int idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
TensorShape shape = compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+ _is_prepared = false;
+
_fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
_gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
@@ -100,7 +103,10 @@ void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, con
void CLRNNLayer::run()
{
+ prepare();
+
_memory_group.acquire();
+
_fully_connected_kernel.run();
_gemm_state_f.run();
CLScheduler::get().enqueue(_add_kernel);
@@ -108,5 +114,17 @@ void CLRNNLayer::run()
// copy hidden out to output
CLScheduler::get().enqueue(_copy_kernel);
+
_memory_group.release();
+}
+
+void CLRNNLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ _fully_connected_kernel.prepare();
+ _gemm_state_f.prepare();
+
+ _is_prepared = true;
+ }
}
\ No newline at end of file
diff --git a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
index d1ef87d32c..67b2ae9d61 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.cpp
@@ -37,7 +37,7 @@
using namespace arm_compute;
GCConvolutionLayerReshapeWeights::GCConvolutionLayerReshapeWeights()
- : _weights_reshape_kernel(), _weights_reshaped()
+ : _weights_reshape_kernel()
{
}
@@ -68,7 +68,7 @@ void GCConvolutionLayerReshapeWeights::run()
GCConvolutionLayer::GCConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _reshape_weights(), _input_im2col_kernel(), _mm_gemm(), _output_col2im_kernel(), _fill_border(), _activationlayer_function(), _original_weights(nullptr),
- _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_first_run(true), _is_activationlayer_enabled(false)
+ _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _weights_transposed(), _gemm_output(), _tmp_output(), _is_activationlayer_enabled(false), _is_prepared(false)
{
}
@@ -97,7 +97,7 @@ void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weig
ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
- _is_first_run = true;
+ _is_prepared = false;
_original_weights = weights;
if(biases != nullptr)
@@ -184,9 +184,6 @@ void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weig
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(0) != conv_w) || (output->info()->dimension(1) != conv_h), "Output shape does not match the expected one");
- // Allocate intermediate tensor
- _weights_reshaped.allocator()->allocate();
-
//Configure Activation Layer
_is_activationlayer_enabled = act_info.enabled();
@@ -200,17 +197,7 @@ void GCConvolutionLayer::configure(const IGCTensor *input, const IGCTensor *weig
void GCConvolutionLayer::run()
{
- // Run weights reshaping (Runs once for every configure)
- if(_is_first_run)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- _reshape_weights.run();
- _is_first_run = false;
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
- }
+ prepare();
_memory_group.acquire();
@@ -221,17 +208,34 @@ void GCConvolutionLayer::run()
// Run gemm on reshaped matrices
_mm_gemm.run();
-
GCScheduler::get().memory_barrier();
+
// Reshape output matrix
GCScheduler::get().dispatch(_output_col2im_kernel, false);
+ GCScheduler::get().memory_barrier();
_memory_group.release();
- GCScheduler::get().memory_barrier();
// Run Activation Layer
if(_is_activationlayer_enabled)
{
_activationlayer_function.run();
}
}
+
+void GCConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights reshaping and mark as unused
+ _weights_reshaped.allocator()->allocate();
+ _reshape_weights.run();
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
+
+ _is_prepared = true;
+ }
+}
diff --git a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
index a300033bb2..ab2c6c2813 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.cpp
@@ -40,7 +40,7 @@ void GCFullyConnectedLayerReshapeWeights::configure(const IGCTensor *input, IGCT
GCFullyConnectedLayer::GCFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _reshape_weights_output(),
- _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
+ _original_weights(nullptr), _are_weights_reshaped(true), _is_fc_after_conv(true), _accumulate_biases(false)
{
}
@@ -86,6 +86,7 @@ void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *w
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 2);
+ _original_weights = weights;
_are_weights_reshaped = transpose_weights ? are_weights_reshaped : true;
_is_fc_after_conv = true;
_accumulate_biases = false;
@@ -141,25 +142,13 @@ void GCFullyConnectedLayer::configure(const IGCTensor *input, const IGCTensor *w
configure_fc_fc(input, weights_to_use, output);
}
- // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
- if(!_are_weights_reshaped && !retain_internal_weights)
- {
- // Allocate the tensor for the weights reshaped
- _reshape_weights_output.allocator()->allocate();
- }
-
ARM_COMPUTE_ERROR_ON(retain_internal_weights && _reshape_weights_output.gc_buffer() == 0);
_are_weights_reshaped = _are_weights_reshaped || retain_internal_weights;
}
void GCFullyConnectedLayer::run()
{
- // Reshape of the weights (happens only once)
- if(!_are_weights_reshaped)
- {
- _are_weights_reshaped = true;
- _reshape_weights_kernel.run();
- }
+ prepare();
_memory_group.acquire();
@@ -187,3 +176,21 @@ void GCFullyConnectedLayer::run()
_memory_group.release();
}
+
+void GCFullyConnectedLayer::prepare()
+{
+ // Reshape of the weights (happens only once)
+ if(!_are_weights_reshaped)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run reshape weights kernel and mark weights as unused
+ _reshape_weights_output.allocator()->allocate();
+ _reshape_weights_kernel.run();
+
+ // Mark original weights tensor as unused
+ _original_weights->mark_as_unused();
+
+ _are_weights_reshaped = true;
+ }
+}
\ No newline at end of file
diff --git a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
index 79f8f71713..8ae91ee82c 100644
--- a/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
+++ b/src/runtime/GLES_COMPUTE/functions/GCGEMM.cpp
@@ -73,8 +73,8 @@ Status validate_arguments(const ITensorInfo *a, const ITensorInfo *b, const IGCT
} // namespace
GCGEMM::GCGEMM(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _is_interleaved_transposed(false), _run_addition(false),
- _is_first_run(true), _reshape_b_only_on_first_run(false)
+ : _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _ma_kernel(), _tmp_a(), _tmp_b(), _original_b(nullptr), _is_interleaved_transposed(false),
+ _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
{
}
@@ -87,6 +87,8 @@ void GCGEMM::configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *
// Check if we need to reshape the matrix B only on the first run
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _is_prepared = false;
+ _original_b = b;
const IGCTensor *matrix_a = a;
const IGCTensor *matrix_b = b;
@@ -136,7 +138,10 @@ void GCGEMM::configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *
{
// Allocate intermediate tensors
_tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
+ if(!_reshape_b_only_on_first_run)
+ {
+ _tmp_b.allocator()->allocate();
+ }
}
// Configure matrix addition kernel
@@ -155,23 +160,21 @@ Status GCGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const IGCTen
void GCGEMM::run()
{
+ prepare();
+
_memory_group.acquire();
+
if(_is_interleaved_transposed)
{
// Run interleave kernel
GCScheduler::get().dispatch(_interleave_kernel, false);
- if(_is_first_run)
- {
- // Run transpose kernel
- GCScheduler::get().dispatch(_transpose_kernel, false);
- _is_first_run = false;
- }
- else if(!_reshape_b_only_on_first_run)
+ if(!_reshape_b_only_on_first_run)
{
// Run transpose kernel
GCScheduler::get().dispatch(_transpose_kernel, false);
}
+
GCScheduler::get().memory_barrier();
}
@@ -184,5 +187,27 @@ void GCGEMM::run()
GCScheduler::get().memory_barrier();
GCScheduler::get().dispatch(_ma_kernel);
}
+
_memory_group.release();
}
+
+void GCGEMM::prepare()
+{
+ if(!_is_prepared)
+ {
+ if(_is_interleaved_transposed && _reshape_b_only_on_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+ // Run transpose kernel
+ _tmp_b.allocator()->allocate();
+ GCScheduler::get().dispatch(_transpose_kernel, false);
+ GCScheduler::get().memory_barrier();
+
+ // Mark original weights tensor as unused
+ _original_b->mark_as_unused();
+ }
+
+ _is_prepared = true;
+ }
+}
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 96ac95f00c..4018407153 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -155,6 +155,12 @@ ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *
void NEConvolutionLayer::run()
{
+ prepare();
_function->run();
}
+
+void NEConvolutionLayer::prepare()
+{
+ _function->prepare();
+}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 40ada8f6cf..8051d6da0e 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -38,7 +38,8 @@ NEDeconvolutionLayer::NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memor
_scaled_output(),
_input(nullptr),
_info(),
- _inner_border()
+ _inner_border(),
+ _is_prepared(false)
{
}
@@ -104,6 +105,7 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
_input = input;
_info = info;
_inner_border = std::make_pair(inner_border_right, inner_border_top);
+ _is_prepared = false;
const unsigned int stride_x = info.stride().first;
const unsigned int stride_y = info.stride().second;
@@ -132,13 +134,21 @@ void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, con
void NEDeconvolutionLayer::run()
{
+ prepare();
+
_memory_group.acquire();
- // Run upsample kernel
_upsample_f.run();
-
- // Run convolution layer
_conv_f.run();
_memory_group.release();
+}
+
+void NEDeconvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ _conv_f.prepare();
+ _is_prepared = true;
+ }
}
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 0a977ad08d..83c3e217f3 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -172,7 +172,7 @@ void NEDepthwiseConvolutionLayer3x3::run()
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
: _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
- _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_first_run(true), _is_quantized(false), _original_weights(nullptr)
+ _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _original_weights(nullptr)
{
}
@@ -187,7 +187,7 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
const size_t weights_z = weights->info()->dimension(2);
_is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _is_first_run = true;
+ _is_prepared = false;
_original_weights = weights;
// Should bias be appended ?
@@ -260,24 +260,12 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
// Allocate intermediate tensors
_input_reshaped.allocator()->allocate();
- _weights_reshaped.allocator()->allocate();
_v2mm_output.allocator()->allocate();
}
void NEDepthwiseConvolutionLayer::run()
{
- // Run weights reshaping (Runs once for every configure)
- if(_is_first_run)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
- NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
- _is_first_run = false;
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
- }
+ prepare();
NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
@@ -288,3 +276,19 @@ void NEDepthwiseConvolutionLayer::run()
NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
}
}
+
+void NEDepthwiseConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run reshape and mark original weights as unused
+ _weights_reshaped.allocator()->allocate();
+ NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);
+ NEScheduler::get().schedule(&_v2mm_weights_fill_border, Window::DimX);
+ _original_weights->mark_as_unused();
+
+ _is_prepared = true;
+ }
+}
diff --git a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
index d70a6689ac..da2e49c730 100644
--- a/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseSeparableConvolutionLayer.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,6 +45,14 @@ void NEDepthwiseSeparableConvolutionLayer::configure(ITensor *input, const ITens
void NEDepthwiseSeparableConvolutionLayer::run()
{
+ prepare();
+
_depthwise_conv.run();
_pointwise_conv.run();
+}
+
+void NEDepthwiseSeparableConvolutionLayer::prepare()
+{
+ _depthwise_conv.prepare();
+ _pointwise_conv.prepare();
}
\ No newline at end of file
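For composite functions such as this one, prepare() just forwards to the sub-functions' prepare(); no guard flag is needed because each sub-function is itself idempotent. A standalone sketch of that composition, with hypothetical InnerFunction and CompositeFunction types standing in for the real layers:

// Hypothetical sketch: a composite function simply forwards prepare() to its members.
struct InnerFunction
{
    void prepare()
    {
        if(!_is_prepared)
        {
            // one-time work for this sub-function
            _is_prepared = true;
        }
    }
    void run()
    {
        prepare();
        // per-call work
    }
    bool _is_prepared{ false };
};

struct CompositeFunction
{
    void prepare()
    {
        _depthwise.prepare(); // idempotent, so unconditional forwarding is safe
        _pointwise.prepare();
    }
    void run()
    {
        prepare();
        _depthwise.run();
        _pointwise.run();
    }
    InnerFunction _depthwise;
    InnerFunction _pointwise;
};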
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 958d081fd2..5b9f182bcb 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -131,8 +131,8 @@ void NEFullyConnectedLayerReshapeWeights::run()
}
NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_kernel(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(), _interleave4x4_output(),
- _reshape_weights_output(), _are_weights_reshaped(false), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _original_weights(nullptr)
+ : _memory_group(std::move(memory_manager)), _im2col_kernel(), _reshape_weights_function(), _interleave4x4_kernel(), _mm_kernel(), _accumulate_biases_kernel(), _im2col_output(),
+ _interleave4x4_output(), _reshape_weights_output(), _original_weights(nullptr), _is_batched_fc_layer(false), _linearize_input(false), _accumulate_biases(false), _is_prepared(false)
{
}
@@ -163,16 +163,16 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh
const int num_input_dimensions = input->info()->tensor_shape().num_dimensions() - num_batch_dimensions;
const size_t linear_input_size = input->info()->tensor_shape().total_size_lower(num_input_dimensions);
- _original_weights = weights;
- _linearize_input = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
- _are_weights_reshaped = are_weights_reshaped;
- _accumulate_biases = biases != nullptr;
- _is_batched_fc_layer = num_batch_dimensions > 0;
+ _original_weights = weights;
+ _linearize_input = (input->info()->tensor_shape().x() != linear_input_size) || (num_input_dimensions > 1 && linear_input_size == 1);
+ _accumulate_biases = biases != nullptr;
+ _is_batched_fc_layer = num_batch_dimensions > 0;
+ _is_prepared = are_weights_reshaped || (!transpose_weights && !_is_batched_fc_layer);
const size_t interleave_width = 16 / input->info()->element_size();
const ITensor *weights_to_use = weights;
- if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
+ if(!_is_prepared)
{
weights_to_use = &_reshape_weights_output;
@@ -181,7 +181,7 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh
_is_batched_fc_layer, interleave_width)));
// Reshape the weights
- _reshape_weights_kernel.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
+ _reshape_weights_function.configure(weights, &_reshape_weights_output, transpose_weights, _is_batched_fc_layer);
}
const ITensor *multiply_input = input;
@@ -220,13 +220,6 @@ void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weigh
_accumulate_biases_kernel.configure(output, biases);
}
- // Allocate the transpose tensor if the are_weights_reshaped flag is false and once all the configure methods have been called
- if(!are_weights_reshaped && (transpose_weights || _is_batched_fc_layer))
- {
- // Allocate the tensor for the weights reshaped
- _reshape_weights_output.allocator()->allocate();
- }
-
if(_linearize_input)
{
_im2col_output.allocator()->allocate();
@@ -322,17 +315,7 @@ Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorIn
void NEFullyConnectedLayer::run()
{
- // Reshape of the weights (happens only once)
- if(!_are_weights_reshaped)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- _are_weights_reshaped = true;
- _reshape_weights_kernel.run();
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
- }
+ prepare();
_memory_group.acquire();
@@ -359,3 +342,20 @@ void NEFullyConnectedLayer::run()
_memory_group.release();
}
+
+void NEFullyConnectedLayer::prepare()
+{
+ // Reshape of the weights (happens only once)
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights reshape, clean internal tensors and mark original weights tensor as unused
+ _reshape_weights_output.allocator()->allocate();
+ _reshape_weights_function.run();
+ _reshape_weights_function = NEFullyConnectedLayerReshapeWeights();
+ _original_weights->mark_as_unused();
+
+ _is_prepared = true;
+ }
+}
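A detail specific to this hunk: once the one-shot weights reshape has run, the reshape function object is replaced by a freshly default-constructed one so that whatever state it held is released. A minimal sketch of that idiom under assumed, hypothetical types (ReshapeHelper, LayerWithOneShotReshape):

#include <cstddef>
#include <vector>

// Hypothetical sketch of dropping a helper's internal state after its one-shot use.
struct ReshapeHelper
{
    void configure(std::size_t scratch_bytes)
    {
        _scratch.resize(scratch_bytes); // stands in for internal tensors/kernels
    }
    void run()
    {
        // fill/consume _scratch
    }
    std::vector<unsigned char> _scratch;
};

struct LayerWithOneShotReshape
{
    void prepare()
    {
        if(!_is_prepared)
        {
            _reshape.run();
            _reshape     = ReshapeHelper(); // default-construct to release the scratch storage
            _is_prepared = true;
        }
    }
    ReshapeHelper _reshape;
    bool          _is_prepared{ false };
};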
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 9168ed4327..a98309d304 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -40,7 +40,7 @@ namespace arm_compute
{
NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(), _ma_kernel(), _tmp_a(), _tmp_b(), _workspace(), _B_pretransposed(),
- _run_vector_matrix_multiplication(false), _run_addition(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
+ _original_b(nullptr), _run_vector_matrix_multiplication(false), _run_addition(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
{
}
@@ -63,8 +63,11 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
}
// Check if we need to reshape the matrix B only on the first run
+ _is_prepared = false;
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
+ _original_b = b;
+ _asm_glue._optimised_kernel = nullptr;
const bool run_optimised = a->info()->data_type() == DataType::F32 && (c == nullptr || beta == 0.f)
&& setup_assembly_kernel(a, b, d, alpha, beta, _reshape_b_only_on_first_run, _workspace, _B_pretransposed, _memory_group, _asm_glue);
@@ -128,7 +131,10 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
    // Allocate once all the configure methods have been called
_tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
+ if(!_reshape_b_only_on_first_run)
+ {
+ _tmp_b.allocator()->allocate();
+ }
// Configure matrix addition kernel
if(beta != 0 && c != nullptr)
@@ -142,28 +148,24 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
void NEGEMM::run()
{
- _memory_group.acquire();
+ prepare();
if(_asm_glue._optimised_kernel != nullptr)
{
+ _memory_group.acquire();
_asm_glue.run();
_memory_group.release();
}
else
{
+ _memory_group.acquire();
+
if(!_run_vector_matrix_multiplication)
{
// Run interleave kernel
NEScheduler::get().schedule(&_interleave_kernel, Window::DimY);
- if(_is_first_run)
- {
- // Run transpose kernel
- NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
-
- _is_first_run = false;
- }
- else if(!_reshape_b_only_on_first_run)
+ if(!_reshape_b_only_on_first_run)
{
// Run transpose kernel
NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
@@ -181,4 +183,28 @@ void NEGEMM::run()
}
}
}
+
+void NEGEMM::prepare()
+{
+ if(!_is_prepared)
+ {
+ if(_asm_glue._optimised_kernel)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+ _asm_glue.prepare();
+ _original_b->mark_as_unused();
+ }
+ else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue._optimised_kernel)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+ _tmp_b.allocator()->allocate();
+ NEScheduler::get().schedule(&_transpose_kernel, Window::DimY);
+ _original_b->mark_as_unused();
+ }
+
+ _is_prepared = true;
+ }
+}
} // namespace arm_compute
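The memory change in this hunk is that the reshaped-B buffer is no longer allocated eagerly in configure(); when B is reshaped only on the first run, the allocation moves into prepare() and is paid only if and when the function is actually prepared. A rough standalone sketch of that lazy allocation, with hypothetical Buffer and GemmLike types (not the library's API):

#include <cstddef>
#include <memory>

// Hypothetical sketch of deferring a one-time buffer allocation from configure() to prepare().
struct Buffer
{
    void allocate(std::size_t bytes)
    {
        _data = std::make_unique<unsigned char[]>(bytes);
    }
    std::unique_ptr<unsigned char[]> _data;
};

struct GemmLike
{
    void configure(bool reshape_b_only_on_first_run, std::size_t reshaped_b_bytes)
    {
        _reshape_b_only_on_first_run = reshape_b_only_on_first_run;
        _reshaped_b_bytes            = reshaped_b_bytes;
        _is_prepared                 = false;
        if(!_reshape_b_only_on_first_run)
        {
            _tmp_b.allocate(_reshaped_b_bytes); // B is reshaped every run: keep the buffer around
        }
    }
    void prepare()
    {
        if(!_is_prepared)
        {
            if(_reshape_b_only_on_first_run)
            {
                _tmp_b.allocate(_reshaped_b_bytes); // allocated only now, then reused by every run()
                // ... schedule the B transpose/reshape kernel exactly once ...
            }
            _is_prepared = true;
        }
    }
    Buffer      _tmp_b;
    std::size_t _reshaped_b_bytes{ 0 };
    bool        _reshape_b_only_on_first_run{ false };
    bool        _is_prepared{ false };
};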
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 303691aa7a..d4400b8864 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -231,7 +231,7 @@ NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryMana
: _asm_glue(), _memory_group(memory_manager), _input_im2col_kernel(), _input_interleave_kernel(), _reshape_weights(), _mm_kernel(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(),
_output_col2im_kernel(), _activationlayer_function(), _add_bias_kernel(), _original_weights(nullptr), _input_im2col_reshaped(), _input_interleaved_reshaped(), _weights_reshaped(), _gemm_output(),
_tmp_output(), _workspace(), _B_pretransposed(), _data_layout(DataLayout::NCHW), _append_bias(false), _is_fully_connected_convolution(false), _are_weights_reshaped(false), _is_quantized(false),
- _is_interleaved(false), _is_activationlayer_enabled(false), _skip_im2col(false)
+ _is_interleaved(false), _is_activationlayer_enabled(false), _skip_im2col(false), _is_prepared(false)
{
}
@@ -287,6 +287,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
ARM_COMPUTE_ERROR_THROW_ON(status);
+ _is_prepared = false;
_original_weights = weights;
const unsigned int fixed_point_position = input->info()->fixed_point_position();
const ITensor *biases_to_use = (_append_bias) ? biases : nullptr;
@@ -442,12 +443,6 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h), "Output shape does not match the expected one");
- // Allocate intermediate tensor
- if(!_are_weights_reshaped)
- {
- _weights_reshaped.allocator()->allocate();
- }
-
//Configure Activation Layer
if(_is_activationlayer_enabled)
{
@@ -585,17 +580,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
void NEGEMMConvolutionLayer::run()
{
- // Run weights reshaping (Runs once for every configure)
- if(!_are_weights_reshaped)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- _are_weights_reshaped = true;
- _reshape_weights.run();
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
- }
+ prepare();
_memory_group.acquire();
@@ -610,11 +595,6 @@ void NEGEMMConvolutionLayer::run()
if(_asm_glue._optimised_kernel != nullptr)
{
_asm_glue.run();
- // Release weights in case buffer is pretransposed
- if(!_weights_reshaped.is_used())
- {
- _weights_reshaped.allocator()->free();
- }
}
else
{
@@ -659,4 +639,43 @@ void NEGEMMConvolutionLayer::run()
_memory_group.release();
}
+
+void NEGEMMConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Run weights reshaping (runs only once per configure() call)
+ if(!_are_weights_reshaped)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ _weights_reshaped.allocator()->allocate();
+ _reshape_weights.run();
+ _reshape_weights = NEConvolutionLayerReshapeWeights();
+ _original_weights->mark_as_unused();
+ _are_weights_reshaped = true;
+ }
+
+ // Run GEMM prepare stage
+ if(_asm_glue._optimised_kernel)
+ {
+ _asm_glue.prepare();
+ }
+ else
+ {
+ if(_is_quantized)
+ {
+ _mm_gemmlowp.prepare();
+ }
+ }
+
+ // Release the reshaped weights if the buffer has been pretransposed and is no longer used
+ if(!_weights_reshaped.is_used())
+ {
+ _weights_reshaped.allocator()->free();
+ }
+
+ _is_prepared = true;
+ }
+}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 2e06fa2ef4..a92ffa7c7b 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -42,8 +42,8 @@ using namespace arm_compute::misc::shape_calculator;
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _asm_glue_unsigned(), _asm_glue_signed(), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
- _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _a_offset(0), _b_offset(0),
- _run_vector_matrix_multiplication(false), _dot_product_path(false), _is_first_run(true), _reshape_b_only_on_first_run(false)
+ _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _workspace(), _B_pretranspose(), _original_b(nullptr), _a_offset(0), _b_offset(0),
+ _run_vector_matrix_multiplication(false), _dot_product_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false)
{
}
@@ -52,23 +52,32 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info(), gemm_info));
+ // Clear state
+ _mtx_a_reshape_kernel = nullptr;
+ _mtx_b_reshape_kernel = nullptr;
+ _asm_glue_signed._optimised_kernel = nullptr;
+ _asm_glue_unsigned._optimised_kernel = nullptr;
+
+ // Set internal variables
_a_offset = a->info()->quantization_info().offset;
_b_offset = b->info()->quantization_info().offset;
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
_reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _is_prepared = false;
+ _original_b = b;
#ifdef __aarch64__
switch(a->info()->data_type())
{
case DataType::S8:
{
- _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed);
+ _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run, _workspace, _B_pretranspose, _memory_group, _asm_glue_signed);
break;
}
case DataType::QASYMM8:
case DataType::U8:
{
- _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, true, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned);
+ _dot_product_path = setup_assembly_kernel(a, b, output, 1.f, 0.f, _reshape_b_only_on_first_run, _workspace, _B_pretranspose, _memory_group, _asm_glue_unsigned);
break;
}
default:
@@ -160,10 +169,13 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
if(!_dot_product_path && !_run_vector_matrix_multiplication)
{
_tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
+ if(!_reshape_b_only_on_first_run)
+ {
+ _tmp_b.allocator()->allocate();
+ }
}
- if(_a_offset != 0)
+ if(_a_offset != 0 && !_reshape_b_only_on_first_run)
{
_vector_sum_col.allocator()->allocate();
}
@@ -248,22 +260,21 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
void NEGEMMLowpMatrixMultiplyCore::run()
{
+ prepare();
+
_memory_group.acquire();
- // Do not reshape if we run the vector-by-matrix case and we do not have the optimized gemm with dot product instruction
- if(!_run_vector_matrix_multiplication && !_dot_product_path)
+ // Reshape inputs
+ if(_mtx_a_reshape_kernel)
{
- if(_mtx_a_reshape_kernel)
- {
- NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
- }
-
- if(_mtx_b_reshape_kernel && (_is_first_run || !_reshape_b_only_on_first_run))
- {
- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
- }
+ NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
+ }
+ if(_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)
+ {
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
}
+ // Run GEMM
if(_asm_glue_unsigned._optimised_kernel != nullptr)
{
_asm_glue_unsigned.run();
@@ -284,7 +295,7 @@ void NEGEMMLowpMatrixMultiplyCore::run()
}
// Run matrix B reduction kernel only if _a_offset is not equal to 0
- if(_a_offset != 0 && (_is_first_run || !_reshape_b_only_on_first_run))
+ if(_a_offset != 0 && !_reshape_b_only_on_first_run)
{
NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
}
@@ -293,6 +304,45 @@ void NEGEMMLowpMatrixMultiplyCore::run()
NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
_memory_group.release();
+}
- _is_first_run = false;
+void NEGEMMLowpMatrixMultiplyCore::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Run assembly reshape
+ if((_asm_glue_unsigned._optimised_kernel || _asm_glue_signed._optimised_kernel) && _reshape_b_only_on_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+ if(_asm_glue_unsigned._optimised_kernel != nullptr)
+ {
+ _asm_glue_unsigned.prepare();
+ }
+ else if(_asm_glue_signed._optimised_kernel != nullptr)
+ {
+ _asm_glue_signed.prepare();
+ }
+ _original_b->mark_as_unused();
+ }
+ // Run non-assembly reshape
+ else if(_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
+
+ // Run the matrix B reshape kernel and mark the original B tensor as unused
+ _tmp_b.allocator()->allocate();
+ NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
+ _original_b->mark_as_unused();
+ }
+
+ // Run the matrix B reduction kernel only if _a_offset is not equal to 0 and B is reshaped only once
+ if(_a_offset != 0 && _reshape_b_only_on_first_run)
+ {
+ _vector_sum_col.allocator()->allocate();
+ NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
+ }
+
+ _is_prepared = true;
+ }
}
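The quantized core moves one more piece of work into prepare(): when the a-offset is non-zero, the per-column sums of B needed for the offset contribution depend only on B, so they can be computed once next to the B reshape. A standalone sketch of the idea, with plain std::vector containers standing in for the library tensors and a hypothetical LowpGemmLike type:

#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical sketch: per-column sums of a constant matrix B are computed once in prepare().
struct LowpGemmLike
{
    void configure(std::vector<std::vector<std::uint8_t>> b, std::int32_t a_offset)
    {
        _b           = std::move(b);
        _a_offset    = a_offset;
        _is_prepared = false;
    }
    void prepare()
    {
        if(!_is_prepared)
        {
            if(_a_offset != 0 && !_b.empty())
            {
                _vector_sum_col.assign(_b[0].size(), 0);
                for(const auto &row : _b)
                {
                    for(std::size_t j = 0; j < row.size(); ++j)
                    {
                        _vector_sum_col[j] += row[j]; // reused by every subsequent run()
                    }
                }
            }
            _is_prepared = true;
        }
    }
    std::vector<std::vector<std::uint8_t>> _b;
    std::vector<std::int32_t>              _vector_sum_col;
    std::int32_t                           _a_offset{ 0 };
    bool                                   _is_prepared{ false };
};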
diff --git a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
index 913acf86a2..0737bd2f73 100644
--- a/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NELocallyConnectedLayer.cpp
@@ -73,7 +73,7 @@ void calculate_shapes(const ITensorInfo *input, const ITensorInfo *weights, cons
NELocallyConnectedLayer::NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _input_im2col_kernel(), _weights_reshape_kernel(), _mm_kernel(), _output_col2im_kernel(), _input_im2col_reshaped(), _weights_reshaped(), _gemm_output(),
- _is_first_run(false), _original_weights(nullptr)
+ _is_prepared(false), _original_weights(nullptr)
{
}
@@ -127,7 +127,7 @@ void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *wei
ARM_COMPUTE_ERROR_THROW_ON(NELocallyConnectedLayer::validate(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), conv_info));
bool _has_bias = (biases != nullptr);
- _is_first_run = true;
+ _is_prepared = false;
_original_weights = weights;
const unsigned int kernel_width = weights->info()->dimension(0);
@@ -160,24 +160,13 @@ void NELocallyConnectedLayer::configure(const ITensor *input, const ITensor *wei
_output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
// Allocate intermediate tensors
- _weights_reshaped.allocator()->allocate();
_input_im2col_reshaped.allocator()->allocate();
_gemm_output.allocator()->allocate();
}
void NELocallyConnectedLayer::run()
{
- // Run weights reshaping (Runs once for every configure)
- if(_is_first_run)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- _is_first_run = false;
- NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
-
- // Mark original weights tensor as unused
- _original_weights->mark_as_unused();
- }
+ prepare();
_memory_group.acquire();
@@ -192,3 +181,18 @@ void NELocallyConnectedLayer::run()
_memory_group.release();
}
+
+void NELocallyConnectedLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights reshaping and mark original weights tensor as unused
+ _weights_reshaped.allocator()->allocate();
+ NEScheduler::get().schedule(&_weights_reshape_kernel, 3);
+ _original_weights->mark_as_unused();
+
+ _is_prepared = true;
+ }
+}
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index d6bc5cfd9a..d9f6c0e0f8 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -113,7 +113,7 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz
NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)), _arm_gemm(nullptr), _gemm_kernel(nullptr), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr),
_activationlayer_function(), _permute_input(), _permute_weights(), _permute_output(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(), _weights_hwio(),
- _workspace(), _input(), _weights(), _output(), _reshaped_kernel(false), _is_activationlayer_enabled(false)
+ _workspace(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false)
{
} /* arm_compute */
@@ -139,9 +139,10 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
}
- _weights = weights;
- _input = input;
- _output = output;
+ _weights = weights;
+ _input = input;
+ _output = output;
+ _is_prepared = false;
std::unique_ptr<INEWinogradLayerTransformInputKernel<float>> transform_input_kernel;
std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
@@ -198,10 +199,13 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
const Tensor4DShape in_shape(internal_get_input_shape(input));
const size_t data_type_size = input->info()->element_size();
// Get the memory required to instantiate a new Winograd operator.
- constexpr size_t storage_alignment = 64;
- const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
+ constexpr size_t storage_alignment = 64;
+
+ // Kernel Storage
+ const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;
_kernel_storage.allocator()->init(TensorInfo(TensorShape{ (kernel_storage_size + storage_alignment - 1) }, 1, DataType::U8));
_kernel_storage.allocator()->allocate();
+
// Input storage
const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;
_input_workspace.allocator()->init(TensorInfo(TensorShape{ (input_storage_size + storage_alignment - 1) }, 1, DataType::U8));
@@ -331,13 +335,9 @@ void NEWinogradConvolutionLayer::run()
{
const DataLayout data_layout = _input->info()->data_layout();
+ prepare();
+
_memory_group.acquire();
- if(!_reshaped_kernel)
- {
- _reshaped_kernel = true;
- _permute_weights.run();
- NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
- }
if(data_layout == DataLayout::NCHW)
{
@@ -491,4 +491,20 @@ Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen
return Status{};
}
+void NEWinogradConvolutionLayer::prepare()
+{
+ if(!_is_prepared)
+ {
+ // Permute weights
+ _permute_weights.run();
+ _weights->mark_as_unused();
+
+ // Transform weights
+ NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
+ _weights_hwio.allocator()->free();
+
+ _is_prepared = true;
+ }
+}
+
} // namespace arm_compute
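From the caller's side nothing changes functionally: run() still works on its own because it calls prepare() internally, but a framework with a distinct warm-up phase can now call prepare() explicitly so the first inference does not pay for the weight transforms. A hedged usage sketch against a hypothetical PreparedFunction stand-in (not an arm_compute type):

#include <cstdio>

// Hypothetical stand-in for a configured function that follows the new prepare()/run() contract.
struct PreparedFunction
{
    void prepare()
    {
        if(!_is_prepared)
        {
            std::puts("one-time weight transforms"); // e.g. permute + Winograd weight transform
            _is_prepared = true;
        }
    }
    void run()
    {
        prepare(); // becomes a no-op after the first call
        std::puts("inference pass");
    }
    bool _is_prepared{ false };
};

int main()
{
    PreparedFunction f;
    f.prepare(); // optional warm-up: pay the one-time cost up front
    f.run();     // first inference no longer includes the weight transforms
    f.run();     // steady state
}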