29 #include <arm_compute/core/CL/CLKernelLibrary.h> 30 #include <arm_compute/runtime/CL/CLBufferAllocator.h> 31 #include <arm_compute/runtime/CL/CLScheduler.h> 48 std::string& outReasonIfUnsupported)
55 std::string& outReasonIfUnsupported,
71 if (modelOptions->SaveCachedNetwork())
75 auto cachedFd = modelOptions->GetCachedFileDescriptor();
78 std::vector<uint8_t> compiledContextData;
79 std::stringstream stream;
83 std::string
const serializedString{stream.str()};
84 std::copy(serializedString.begin(),
85 serializedString.end(),
86 std::back_inserter(compiledContextData));
87 auto success = write(cachedFd, compiledContextData.data(), compiledContextData.size());
90 ARMNN_LOG(
info) <<
"ClWorkloadFactory:: Could not cache the compiled context!";
96 auto filePath = modelOptions->GetCachedNetworkFilePath();
97 if (filePath !=
"" && fs::exists(filePath) && fs::is_regular_file(filePath))
100 std::ofstream file(filePath, std::ios::out | std::ios::binary);
108 std::unique_ptr<IWorkload> ClWorkloadFactory::MakeWorkload(
const QueueDescriptorType& descriptor,
114 return MakeWorkloadHelper<FloatWorkload, Uint8Workload>(descriptor,
info, std::forward<Args>(args)...);
122 template <
typename Workload,
typename QueueDescriptorType,
typename... Args>
123 std::unique_ptr<IWorkload> ClWorkloadFactory::MakeWorkload(
const QueueDescriptorType& descriptor,
129 return std::make_unique<Workload>(descriptor,
info, std::forward<Args>(args)...);
137 void ClWorkloadFactory::InitializeCLCompileContext()
140 auto context = arm_compute::CLKernelLibrary::get().context();
141 auto device = arm_compute::CLKernelLibrary::get().get_device();
142 m_CLCompileContext = arm_compute::CLCompileContext(context, device);
144 if (m_ModelContextPtr)
149 if (!(modelOptions->SaveCachedNetwork()))
152 auto cachedFd = modelOptions->GetCachedFileDescriptor();
155 struct stat statBuffer;
156 if (fstat(cachedFd, &statBuffer) == 0)
158 long dataSize =
static_cast<long>(statBuffer.st_size);
161 auto offset = lseek(cachedFd, 0, SEEK_CUR);
164 std::vector <uint8_t> compiledContextData(static_cast<unsigned int>(dataSize));
165 auto success = pread(cachedFd, compiledContextData.data(), compiledContextData.size(), 0);
171 compiledContextData);
179 if (filePath !=
"" && fs::exists(filePath) && fs::is_regular_file(filePath))
182 deserializer.
Deserialize(m_CLCompileContext, context, device, filePath);
189 : m_MemoryManager(memoryManager), m_ModelContextPtr(
IBackendInternal::IBackendSpecificModelContextPtr{})
191 InitializeCLCompileContext();
196 : m_MemoryManager(memoryManager), m_ModelContextPtr(modelContextPtr)
198 InitializeCLCompileContext();
202 const bool IsMemoryManaged)
const 205 std::unique_ptr<ClTensorHandle> tensorHandle = std::make_unique<ClTensorHandle>(tensorInfo);
206 tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
213 const bool IsMemoryManaged)
const 216 std::unique_ptr<ClTensorHandle> tensorHandle = std::make_unique<ClTensorHandle>(tensorInfo, dataLayout);
217 tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
224 unsigned int const* subTensorOrigin)
const 227 arm_compute::TensorShape shape = armcomputetensorutils::BuildArmComputeTensorShape(subTensorShape);
234 coords.set(i, armnn::numeric_cast<int>(subTensorOrigin[revertedIndex]));
237 const arm_compute::TensorShape parentShape = armcomputetensorutils::BuildArmComputeTensorShape(parent.
GetShape());
238 if (!::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, parentShape, coords, shape))
243 return std::make_unique<ClSubTensorHandle>(
244 PolymorphicDowncast<IClTensorHandle*>(&parent), shape, coords);
250 return MakeWorkload<ClActivationWorkload>(descriptor,
info, m_CLCompileContext);
256 return MakeWorkload<ClAdditionWorkload>(descriptor,
info, m_CLCompileContext);
262 return std::make_unique<ClArgMinMaxWorkload>(descriptor,
info, m_CLCompileContext);
269 return MakeWorkload<ClBatchNormalizationFloatWorkload, NullWorkload>(descriptor,
info, m_CLCompileContext);
275 return MakeWorkload<ClBatchToSpaceNdWorkload>(descriptor,
info, m_CLCompileContext);
281 return MakeWorkload<ClCastWorkload>(descriptor,
info, m_CLCompileContext);
287 return MakeWorkload<ClChannelShuffleWorkload>(descriptor,
info, m_CLCompileContext);
293 return MakeWorkload<ClComparisonWorkload>(descriptor,
info, m_CLCompileContext);
299 return MakeWorkload<ClConcatWorkload>(descriptor,
info, m_CLCompileContext);
305 return MakeWorkload<ClConstantWorkload>(descriptor,
info, m_CLCompileContext);
312 return MakeWorkload<ClConvertFp16ToFp32Workload>(descriptor,
info, m_CLCompileContext);
319 return MakeWorkload<ClConvertFp32ToFp16Workload>(descriptor,
info, m_CLCompileContext);
325 bool isFastMathEnabled =
false;
326 if (m_ModelContextPtr)
328 if (m_ModelContextPtr.get() !=
nullptr)
337 return MakeWorkload<ClConvolution2dWorkload>(descriptor,
339 m_MemoryManager->GetIntraLayerManager(),
347 bool isFastMathEnabled =
false;
348 if (m_ModelContextPtr)
350 if (m_ModelContextPtr.get() !=
nullptr)
359 return MakeWorkload<ClConvolution3dWorkload>(descriptor,
361 m_MemoryManager->GetIntraLayerManager(),
369 return MakeWorkload<NullWorkload, NullWorkload>(descriptor,
info, m_CLCompileContext);
375 return MakeWorkload<ClDepthToSpaceWorkload>(descriptor,
info, m_CLCompileContext);
382 return MakeWorkload<ClDepthwiseConvolutionWorkload>(descriptor,
info, m_CLCompileContext);
388 return MakeWorkload<ClDequantizeWorkload>(descriptor,
info, m_CLCompileContext);
395 return MakeWorkload<NullWorkload, NullWorkload>(descriptor,
info, m_CLCompileContext);
401 return std::make_unique<ClDivisionWorkload>(descriptor,
info, m_CLCompileContext);
415 return std::make_unique<ClAbsWorkload>(absQueueDescriptor,
info, m_CLCompileContext);
418 return std::make_unique<ClExpWorkload>(descriptor,
info, m_CLCompileContext);
420 return std::make_unique<ClLogWorkload>(descriptor,
info, m_CLCompileContext);
422 return std::make_unique<ClLogicalNotWorkload>(descriptor,
info, m_CLCompileContext);
424 return std::make_unique<ClNegWorkload>(descriptor,
info, m_CLCompileContext);
428 rsqrtQueueDescriptor.
m_Inputs = descriptor.m_Inputs;
429 rsqrtQueueDescriptor.
m_Outputs = descriptor.m_Outputs;
431 return std::make_unique<ClRsqrtWorkload>(rsqrtQueueDescriptor,
info, m_CLCompileContext);
434 return std::make_unique<ClSinWorkload>(descriptor,
info, m_CLCompileContext);
443 return std::make_unique<ClFillWorkload>(descriptor,
info, m_CLCompileContext);
449 return MakeWorkload<ClFloorFloatWorkload, NullWorkload>(descriptor,
info, m_CLCompileContext);
455 return MakeWorkload<ClFullyConnectedWorkload>(descriptor,
457 m_MemoryManager->GetIntraLayerManager(),
464 return MakeWorkload<ClGatherWorkload>(descriptor,
info, m_CLCompileContext);
470 return std::make_unique<CopyMemGenericWorkload>(descriptor,
info);
477 return MakeWorkload<ClInstanceNormalizationWorkload>(descriptor,
info, m_CLCompileContext);
483 return MakeWorkload<ClL2NormalizationFloatWorkload, NullWorkload>(descriptor,
info, m_CLCompileContext);
492 return std::make_unique<ClLogicalAndWorkload>(descriptor,
info, m_CLCompileContext);
494 return std::make_unique<ClLogicalOrWorkload>(descriptor,
info, m_CLCompileContext);
503 return MakeWorkload<ClLogSoftmaxWorkload>(descriptor,
505 m_MemoryManager->GetIntraLayerManager(),
512 return MakeWorkload<ClLstmFloatWorkload, NullWorkload>(descriptor,
info, m_CLCompileContext);
518 return MakeWorkload<ClMaximumWorkload>(descriptor,
info, m_CLCompileContext);
524 return MakeWorkload<ClMeanWorkload>(descriptor,
info, m_CLCompileContext);
535 return MakeWorkload<CopyMemGenericWorkload>(descriptor,
info);
546 return std::make_unique<ImportMemGenericWorkload>(descriptor,
info);
552 return MakeWorkload<ClMinimumWorkload>(descriptor,
info, m_CLCompileContext);
558 return MakeWorkload<ClMultiplicationWorkload>(descriptor,
info, m_CLCompileContext);
564 return MakeWorkload<ClNormalizationFloatWorkload, NullWorkload>(descriptor,
info, m_CLCompileContext);
570 return std::make_unique<CopyMemGenericWorkload>(descriptor,
info);
576 return MakeWorkload<ClPadWorkload>(descriptor,
info, m_CLCompileContext);
582 return MakeWorkload<ClPermuteWorkload>(descriptor,
info, m_CLCompileContext);
588 return MakeWorkload<ClPooling2dWorkload>(descriptor,
info, m_CLCompileContext);
594 return MakeWorkload<NullWorkload, NullWorkload>(descriptor,
info, m_CLCompileContext);
600 return MakeWorkload<ClPreluWorkload>(descriptor,
info, m_CLCompileContext);
606 return std::make_unique<ClQLstmWorkload>(descriptor,
info, m_CLCompileContext);
612 return MakeWorkload<ClQuantizeWorkload>(descriptor,
info, m_CLCompileContext);
618 return MakeWorkload<ClQuantizedLstmWorkload>(descriptor,
info, m_CLCompileContext);
624 return std::make_unique<ClRankWorkload>(descriptor,
info);
630 return std::make_unique<ClReduceWorkload>(descriptor,
info);
636 return MakeWorkload<ClReshapeWorkload>(descriptor,
info, m_CLCompileContext);
642 return MakeWorkload<ClResizeWorkload>(descriptor,
info, m_CLCompileContext);
648 return MakeWorkload<ClSliceWorkload>(descriptor,
info, m_CLCompileContext);
654 return std::make_unique<ClSoftmaxWorkload>(descriptor,
656 m_MemoryManager->GetIntraLayerManager(),
663 return MakeWorkload<ClSpaceToBatchNdWorkload>(descriptor,
info, m_CLCompileContext);
669 return MakeWorkload<ClSpaceToDepthWorkload>(descriptor,
info, m_CLCompileContext);
675 return MakeWorkload<ClSplitterWorkload>(descriptor,
info, m_CLCompileContext);
681 return MakeWorkload<ClStackWorkload>(descriptor,
info, m_CLCompileContext);
687 return MakeWorkload<ClStridedSliceWorkload>(descriptor,
info, m_CLCompileContext);
693 return MakeWorkload<ClSubtractionWorkload>(descriptor,
info, m_CLCompileContext);
699 return MakeWorkload<ClTransposeWorkload>(descriptor,
info, m_CLCompileContext);
706 return MakeWorkload<ClTransposeConvolution2dWorkload>(descriptor,
708 m_MemoryManager->GetIntraLayerManager(),
std::unique_ptr< IWorkload > CreateDetectionPostProcess(const DetectionPostProcessQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< ITensorHandle > CreateSubTensorHandle(ITensorHandle &parent, TensorShape const &subTensorShape, unsigned int const *subTensorOrigin) const override
std::unique_ptr< IWorkload > CreateComparison(const ComparisonQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateGather(const GatherQueueDescriptor &descriptor, const WorkloadInfo &info) const override
ClWorkloadFactory(const std::shared_ptr< ClMemoryManager > &memoryManager)
UnaryOperation m_Operation
Specifies the elementwiseUnary operation to execute.
Interface for a layer that is connectable to other layers via InputSlots and OutputSlots.
std::unique_ptr< IWorkload > CreateDebug(const DebugQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateReshape(const ReshapeQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateConvolution3d(const Convolution3dQueueDescriptor &descriptor, const WorkloadInfo &info) const override
void AfterWorkloadsCreated() override
std::unique_ptr< IWorkload > CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateSpaceToBatchNd(const SpaceToBatchNdQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateDivision(const DivisionQueueDescriptor &descriptor, const WorkloadInfo &info) const override
constexpr const char * ClBackendId()
std::vector< BackendOptions > ModelOptions
std::unique_ptr< IWorkload > CreateStridedSlice(const StridedSliceQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateQuantize(const QuantizeQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::array< unsigned int, MaxNumOfTensorDimensions > Coordinates
std::unique_ptr< IWorkload > CreateInput(const InputQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateStack(const StackQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateFullyConnected(const FullyConnectedQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateSlice(const SliceQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< ITensorHandle > CreateTensorHandle(const TensorInfo &tensorInfo, const bool IsMemoryManaged=true) const override
std::unique_ptr< IWorkload > CreateQuantizedLstm(const QuantizedLstmQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateRank(const RankQueueDescriptor &descriptor, const WorkloadInfo &info) const override
const BackendId & GetBackendId() const override
#define ARMNN_LOG(severity)
bool SaveSerializedToStream(std::ostream &stream)
Serializes the ClContext to the stream.
Copyright (c) 2021 ARM Limited and Contributors.
void IgnoreUnused(Ts &&...)
TypedWorkload< QueueDescriptor, armnn::DataType::Float16, armnn::DataType::Float32 > FloatWorkload
std::unique_ptr< IWorkload > CreateLstm(const LstmQueueDescriptor &descriptor, const WorkloadInfo &info) const override
LayerDescriptor m_Parameters
LogicalBinaryOperation m_Operation
Specifies the logical operation to execute.
void DeserializeFromBinary(arm_compute::CLCompileContext &clCompileContext, cl::Context &context, cl::Device &device, const std::vector< uint8_t > &binaryContent)
Deserializes the CLCompileContext built-in programs from binary file contents.
std::unique_ptr< IWorkload > CreateLogicalBinary(const LogicalBinaryQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateLogSoftmax(const LogSoftmaxQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateMean(const MeanQueueDescriptor &descriptor, const WorkloadInfo &Info) const override
std::unique_ptr< IWorkload > CreateReduce(const ReduceQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateMinimum(const MinimumQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateTransposeConvolution2d(const TransposeConvolution2dQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateFill(const FillQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateCast(const CastQueueDescriptor &descriptor, const WorkloadInfo &info) const override
static bool IsLayerSupported(const Layer &layer, Optional< DataType > dataType, std::string &outReasonIfUnsupported)
std::unique_ptr< IWorkload > CreateQLstm(const QLstmQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateDepthwiseConvolution2d(const DepthwiseConvolution2dQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateDepthToSpace(const DepthToSpaceQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateChannelShuffle(const ChannelShuffleQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::shared_ptr< IBackendModelContext > IBackendSpecificModelContextPtr
std::unique_ptr< IWorkload > CreateResize(const ResizeQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreatePermute(const PermuteQueueDescriptor &descriptor, const WorkloadInfo &info) const override
void Serialize(const arm_compute::CLCompileContext &clCompileContext)
Serializes the CLCompileContext built-in programs.
std::unique_ptr< IWorkload > CreateMemCopy(const MemCopyQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateBatchToSpaceNd(const BatchToSpaceNdQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateSoftmax(const SoftmaxQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateMaximum(const MaximumQueueDescriptor &descriptor, const WorkloadInfo &info) const override
RuntimeException WrapClError(const cl::Error &clError, const CheckLocation &location)
std::unique_ptr< IWorkload > CreateElementwiseUnary(const ElementwiseUnaryQueueDescriptor &descriptor, const WorkloadInfo &info) const override
bool IsFastMathEnabled() const
std::unique_ptr< IWorkload > CreateFloor(const FloorQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreatePreCompiled(const PreCompiledQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateSubtraction(const SubtractionQueueDescriptor &descriptor, const WorkloadInfo &info) const override
static bool IsLayerSupported(const BackendId &backendId, const IConnectableLayer &layer, Optional< DataType > dataType, std::string &outReasonIfUnsupported)
std::unique_ptr< IWorkload > CreateMultiplication(const MultiplicationQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateAddition(const AdditionQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreatePooling2d(const Pooling2dQueueDescriptor &descriptor, const WorkloadInfo &info) const override
virtual TensorShape GetShape() const =0
Get the number of elements for each dimension ordered from slowest iterating dimension to fastest iterating dimension.
void Deserialize(arm_compute::CLCompileContext &clCompileContext, cl::Context &context, cl::Device &device, const std::string &filePath)
Deserializes the CLCompileContext built-in programs from a binary file.
TypedWorkload< QueueDescriptor, armnn::DataType::QAsymmU8 > Uint8Workload
std::unique_ptr< IWorkload > CreateSplitter(const SplitterQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateL2Normalization(const L2NormalizationQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateActivation(const ActivationQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::vector< ITensorHandle * > m_Outputs
std::unique_ptr< IWorkload > CreatePad(const PadQueueDescriptor &descriptor, const WorkloadInfo &info) const override
unsigned int GetNumDimensions() const
Function that returns the tensor rank.
std::unique_ptr< IWorkload > CreateSpaceToDepth(const SpaceToDepthQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateNormalization(const NormalizationQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateInstanceNormalization(const InstanceNormalizationQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateTranspose(const TransposeQueueDescriptor &descriptor, const WorkloadInfo &info) const override
Contains information about TensorInfos of a layer.
std::vector< ITensorHandle * > m_Inputs
std::unique_ptr< IWorkload > CreateBatchNormalization(const BatchNormalizationQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateConcat(const ConcatQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateMemImport(const MemImportQueueDescriptor &descriptor, const WorkloadInfo &info) const override
The ClBackendModelContext is used to pass in CL specific backend ModelOptions.
std::unique_ptr< IWorkload > CreatePrelu(const PreluQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateDequantize(const DequantizeQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateConstant(const ConstantQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateConvolution2d(const Convolution2dQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::unique_ptr< IWorkload > CreateArgMinMax(const ArgMinMaxQueueDescriptor &descriptor, const WorkloadInfo &info) const override
std::string GetCachedNetworkFilePath() const
Depthwise Convolution 2D layer workload data.
std::unique_ptr< IWorkload > CreateOutput(const OutputQueueDescriptor &descriptor, const WorkloadInfo &info) const override