29 #include <arm_compute/core/CL/CLKernelLibrary.h>
30 #include <arm_compute/runtime/CL/CLBufferAllocator.h>
31 #include <arm_compute/runtime/CL/CLScheduler.h>
48 std::string& outReasonIfUnsupported)
55 std::string& outReasonIfUnsupported,
71 if (modelOptions->SaveCachedNetwork())
75 auto cachedFd = modelOptions->GetCachedFileDescriptor();
78 std::vector<uint8_t> compiledContextData;
79 std::stringstream stream;
80 bool serialized =
serializer.SaveSerializedToStream(stream);
83 std::string
const serializedString{stream.str()};
84 std::copy(serializedString.begin(),
85 serializedString.end(),
86 std::back_inserter(compiledContextData));
87 auto success = write(cachedFd, compiledContextData.data(), compiledContextData.size());
90 ARMNN_LOG(
info) <<
"ClWorkloadFactory:: Could not cache the compiled context!";
96 auto filePath = modelOptions->GetCachedNetworkFilePath();
97 if (filePath !=
"" && fs::exists(filePath) && fs::is_regular_file(filePath))
100 std::ofstream file(filePath, std::ios::out | std::ios::binary);
// MakeWorkload overload parameterised on a FloatWorkload / Uint8Workload pair.
// Takes the queue descriptor by const reference and returns an owning
// std::unique_ptr<IWorkload>.
108 std::unique_ptr<IWorkload> ClWorkloadFactory::MakeWorkload(
const QueueDescriptorType& descriptor,
// Construction is delegated to MakeWorkloadHelper, instantiated with both
// workload types; any extra constructor arguments are perfect-forwarded.
// NOTE(review): presumably the helper picks one of the two types from the
// tensor data type in `info` - confirm in MakeWorkloadHelper.
114 return MakeWorkloadHelper<FloatWorkload, Uint8Workload>(descriptor, info, std::forward<Args>(args)...);
// Handler for OpenCL exceptions (cl::Error), caught by const reference.
// NOTE(review): the handler body is not visible here; presumably it guards
// the helper call above - confirm.
116 catch (
const cl::Error& clError)
// MakeWorkload overload for a single concrete workload type.
//   Workload            - concrete workload class to instantiate.
//   QueueDescriptorType - type of the queue descriptor passed to its constructor.
//   Args                - types of any additional constructor arguments.
122 template <
typename Workload,
typename QueueDescriptorType,
typename... Args>
123 std::unique_ptr<IWorkload> ClWorkloadFactory::MakeWorkload(
const QueueDescriptorType& descriptor,
124 const WorkloadInfo& info,
// Builds the workload directly with std::make_unique, handing it the
// descriptor, the workload info and the perfect-forwarded extra arguments.
129 return std::make_unique<Workload>(descriptor, info, std::forward<Args>(args)...);
// Handler for OpenCL exceptions (cl::Error), caught by const reference.
// NOTE(review): the handler body is not visible here; presumably it guards
// the construction above - confirm.
131 catch (
const cl::Error& clError)
// Sets up m_CLCompileContext for this factory.
// A fresh arm_compute::CLCompileContext is built from the context and device
// reported by arm_compute::CLKernelLibrary. When a model context is attached
// and its SaveCachedNetwork() option is false, a ClContextDeserializer is then
// used to load previously cached data into m_CLCompileContext, reading either
// from the cached file descriptor or from the cached network file path.
137 void ClWorkloadFactory::InitializeCLCompileContext()
// Context and device come from the CLKernelLibrary singleton accessor.
140 auto context = arm_compute::CLKernelLibrary::get().context();
141 auto device = arm_compute::CLKernelLibrary::get().get_device();
142 m_CLCompileContext = arm_compute::CLCompileContext(context, device);
// Everything below only applies when a backend-specific model context exists.
144 if (m_ModelContextPtr)
// NOTE(review): the dynamic_cast result is dereferenced by the very next
// statement with no null check; a model context of a different dynamic type
// would yield nullptr here - confirm callers only ever pass ClBackendModelContext.
147 auto modelOptions =
dynamic_cast<ClBackendModelContext*
>(m_ModelContextPtr.get());
148 auto filePath = modelOptions->GetCachedNetworkFilePath();
// Loading is attempted only when the options do NOT ask to save the network.
149 if (!(modelOptions->SaveCachedNetwork()))
151 ClContextDeserializer deserializer;
152 auto cachedFd = modelOptions->GetCachedFileDescriptor();
// fstat() returning 0 means the descriptor could be queried; st_size then
// gives the number of bytes to read.
155 struct stat statBuffer;
156 if (fstat(cachedFd, &statBuffer) == 0)
158 long dataSize =
static_cast<long>(statBuffer.st_size);
// lseek(fd, 0, SEEK_CUR) reports the current file position without moving it.
// NOTE(review): how `offset` is used afterwards is not visible here.
161 auto offset = lseek(cachedFd, 0, SEEK_CUR);
// Buffer sized to the whole file.
// NOTE(review): the size is narrowed from long to unsigned int - confirm the
// cached blob can never be large enough (or dataSize negative) for that to matter.
164 std::vector <uint8_t> compiledContextData(
static_cast<unsigned int>(dataSize));
// pread() reads from absolute offset 0 and leaves the descriptor's file
// position untouched, so the read does not depend on the position queried above.
// NOTE(review): `success` holds the byte count (or -1); its check is not
// visible here - confirm short reads are handled.
165 auto success = pread(cachedFd, compiledContextData.data(), compiledContextData.size(), 0);
// Feed the raw bytes read from the descriptor to the deserializer.
168 deserializer.DeserializeFromBinary(m_CLCompileContext,
171 compiledContextData);
// Path-based load: requires a non-empty path naming an existing regular file.
179 if (filePath !=
"" && fs::exists(filePath) && fs::is_regular_file(filePath))
182 deserializer.Deserialize(m_CLCompileContext, context, device, filePath);
189 : m_MemoryManager(memoryManager), m_ModelContextPtr(
IBackendInternal::IBackendSpecificModelContextPtr{})
191 InitializeCLCompileContext();
196 : m_MemoryManager(memoryManager), m_ModelContextPtr(modelContextPtr)
198 InitializeCLCompileContext();
202 const bool IsMemoryManaged)
const
205 std::unique_ptr<ClTensorHandle> tensorHandle = std::make_unique<ClTensorHandle>(tensorInfo);
206 tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
213 const bool IsMemoryManaged)
const
216 std::unique_ptr<ClTensorHandle> tensorHandle = std::make_unique<ClTensorHandle>(tensorInfo, dataLayout);
217 tensorHandle->SetMemoryGroup(m_MemoryManager->GetInterLayerMemoryGroup());
224 unsigned int const* subTensorOrigin)
const
227 arm_compute::TensorShape shape = armcomputetensorutils::BuildArmComputeTensorShape(subTensorShape);
234 coords.set(i, armnn::numeric_cast<int>(subTensorOrigin[revertedIndex]));
237 const arm_compute::TensorShape parentShape = armcomputetensorutils::BuildArmComputeTensorShape(parent.
GetShape());
238 if (!::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, parentShape, coords, shape))
243 return std::make_unique<ClSubTensorHandle>(
244 PolymorphicDowncast<IClTensorHandle*>(&parent), shape, coords);
255 auto activationQueueDescriptor = PolymorphicDowncast<const ActivationQueueDescriptor*>(&descriptor);
256 return MakeWorkload<ClActivationWorkload>(*activationQueueDescriptor,
info, m_CLCompileContext);
260 auto additionQueueDescriptor = PolymorphicDowncast<const AdditionQueueDescriptor*>(&descriptor);
261 return MakeWorkload<ClAdditionWorkload>(*additionQueueDescriptor,
info, m_CLCompileContext);
265 auto argMinMaxQueueDescriptor = PolymorphicDowncast<const ArgMinMaxQueueDescriptor*>(&descriptor);
266 return MakeWorkload<ClArgMinMaxWorkload>(*argMinMaxQueueDescriptor,
info, m_CLCompileContext);
270 auto batchMatMulQueueDescriptor = PolymorphicDowncast<const BatchMatMulQueueDescriptor*>(&descriptor);
271 return std::make_unique<ClBatchMatMulWorkload>(*batchMatMulQueueDescriptor,
info, m_CLCompileContext);
275 auto batchNormalizationQueueDescriptor
276 = PolymorphicDowncast<const BatchNormalizationQueueDescriptor*>(&descriptor);
277 return MakeWorkload<ClBatchNormalizationFloatWorkload, NullWorkload>
278 (*batchNormalizationQueueDescriptor,
info, m_CLCompileContext);
282 auto batchToSpaceNdQueueDescriptor
283 = PolymorphicDowncast<const BatchToSpaceNdQueueDescriptor*>(&descriptor);
284 return MakeWorkload<ClBatchToSpaceNdWorkload>(*batchToSpaceNdQueueDescriptor,
info, m_CLCompileContext);
288 auto castQueueDescriptor = PolymorphicDowncast<const CastQueueDescriptor*>(&descriptor);
289 return MakeWorkload<ClCastWorkload>(*castQueueDescriptor,
info, m_CLCompileContext);
293 auto channelShuffleQueueDescriptor
294 = PolymorphicDowncast<const ChannelShuffleQueueDescriptor*>(&descriptor);
295 return MakeWorkload<ClChannelShuffleWorkload>(*channelShuffleQueueDescriptor,
info, m_CLCompileContext);
299 auto comparisonQueueDescriptor = PolymorphicDowncast<const ComparisonQueueDescriptor*>(&descriptor);
300 return MakeWorkload<ClComparisonWorkload>(*comparisonQueueDescriptor,
info, m_CLCompileContext);
304 auto concatQueueDescriptor = PolymorphicDowncast<const ConcatQueueDescriptor*>(&descriptor);
305 return MakeWorkload<ClConcatWorkload>(*concatQueueDescriptor,
info, m_CLCompileContext);
309 auto constantQueueDescriptor = PolymorphicDowncast<const ConstantQueueDescriptor*>(&descriptor);
310 return MakeWorkload<ClConstantWorkload>(*constantQueueDescriptor,
info, m_CLCompileContext);
314 auto convertFp16ToFp32QueueDescriptor
315 = PolymorphicDowncast<const ConvertFp16ToFp32QueueDescriptor*>(&descriptor);
316 return MakeWorkload<ClConvertFp16ToFp32Workload>(*convertFp16ToFp32QueueDescriptor,
322 auto convertFp32ToFp16QueueDescriptor
323 = PolymorphicDowncast<const ConvertFp32ToFp16QueueDescriptor*>(&descriptor);
324 return MakeWorkload<ClConvertFp32ToFp16Workload>(*convertFp32ToFp16QueueDescriptor,
330 auto convolution2dQueueDescriptor = PolymorphicDowncast<const Convolution2dQueueDescriptor*>(&descriptor);
332 bool isFastMathEnabled =
false;
333 if (m_ModelContextPtr)
335 if (m_ModelContextPtr.get() !=
nullptr)
344 return MakeWorkload<ClConvolution2dWorkload>(*convolution2dQueueDescriptor,
346 m_MemoryManager->GetIntraLayerManager(),
352 auto convolution3dQueueDescriptor = PolymorphicDowncast<const Convolution3dQueueDescriptor*>(&descriptor);
354 bool isFastMathEnabled =
false;
355 if (m_ModelContextPtr)
357 if (m_ModelContextPtr.get() !=
nullptr)
366 return MakeWorkload<ClConvolution3dWorkload>(*convolution3dQueueDescriptor,
368 m_MemoryManager->GetIntraLayerManager(),
374 auto debugQueueDescriptor = PolymorphicDowncast<const DebugQueueDescriptor*>(&descriptor);
375 return MakeWorkload<NullWorkload, NullWorkload>(*debugQueueDescriptor,
info, m_CLCompileContext);
379 auto depthToSpaceQueueDescriptor = PolymorphicDowncast<const DepthToSpaceQueueDescriptor*>(&descriptor);
380 return MakeWorkload<ClDepthToSpaceWorkload>(*depthToSpaceQueueDescriptor,
info, m_CLCompileContext);
384 auto depthwiseConvolution2dQueueDescriptor
385 = PolymorphicDowncast<const DepthwiseConvolution2dQueueDescriptor*>(&descriptor);
386 return MakeWorkload<ClDepthwiseConvolutionWorkload>(*depthwiseConvolution2dQueueDescriptor,
392 auto dequantizeQueueDescriptor = PolymorphicDowncast<const DequantizeQueueDescriptor*>(&descriptor);
393 return MakeWorkload<ClDequantizeWorkload>(*dequantizeQueueDescriptor,
info, m_CLCompileContext);
397 auto detectionPostProcessQueueDescriptor
398 = PolymorphicDowncast<const DetectionPostProcessQueueDescriptor*>(&descriptor);
399 return MakeWorkload<NullWorkload, NullWorkload>(*detectionPostProcessQueueDescriptor,
405 auto divisionQueueDescriptor = PolymorphicDowncast<const DivisionQueueDescriptor*>(&descriptor);
406 return std::make_unique<ClDivisionWorkload>(*divisionQueueDescriptor,
info, m_CLCompileContext);
410 auto elementwiseBinaryQueueDescriptor
411 = PolymorphicDowncast<const ElementwiseBinaryQueueDescriptor*>(&descriptor);
413 switch (elementwiseBinaryQueueDescriptor->m_Parameters.m_Operation)
421 elementwiseBinaryQueueDescriptor->m_AdditionalInfoObject;
422 return std::make_unique<ClAdditionWorkload>(additionQueueDescriptor,
info, m_CLCompileContext);
430 elementwiseBinaryQueueDescriptor->m_AdditionalInfoObject;
431 return std::make_unique<ClDivisionWorkload>(divisionQueueDescriptor,
info, m_CLCompileContext);
439 elementwiseBinaryQueueDescriptor->m_AdditionalInfoObject;
440 return std::make_unique<ClMaximumWorkload>(maximumQueueDescriptor,
info, m_CLCompileContext);
448 elementwiseBinaryQueueDescriptor->m_AdditionalInfoObject;
449 return std::make_unique<ClMinimumWorkload>(minimumQueueDescriptor,
info, m_CLCompileContext);
457 elementwiseBinaryQueueDescriptor->m_AdditionalInfoObject;
458 return std::make_unique<ClMultiplicationWorkload>(multiplicationQueueDescriptor,
468 elementwiseBinaryQueueDescriptor->m_AdditionalInfoObject;
469 return std::make_unique<ClSubtractionWorkload>(subtractionQueueDescriptor,
479 auto elementwiseUnaryQueueDescriptor
480 = PolymorphicDowncast<const ElementwiseUnaryQueueDescriptor*>(&descriptor);
482 switch(elementwiseUnaryQueueDescriptor->m_Parameters.m_Operation)
487 absQueueDescriptor.
m_Inputs = elementwiseUnaryQueueDescriptor->m_Inputs;
488 absQueueDescriptor.
m_Outputs = elementwiseUnaryQueueDescriptor->m_Outputs;
490 return std::make_unique<ClAbsWorkload>(absQueueDescriptor,
info, m_CLCompileContext);
493 return std::make_unique<ClExpWorkload>(*elementwiseUnaryQueueDescriptor,
info, m_CLCompileContext);
495 return std::make_unique<ClLogWorkload>(*elementwiseUnaryQueueDescriptor,
info, m_CLCompileContext);
497 return std::make_unique<ClLogicalNotWorkload>(*elementwiseUnaryQueueDescriptor,
501 return std::make_unique<ClNegWorkload>(*elementwiseUnaryQueueDescriptor,
info, m_CLCompileContext);
505 rsqrtQueueDescriptor.
m_Inputs = elementwiseUnaryQueueDescriptor->m_Inputs;
506 rsqrtQueueDescriptor.
m_Outputs = elementwiseUnaryQueueDescriptor->m_Outputs;
508 return std::make_unique<ClRsqrtWorkload>(rsqrtQueueDescriptor,
info, m_CLCompileContext);
511 return std::make_unique<ClSinWorkload>(*elementwiseUnaryQueueDescriptor,
info, m_CLCompileContext);
513 return std::make_unique<ClSqrtWorkload>(*elementwiseUnaryQueueDescriptor,
info, m_CLCompileContext);
520 auto fillQueueDescriptor = PolymorphicDowncast<const FillQueueDescriptor*>(&descriptor);
521 return std::make_unique<ClFillWorkload>(*fillQueueDescriptor,
info, m_CLCompileContext);
525 auto floorQueueDescriptor = PolymorphicDowncast<const FloorQueueDescriptor*>(&descriptor);
526 return MakeWorkload<ClFloorFloatWorkload, NullWorkload>(*floorQueueDescriptor,
info, m_CLCompileContext);
530 auto fullyConnectedQueueDescriptor
531 = PolymorphicDowncast<const FullyConnectedQueueDescriptor*>(&descriptor);
532 return MakeWorkload<ClFullyConnectedWorkload>(*fullyConnectedQueueDescriptor,
534 m_MemoryManager->GetIntraLayerManager(),
539 auto gatherQueueDescriptor = PolymorphicDowncast<const GatherQueueDescriptor*>(&descriptor);
540 return MakeWorkload<ClGatherWorkload>(*gatherQueueDescriptor,
info, m_CLCompileContext);
544 auto gatherNdQueueDescriptor = PolymorphicDowncast<const GatherNdQueueDescriptor*>(&descriptor);
545 return MakeWorkload<ClGatherNdWorkload>(*gatherNdQueueDescriptor,
info, m_CLCompileContext);
549 auto inputQueueDescriptor = PolymorphicDowncast<const InputQueueDescriptor*>(&descriptor);
550 return std::make_unique<CopyMemGenericWorkload>(*inputQueueDescriptor,
info);
554 auto instanceNormalizationQueueDescriptor
555 = PolymorphicDowncast<const InstanceNormalizationQueueDescriptor*>(&descriptor);
556 return MakeWorkload<ClInstanceNormalizationWorkload>(*instanceNormalizationQueueDescriptor,
562 auto l2NormalizationQueueDescriptor
563 = PolymorphicDowncast<const L2NormalizationQueueDescriptor*>(&descriptor);
564 return MakeWorkload<ClL2NormalizationFloatWorkload, NullWorkload>(*l2NormalizationQueueDescriptor,
570 auto logicalBinaryQueueDescriptor = PolymorphicDowncast<const LogicalBinaryQueueDescriptor*>(&descriptor);
572 switch(logicalBinaryQueueDescriptor->m_Parameters.m_Operation)
575 return std::make_unique<ClLogicalAndWorkload>(*logicalBinaryQueueDescriptor,
579 return std::make_unique<ClLogicalOrWorkload>(*logicalBinaryQueueDescriptor,
588 auto logSoftmaxQueueDescriptor = PolymorphicDowncast<const LogSoftmaxQueueDescriptor*>(&descriptor);
590 return MakeWorkload<ClLogSoftmaxWorkload>(*logSoftmaxQueueDescriptor,
592 m_MemoryManager->GetIntraLayerManager(),
597 auto lstmQueueDescriptor = PolymorphicDowncast<const LstmQueueDescriptor*>(&descriptor);
598 return MakeWorkload<ClLstmFloatWorkload, NullWorkload>(*lstmQueueDescriptor,
info, m_CLCompileContext);
602 auto maximumQueueDescriptor = PolymorphicDowncast<const MaximumQueueDescriptor*>(&descriptor);
603 return MakeWorkload<ClMaximumWorkload>(*maximumQueueDescriptor,
info, m_CLCompileContext);
607 auto meanQueueDescriptor = PolymorphicDowncast<const MeanQueueDescriptor*>(&descriptor);
608 return MakeWorkload<ClMeanWorkload>(*meanQueueDescriptor,
info, m_CLCompileContext);
612 auto memCopyQueueDescriptor = PolymorphicDowncast<const MemCopyQueueDescriptor*>(&descriptor);
613 if (memCopyQueueDescriptor->m_Inputs.empty() || !memCopyQueueDescriptor->m_Inputs[0])
617 return MakeWorkload<CopyMemGenericWorkload>(*memCopyQueueDescriptor,
info);
621 auto memImportQueueDescriptor = PolymorphicDowncast<const MemImportQueueDescriptor*>(&descriptor);
622 if (memImportQueueDescriptor->m_Inputs.empty() || !memImportQueueDescriptor->m_Inputs[0])
626 return std::make_unique<ImportMemGenericWorkload>(*memImportQueueDescriptor,
info);
630 auto minimumQueueDescriptor = PolymorphicDowncast<const MinimumQueueDescriptor*>(&descriptor);
631 return MakeWorkload<ClMinimumWorkload>(*minimumQueueDescriptor,
info, m_CLCompileContext);
635 auto multiplicationQueueDescriptor = PolymorphicDowncast<const MultiplicationQueueDescriptor*>(&descriptor);
636 return MakeWorkload<ClMultiplicationWorkload>(*multiplicationQueueDescriptor,
info, m_CLCompileContext);
640 auto normalizationQueueDescriptor = PolymorphicDowncast<const NormalizationQueueDescriptor*>(&descriptor);
641 return MakeWorkload<ClNormalizationFloatWorkload, NullWorkload>(*normalizationQueueDescriptor,
647 auto outputQueueDescriptor = PolymorphicDowncast<const OutputQueueDescriptor*>(&descriptor);
648 return std::make_unique<CopyMemGenericWorkload>(*outputQueueDescriptor,
info);
652 auto padQueueDescriptor = PolymorphicDowncast<const PadQueueDescriptor*>(&descriptor);
653 return MakeWorkload<ClPadWorkload>(*padQueueDescriptor,
info, m_CLCompileContext);
657 auto permuteQueueDescriptor = PolymorphicDowncast<const PermuteQueueDescriptor*>(&descriptor);
658 return MakeWorkload<ClPermuteWorkload>(*permuteQueueDescriptor,
info, m_CLCompileContext);
662 auto pooling2dQueueDescriptor = PolymorphicDowncast<const Pooling2dQueueDescriptor*>(&descriptor);
663 return MakeWorkload<ClPooling2dWorkload>(*pooling2dQueueDescriptor,
info, m_CLCompileContext);
667 auto pooling3dQueueDescriptor = PolymorphicDowncast<const Pooling3dQueueDescriptor*>(&descriptor);
668 return MakeWorkload<ClPooling3dWorkload>(*pooling3dQueueDescriptor,
info, m_CLCompileContext);
672 auto preCompiledQueueDescriptor = PolymorphicDowncast<const PreCompiledQueueDescriptor*>(&descriptor);
673 return MakeWorkload<NullWorkload, NullWorkload>(*preCompiledQueueDescriptor,
info, m_CLCompileContext);
677 auto preluQueueDescriptor = PolymorphicDowncast<const PreluQueueDescriptor*>(&descriptor);
678 return MakeWorkload<ClPreluWorkload>(*preluQueueDescriptor,
info, m_CLCompileContext);
682 auto qLstmQueueDescriptor = PolymorphicDowncast<const QLstmQueueDescriptor*>(&descriptor);
683 return std::make_unique<ClQLstmWorkload>(*qLstmQueueDescriptor,
info, m_CLCompileContext);
687 auto quantizeQueueDescriptor = PolymorphicDowncast<const QuantizeQueueDescriptor*>(&descriptor);
688 return MakeWorkload<ClQuantizeWorkload>(*quantizeQueueDescriptor,
info, m_CLCompileContext);
692 auto quantizedLstmQueueDescriptor = PolymorphicDowncast<const QuantizedLstmQueueDescriptor*>(&descriptor);
693 return MakeWorkload<ClQuantizedLstmWorkload>(*quantizedLstmQueueDescriptor,
info, m_CLCompileContext);
697 auto rankQueueDescriptor = PolymorphicDowncast<const RankQueueDescriptor*>(&descriptor);
698 return std::make_unique<ClRankWorkload>(*rankQueueDescriptor,
info);
702 auto reduceQueueDescriptor = PolymorphicDowncast<const ReduceQueueDescriptor*>(&descriptor);
703 return std::make_unique<ClReduceWorkload>(*reduceQueueDescriptor,
info);
707 auto reshapeQueueDescriptor = PolymorphicDowncast<const ReshapeQueueDescriptor*>(&descriptor);
708 return MakeWorkload<ClReshapeWorkload>(*reshapeQueueDescriptor,
info, m_CLCompileContext);
712 auto resizeQueueDescriptor = PolymorphicDowncast<const ResizeQueueDescriptor*>(&descriptor);
713 return MakeWorkload<ClResizeWorkload>(*resizeQueueDescriptor,
info, m_CLCompileContext);
717 auto sliceQueueDescriptor = PolymorphicDowncast<const SliceQueueDescriptor*>(&descriptor);
718 return MakeWorkload<ClSliceWorkload>(*sliceQueueDescriptor,
info, m_CLCompileContext);
722 auto softmaxQueueDescriptor = PolymorphicDowncast<const SoftmaxQueueDescriptor*>(&descriptor);
723 return std::make_unique<ClSoftmaxWorkload>(*softmaxQueueDescriptor,
725 m_MemoryManager->GetIntraLayerManager(),
730 auto spaceToBatchNdQueueDescriptor
731 = PolymorphicDowncast<const SpaceToBatchNdQueueDescriptor*>(&descriptor);
732 return MakeWorkload<ClSpaceToBatchNdWorkload>(*spaceToBatchNdQueueDescriptor,
info, m_CLCompileContext);
736 auto spaceToDepthQueueDescriptor = PolymorphicDowncast<const SpaceToDepthQueueDescriptor*>(&descriptor);
737 return MakeWorkload<ClSpaceToDepthWorkload>(*spaceToDepthQueueDescriptor,
info, m_CLCompileContext);
741 auto splitterQueueDescriptor = PolymorphicDowncast<const SplitterQueueDescriptor*>(&descriptor);
742 return MakeWorkload<ClSplitterWorkload>(*splitterQueueDescriptor,
info, m_CLCompileContext);
746 auto stackQueueDescriptor = PolymorphicDowncast<const StackQueueDescriptor*>(&descriptor);
747 return MakeWorkload<ClStackWorkload>(*stackQueueDescriptor,
info, m_CLCompileContext);
751 auto stridedSliceQueueDescriptor = PolymorphicDowncast<const StridedSliceQueueDescriptor*>(&descriptor);
752 return MakeWorkload<ClStridedSliceWorkload>(*stridedSliceQueueDescriptor,
info, m_CLCompileContext);
756 auto subtractionQueueDescriptor = PolymorphicDowncast<const SubtractionQueueDescriptor*>(&descriptor);
757 return MakeWorkload<ClSubtractionWorkload>(*subtractionQueueDescriptor,
info, m_CLCompileContext);
761 auto transposeQueueDescriptor = PolymorphicDowncast<const TransposeQueueDescriptor*>(&descriptor);
762 return MakeWorkload<ClTransposeWorkload>(*transposeQueueDescriptor,
info, m_CLCompileContext);
766 auto transposeConvolution2dQueueDescriptor
767 = PolymorphicDowncast<const TransposeConvolution2dQueueDescriptor*>(&descriptor);
768 return MakeWorkload<ClTransposeConvolution2dWorkload>(*transposeConvolution2dQueueDescriptor,
770 m_MemoryManager->GetIntraLayerManager(),
775 auto desc = PolymorphicDowncast<const UnidirectionalSequenceLstmQueueDescriptor*>(&descriptor);
776 return MakeWorkloadHelper<ClUnidirectionalSequenceLstmFloatWorkload, NullWorkload>(*desc,
788 return MakeWorkload<ClActivationWorkload>(descriptor, info, m_CLCompileContext);
792 const WorkloadInfo& info)
const
794 return MakeWorkload<ClAdditionWorkload>(descriptor, info, m_CLCompileContext);
798 const WorkloadInfo& info)
const
800 return std::make_unique<ClArgMinMaxWorkload>(descriptor, info, m_CLCompileContext);
804 const BatchNormalizationQueueDescriptor& descriptor,
805 const WorkloadInfo& info)
const
807 return MakeWorkload<ClBatchNormalizationFloatWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
811 const WorkloadInfo& info)
const
813 return MakeWorkload<ClBatchToSpaceNdWorkload>(descriptor, info, m_CLCompileContext);
// Creates the workload for a Cast layer.
//   descriptor - queue descriptor for the cast operation.
//   info       - workload info forwarded to the workload constructor.
// Returns an owning IWorkload pointer to a ClCastWorkload, built through
// MakeWorkload with the factory's CL compile context as the extra argument.
816 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateCast(
const CastQueueDescriptor& descriptor,
817 const WorkloadInfo& info)
const
819 return MakeWorkload<ClCastWorkload>(descriptor, info, m_CLCompileContext);
// Creates the workload for a ChannelShuffle layer.
//   descriptor - queue descriptor for the channel-shuffle operation.
//   info       - workload info forwarded to the workload constructor.
// Returns an owning IWorkload pointer to a ClChannelShuffleWorkload, built
// through MakeWorkload with the factory's CL compile context as the extra argument.
822 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateChannelShuffle(
const ChannelShuffleQueueDescriptor& descriptor,
823 const WorkloadInfo& info)
const
825 return MakeWorkload<ClChannelShuffleWorkload>(descriptor, info, m_CLCompileContext);
829 const WorkloadInfo& info)
const
831 return MakeWorkload<ClComparisonWorkload>(descriptor, info, m_CLCompileContext);
835 const WorkloadInfo& info)
const
837 return MakeWorkload<ClConcatWorkload>(descriptor, info, m_CLCompileContext);
841 const WorkloadInfo& info)
const
843 return MakeWorkload<ClConstantWorkload>(descriptor, info, m_CLCompileContext);
847 const ConvertFp16ToFp32QueueDescriptor& descriptor,
848 const WorkloadInfo& info)
const
850 return MakeWorkload<ClConvertFp16ToFp32Workload>(descriptor, info, m_CLCompileContext);
854 const ConvertFp32ToFp16QueueDescriptor& descriptor,
855 const WorkloadInfo& info)
const
857 return MakeWorkload<ClConvertFp32ToFp16Workload>(descriptor, info, m_CLCompileContext);
861 const WorkloadInfo& info)
const
863 bool isFastMathEnabled =
false;
864 if (m_ModelContextPtr)
866 if (m_ModelContextPtr.get() !=
nullptr)
868 auto modelOptions =
dynamic_cast<ClBackendModelContext*
>(m_ModelContextPtr.get());
871 isFastMathEnabled = modelOptions->IsFastMathEnabled();
875 return MakeWorkload<ClConvolution2dWorkload>(descriptor,
877 m_MemoryManager->GetIntraLayerManager(),
// Creates the workload for a Convolution3d layer.
//   descriptor - queue descriptor for the 3D convolution.
//   info       - workload info forwarded to the workload constructor.
// Builds a ClConvolution3dWorkload through MakeWorkload. The fast-math flag is
// taken from the backend model options when a model context is attached and
// defaults to false otherwise.
882 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvolution3d(
const Convolution3dQueueDescriptor& descriptor,
883 const WorkloadInfo& info)
const
// Fast math stays off unless the model options say otherwise.
885 bool isFastMathEnabled =
false;
// NOTE(review): the next two conditions both test m_ModelContextPtr for
// non-null; the second looks redundant - confirm before simplifying.
886 if (m_ModelContextPtr)
888 if (m_ModelContextPtr.get() !=
nullptr)
// NOTE(review): dynamic_cast yields nullptr on a type mismatch; a guard before
// the dereference below is not visible here - confirm one exists.
890 auto modelOptions =
dynamic_cast<ClBackendModelContext*
>(m_ModelContextPtr.get());
893 isFastMathEnabled = modelOptions->IsFastMathEnabled();
// The intra-layer memory manager is passed as an extra constructor argument;
// the remaining arguments of this call are not visible here.
897 return MakeWorkload<ClConvolution3dWorkload>(descriptor,
899 m_MemoryManager->GetIntraLayerManager(),
905 const WorkloadInfo& info)
const
907 return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
911 const WorkloadInfo& info)
const
913 return MakeWorkload<ClDepthToSpaceWorkload>(descriptor, info, m_CLCompileContext);
917 const DepthwiseConvolution2dQueueDescriptor& descriptor,
918 const WorkloadInfo& info)
const
920 return MakeWorkload<ClDepthwiseConvolutionWorkload>(descriptor, info, m_CLCompileContext);
924 const WorkloadInfo& info)
const
926 return MakeWorkload<ClDequantizeWorkload>(descriptor, info, m_CLCompileContext);
930 const DetectionPostProcessQueueDescriptor& descriptor,
931 const WorkloadInfo& info)
const
933 return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
937 const WorkloadInfo& info)
const
939 return std::make_unique<ClDivisionWorkload>(descriptor, info, m_CLCompileContext);
943 const WorkloadInfo& info)
const
945 switch(descriptor.m_Parameters.m_Operation)
949 AbsQueueDescriptor absQueueDescriptor;
950 absQueueDescriptor.m_Inputs = descriptor.m_Inputs;
951 absQueueDescriptor.m_Outputs = descriptor.m_Outputs;
953 return std::make_unique<ClAbsWorkload>(absQueueDescriptor, info, m_CLCompileContext);
956 return std::make_unique<ClExpWorkload>(descriptor, info, m_CLCompileContext);
958 return std::make_unique<ClLogWorkload>(descriptor, info, m_CLCompileContext);
960 return std::make_unique<ClLogicalNotWorkload>(descriptor, info, m_CLCompileContext);
962 return std::make_unique<ClNegWorkload>(descriptor, info, m_CLCompileContext);
965 RsqrtQueueDescriptor rsqrtQueueDescriptor;
966 rsqrtQueueDescriptor.m_Inputs = descriptor.m_Inputs;
967 rsqrtQueueDescriptor.m_Outputs = descriptor.m_Outputs;
969 return std::make_unique<ClRsqrtWorkload>(rsqrtQueueDescriptor, info, m_CLCompileContext);
972 return std::make_unique<ClSinWorkload>(descriptor, info, m_CLCompileContext);
// Creates the workload for a Fill layer.
//   descriptor - queue descriptor for the fill operation.
//   info       - workload info forwarded to the workload constructor.
// Returns an owning IWorkload pointer to a ClFillWorkload. The workload is
// constructed directly with std::make_unique (not via MakeWorkload) and is
// given the factory's CL compile context.
978 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateFill(
const FillQueueDescriptor& descriptor,
979 const WorkloadInfo& info)
const
981 return std::make_unique<ClFillWorkload>(descriptor, info, m_CLCompileContext);
985 const WorkloadInfo& info)
const
987 return MakeWorkload<ClFloorFloatWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
991 const WorkloadInfo& info)
const
993 return MakeWorkload<ClFullyConnectedWorkload>(descriptor,
995 m_MemoryManager->GetIntraLayerManager(),
1000 const WorkloadInfo& info)
const
1002 return MakeWorkload<ClGatherWorkload>(descriptor, info, m_CLCompileContext);
1006 const WorkloadInfo& info)
const
1008 return std::make_unique<CopyMemGenericWorkload>(descriptor, info);
1012 const InstanceNormalizationQueueDescriptor& descriptor,
1013 const WorkloadInfo& info)
const
1015 return MakeWorkload<ClInstanceNormalizationWorkload>(descriptor, info, m_CLCompileContext);
1019 const WorkloadInfo& info)
const
1021 return MakeWorkload<ClL2NormalizationFloatWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
// Creates the workload for a LogicalBinary layer.
//   descriptor - queue descriptor; its m_Parameters.m_Operation selects the
//                concrete workload.
//   info       - workload info forwarded to the workload constructor.
// Returns an owning IWorkload pointer to a ClLogicalAndWorkload or a
// ClLogicalOrWorkload, each constructed directly with std::make_unique and
// given the factory's CL compile context.
// NOTE(review): the case labels and the result for any other operation value
// are not visible here - confirm against the full switch.
1024 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateLogicalBinary(
const LogicalBinaryQueueDescriptor& descriptor,
1025 const WorkloadInfo& info)
const
// Dispatch on the requested logical operation.
1027 switch(descriptor.m_Parameters.m_Operation)
1030 return std::make_unique<ClLogicalAndWorkload>(descriptor, info, m_CLCompileContext);
1032 return std::make_unique<ClLogicalOrWorkload>(descriptor, info, m_CLCompileContext);
1039 const WorkloadInfo& info)
const
1041 return MakeWorkload<ClLogSoftmaxWorkload>(descriptor,
1043 m_MemoryManager->GetIntraLayerManager(),
1044 m_CLCompileContext);
1048 const WorkloadInfo& info)
const
1050 return MakeWorkload<ClLstmFloatWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
1054 const WorkloadInfo& info)
const
1056 return MakeWorkload<ClMaximumWorkload>(descriptor, info, m_CLCompileContext);
1060 const WorkloadInfo& info)
const
1062 return MakeWorkload<ClMeanWorkload>(descriptor, info, m_CLCompileContext);
1066 const WorkloadInfo& info)
const
1068 if (descriptor.m_Inputs.empty() || !descriptor.m_Inputs[0])
1070 throw InvalidArgumentException(
"ClWorkloadFactory: Invalid null input for MemCopy workload");
1073 return MakeWorkload<CopyMemGenericWorkload>(descriptor, info);
1077 const WorkloadInfo& info)
const
1079 if (descriptor.m_Inputs.empty() || !descriptor.m_Inputs[0])
1081 throw InvalidArgumentException(
"ClWorkloadFactory: Invalid null input for MemImport workload");
1084 return std::make_unique<ImportMemGenericWorkload>(descriptor, info);
1088 const WorkloadInfo& info)
const
1090 return MakeWorkload<ClMinimumWorkload>(descriptor, info, m_CLCompileContext);
1094 const WorkloadInfo& info)
const
1096 return MakeWorkload<ClMultiplicationWorkload>(descriptor, info, m_CLCompileContext);
1100 const WorkloadInfo& info)
const
1102 return MakeWorkload<ClNormalizationFloatWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
1106 const WorkloadInfo& info)
const
1108 return std::make_unique<CopyMemGenericWorkload>(descriptor, info);
1112 const WorkloadInfo& info)
const
1114 return MakeWorkload<ClPadWorkload>(descriptor, info, m_CLCompileContext);
1118 const WorkloadInfo& info)
const
1120 return MakeWorkload<ClPermuteWorkload>(descriptor, info, m_CLCompileContext);
1124 const WorkloadInfo& info)
const
1126 return MakeWorkload<ClPooling2dWorkload>(descriptor, info, m_CLCompileContext);
1130 const WorkloadInfo& info)
const
1132 return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info, m_CLCompileContext);
1136 const WorkloadInfo &info)
const
1138 return MakeWorkload<ClPreluWorkload>(descriptor, info, m_CLCompileContext);
// Creates the workload for a QLstm layer.
//   descriptor - queue descriptor for the QLstm operation.
//   info       - workload info forwarded to the workload constructor.
// Returns an owning IWorkload pointer to a ClQLstmWorkload. The workload is
// constructed directly with std::make_unique (not via MakeWorkload) and is
// given the factory's CL compile context.
1141 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateQLstm(
const QLstmQueueDescriptor& descriptor,
1142 const WorkloadInfo& info)
const
1144 return std::make_unique<ClQLstmWorkload>(descriptor, info, m_CLCompileContext);
1148 const WorkloadInfo& info)
const
1150 return MakeWorkload<ClQuantizeWorkload>(descriptor, info, m_CLCompileContext);
1154 const WorkloadInfo& info)
const
1156 return MakeWorkload<ClQuantizedLstmWorkload>(descriptor, info, m_CLCompileContext);
1160 const WorkloadInfo& info)
const
1162 return std::make_unique<ClRankWorkload>(descriptor, info);
// Creates the workload for a Reduce layer.
//   descriptor - queue descriptor for the reduce operation.
//   info       - workload info forwarded to the workload constructor.
// Returns an owning IWorkload pointer to a ClReduceWorkload, constructed
// directly with std::make_unique. Only the descriptor and info are passed;
// the CL compile context is not handed to this workload.
1165 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateReduce(
const ReduceQueueDescriptor& descriptor,
1166 const WorkloadInfo& info)
const
1168 return std::make_unique<ClReduceWorkload>(descriptor, info);
1172 const WorkloadInfo& info)
const
1174 return MakeWorkload<ClReshapeWorkload>(descriptor, info, m_CLCompileContext);
1178 const WorkloadInfo& info)
const
1180 return MakeWorkload<ClResizeWorkload>(descriptor, info, m_CLCompileContext);
1184 const WorkloadInfo& info)
const
1186 return MakeWorkload<ClSliceWorkload>(descriptor, info, m_CLCompileContext);
1190 const WorkloadInfo& info)
const
1192 return std::make_unique<ClSoftmaxWorkload>(descriptor,
1194 m_MemoryManager->GetIntraLayerManager(),
1195 m_CLCompileContext);
1199 const WorkloadInfo& info)
const
1201 return MakeWorkload<ClSpaceToBatchNdWorkload>(descriptor, info, m_CLCompileContext);
1205 const WorkloadInfo& info)
const
1207 return MakeWorkload<ClSpaceToDepthWorkload>(descriptor, info, m_CLCompileContext);
1211 const WorkloadInfo& info)
const
1213 return MakeWorkload<ClSplitterWorkload>(descriptor, info, m_CLCompileContext);
1217 const WorkloadInfo& info)
const
1219 return MakeWorkload<ClStackWorkload>(descriptor, info, m_CLCompileContext);
1223 const WorkloadInfo& info)
const
1225 return MakeWorkload<ClStridedSliceWorkload>(descriptor, info, m_CLCompileContext);
1229 const WorkloadInfo& info)
const
1231 return MakeWorkload<ClSubtractionWorkload>(descriptor, info, m_CLCompileContext);
1235 const WorkloadInfo& info)
const
1237 return MakeWorkload<ClTransposeWorkload>(descriptor, info, m_CLCompileContext);
1241 const TransposeConvolution2dQueueDescriptor& descriptor,
1242 const WorkloadInfo& info)
const
1244 return MakeWorkload<ClTransposeConvolution2dWorkload>(descriptor,
1246 m_MemoryManager->GetIntraLayerManager(),
1247 m_CLCompileContext);