From e1ac86975cb8e19a76c1389073147ba306573b3f Mon Sep 17 00:00:00 2001 From: Orlaith Monahan Date: Tue, 23 Jan 2024 13:52:30 +0000 Subject: IVGCVSW-7628 Update GpuFsa to use ACLs latest fixes * Updates to the existing GpuFsa backend to incorporate patch: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10990 * Update the ACL pin to include the patch with the fixes Change-Id: I08d111265f4617657ee7f20249aeb111f64ba7a9 Signed-off-by: David Monahan --- scripts/get_compute_library.sh | 2 +- src/backends/gpuFsa/GpuFsaBackend.hpp | 12 ++-- src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp | 72 ++++++++++------------ .../gpuFsa/workloads/GpuFsaPreCompiledWorkload.cpp | 40 +++++++----- 4 files changed, 66 insertions(+), 60 deletions(-) diff --git a/scripts/get_compute_library.sh b/scripts/get_compute_library.sh index 811988e337..c5e2da1471 100755 --- a/scripts/get_compute_library.sh +++ b/scripts/get_compute_library.sh @@ -10,7 +10,7 @@ CMD=$( basename "$0" ) #DEFAULT_CLFRAMEWORKREVISION="branches/arm_compute_23_11" # Release 23.11 # # For pinning to a revision use this: -DEFAULT_CLFRAMEWORKREVISION="3a704ae94fc5cbfce1053886f543b31977f1774c" #10989: Update Documentation for 24.01 release +DEFAULT_CLFRAMEWORKREVISION="fdf56fb9d414a754e7cedfdc1351ab0ce2866a0c" #10990: Make GpuWorkloadContext own all tensor info objects usage() { echo -e "get_compute_library.sh: Clones the Arm Compute Library (ACL) repo from the ML Platform server and checks out diff --git a/src/backends/gpuFsa/GpuFsaBackend.hpp b/src/backends/gpuFsa/GpuFsaBackend.hpp index 4c2a5f02e3..f5a866b186 100644 --- a/src/backends/gpuFsa/GpuFsaBackend.hpp +++ b/src/backends/gpuFsa/GpuFsaBackend.hpp @@ -23,10 +23,10 @@ namespace armnn /** * A structure which contains all the elements needed to execute a fused workload in the GpuFsa Backend * - * @param[in, out] sketch A unique pointer to the sketch containing the operators which have been fused. 
- * @param[in, out] TensorInfos A shared pointer to a GpuWorkloadContext which contains TensorInfos - * @param[in, out] inputIds A unique pointer to a vector of input Ids used to access workloadContext TensorInfos - * @param[in, out] outputIds A unique pointer to a vector of output Ids used to access workloadContext TensorInfos + * @param[in, out] sketch A unique pointer to the sketch containing the operators which have been fused. + * @param[in, out] TensorInfos A shared pointer to a GpuWorkloadContext which creates + stores TensorInfos + * @param[in, out] inputTensorInfos A unique pointer to a vector of inputTensorInfos used by the sketch + * @param[in, out] outputTensorInfos A unique pointer to a vector of outputTensorInfos used by the sketch * */ struct GpuFsaPreCompiledBlob @@ -34,8 +34,8 @@ struct GpuFsaPreCompiledBlob std::unique_ptr sketch = nullptr; std::shared_ptr workloadContext = nullptr; - std::unique_ptr> inputIds = nullptr; - std::unique_ptr> outputIds = nullptr; + std::unique_ptr> inputTensorInfos = nullptr; + std::unique_ptr> outputTensorInfos = nullptr; }; // add new capabilities here.. 
diff --git a/src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp b/src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp index c7137d7ac8..7aa643dcb1 100644 --- a/src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp +++ b/src/backends/gpuFsa/layers/GpuFsaConvolution2d.cpp @@ -17,7 +17,6 @@ #include #include -#include #include #include @@ -49,8 +48,7 @@ arm_compute::Status GpuFsaConvolution2dValidate(const TensorInfo& input, // Only create the bias tensor info if enabled, otherwise pass nullptr to validate_op arm_compute::TensorInfo aclBiasInfo; - arm_compute::TensorInfo biasSketchInfo; - arm_compute::TensorInfo* biasSketchInfoPtr = nullptr; + arm_compute::ITensorInfo* biasSketchInfoPtr = nullptr; if (descriptor.m_BiasEnabled) { @@ -61,8 +59,7 @@ arm_compute::Status GpuFsaConvolution2dValidate(const TensorInfo& input, aclBiasInfo = BuildArmComputeTensorInfo(biases.value(), descriptor.m_DataLayout); aclBiasInfo.set_are_values_constant(biases.value().IsConstant()); - biasSketchInfo = workloadContext.create_tensor_info(aclBiasInfo); - biasSketchInfoPtr = &biasSketchInfo; + biasSketchInfoPtr = workloadContext.create_tensor_info(aclBiasInfo); } // Set Conv2d attributes using descriptor @@ -78,8 +75,8 @@ arm_compute::Status GpuFsaConvolution2dValidate(const TensorInfo& input, // Validate operator, check status and update reasonIfUnsupported arm_compute::Status aclStatus = GpuConv2d::validate_op(sketch, - &inputInfo, - &weightInfo, + inputInfo, + weightInfo, biasSketchInfoPtr, conv2DAttributes); @@ -93,34 +90,32 @@ void GpuFsaConvolution2dCreateOp(GpuFsaPreCompiledBlob* blob, const Optional& biases) { /* - * Creating an Op for the GpuFds backend requires us to create and maintain quite a bit of data, which is then stored + * Creating an Op for the GpuFsa backend requires us to create and maintain quite a bit of data, which is then stored * in a GpuFsaPreCompiledBlob for execution later. 
Specifically we need: * GpuWorkloadContext, this contains the TensorInfos and is unique to the Graph being executed * Sketch, this is similar to a subgraph and can contain one or more operations. Multiple ops can be "fused" together * using a single sketch. - * The TensorInfoIds, these are the ids of the TensorInfos used when creating the sketch. They refer to the TensorInfos - * stored within the GpuWorkloadContext and are used to fetch them later when executing the sketch. + * The inputTensorInfos / outputTensorInfos, these are pointers to the TensorInfos used when creating the sketch. + * They refer to the TensorInfos stored within the GpuWorkloadContext and are needed when executing the sketch + * as the TensorInfos used when creating the Tensors must match those used to create the Sketch. Otherwise the runtime + * doesn't know which Tensors to use. */ using namespace arm_compute::experimental::dynamic_fusion; GpuWorkloadSketch* sketch = blob->sketch.get(); GpuWorkloadContext* workloadContext = blob->workloadContext.get(); - std::vector inputIds = {}; - std::vector outputIds = {}; + std::vector inputTensorInfos = {}; + std::vector outputTensorInfos = {}; // Build and create tensor infos using the sketch const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input, descriptor.m_DataLayout); arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights, descriptor.m_DataLayout); aclWeightsInfo.set_are_values_constant(weights.IsConstant()); - auto inputInfo = workloadContext->create_tensor_info(aclInputInfo); - aclWeightsInfo.set_are_values_constant(weights.IsConstant()); - inputIds.emplace_back(inputInfo.id()); - auto weightInfo = workloadContext->create_tensor_info(aclWeightsInfo); - inputIds.emplace_back(weightInfo.id()); + inputTensorInfos.emplace_back(workloadContext->create_tensor_info(aclInputInfo)); + inputTensorInfos.emplace_back(workloadContext->create_tensor_info(aclWeightsInfo)); - // Only create the bias tensor info if 
enabled, otherwise pass nullptr to validate_op + // Only create the bias tensor info if enabled, otherwise pass nullptr to validate_op / create_op arm_compute::TensorInfo aclBiasInfo; - arm_compute::TensorInfo biasSketchInfo; arm_compute::ITensorInfo* biasSketchInfoPtr = nullptr; if (descriptor.m_BiasEnabled) @@ -132,9 +127,8 @@ void GpuFsaConvolution2dCreateOp(GpuFsaPreCompiledBlob* blob, aclBiasInfo = BuildArmComputeTensorInfo(biases.value(), descriptor.m_DataLayout); aclBiasInfo.set_are_values_constant(biases.value().IsConstant()); - biasSketchInfo = workloadContext->create_tensor_info(aclBiasInfo); - inputIds.emplace_back(biasSketchInfo.id()); - biasSketchInfoPtr = workloadContext->implementation().get_tensor_info(biasSketchInfo.id()); + inputTensorInfos.emplace_back(workloadContext->create_tensor_info(aclBiasInfo)); + biasSketchInfoPtr = inputTensorInfos[2]; } // Set Conv2d attributes using descriptor @@ -149,12 +143,11 @@ void GpuFsaConvolution2dCreateOp(GpuFsaPreCompiledBlob* blob, conv2DAttributes.stride(aclStrideInfo); // Validate operator, check status and update reasonIfUnsupported - arm_compute::Status aclStatus = - GpuConv2d::validate_op(*sketch, - workloadContext->implementation().get_tensor_info(inputInfo.id()), - workloadContext->implementation().get_tensor_info(weightInfo.id()), - biasSketchInfoPtr, - conv2DAttributes); + arm_compute::Status aclStatus = GpuConv2d::validate_op(*sketch, + inputTensorInfos[0], + inputTensorInfos[1], + biasSketchInfoPtr, + conv2DAttributes); const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK); if (!supported) @@ -162,19 +155,20 @@ void GpuFsaConvolution2dCreateOp(GpuFsaPreCompiledBlob* blob, throw BackendCapabilityException("\"GpuFsa\" backend failed during Convolution2D operation validation"); } - arm_compute::ITensorInfo* convOutInfo = - GpuConv2d::create_op(*sketch, - workloadContext->implementation().get_tensor_info(inputInfo.id()), - 
workloadContext->implementation().get_tensor_info(weightInfo.id()), - biasSketchInfoPtr, - conv2DAttributes); + // Create the Op within the Sketch using the TensorInfos we have stored + arm_compute::ITensorInfo* convOutInfo = GpuConv2d::create_op(*sketch, + inputTensorInfos[0], + inputTensorInfos[1], + biasSketchInfoPtr, + conv2DAttributes); - arm_compute::TensorInfo outputDstInfo = workloadContext->create_tensor_info(); - outputIds.emplace_back(outputDstInfo.id()); + // Create the Output + outputTensorInfos.emplace_back(workloadContext->create_tensor_info()); + GpuOutput::create_op(*sketch, convOutInfo, outputTensorInfos[0]); - GpuOutput::create_op(*sketch, convOutInfo, workloadContext->implementation().get_tensor_info(outputDstInfo.id())); - blob->inputIds = std::make_unique>(inputIds); - blob->outputIds = std::make_unique>(outputIds); + // Store the TensorInfos within the blob as unique_ptrs to be used later + blob->inputTensorInfos = std::make_unique>(inputTensorInfos); + blob->outputTensorInfos = std::make_unique>(outputTensorInfos); } } // namespace armnn diff --git a/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.cpp b/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.cpp index 20386b5d86..1663044a17 100644 --- a/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.cpp +++ b/src/backends/gpuFsa/workloads/GpuFsaPreCompiledWorkload.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -21,7 +22,6 @@ #include #include -#include #include #include #include @@ -48,26 +48,38 @@ void GpuFsaPreCompiledWorkload::Execute() const * First we need all of the data stored within the PreCompiled blob which was used to setup the workload, namely: * The GpuWorkloadContext, this is a context which contains the TensorInfos and is unique to the graph being run * The Sketch, this can contain one or many ops and acts as a subgraph within the context - * The TensorInfoIds, These are the ids of the TensorInfos used during the creation of 
the Sketch and stored within - * the context. + * The inputTensorInfos / outputTensorInfos, These are vectors containing the TensorInfos used when creating the sketch + * * It is very important that the Tensors passed into the Runtime being used to execute this sketch are created with * the same TensorInfos as used when creating the sketch. We do this by creating new tensors, getting the original - * TensorInfos from the GpuWorkloadContext via their ids, and then importing the buffers from our own TensorHandles - * directly into these newly created Tensors. This allows us to link the externally visible Tensors from ArmNN to the - * Tensors which are needed to execute with the Sketch. + * TensorInfos from the vectors of tensorInfos stored in the blob, and then importing the buffers from our own + * TensorHandles directly into these newly created Tensors. This allows us to link the externally visible Tensors + * from ArmNN to the Tensors which are needed to execute with the Sketch. * */ using namespace arm_compute::experimental::dynamic_fusion; // Get the runtime and configure it with the precompiled sketch ClWorkloadRuntime runtime; GpuFsaPreCompiledBlob *preCompiledBlob = static_cast(m_Data.m_PreCompiledObject); - auto workloadContext = - &(preCompiledBlob->workloadContext->implementation()); auto sketch = preCompiledBlob->sketch.release(); - std::vector inputIds = *(preCompiledBlob->inputIds.get()); - std::vector outputIds = *(preCompiledBlob->outputIds.get()); auto status = runtime.configure(*sketch); + // Get the TensorInfos stored within the PreCompiledBlob and check they're the right size + auto inputTensorInfos = preCompiledBlob->inputTensorInfos.get(); + auto outputTensorInfos = preCompiledBlob->outputTensorInfos.get(); + if (inputTensorInfos->size() != m_Data.m_Inputs.size()) + { + throw InvalidArgumentException(fmt::format("GpuFsaPreCompiledWorkload::Execute: The number of inputTensorInfos" + " {} does not match the number of inputs {}.", + 
inputTensorInfos->size(), m_Data.m_Inputs.size())); + } + if (outputTensorInfos->size() != m_Data.m_Outputs.size()) + { + throw InvalidArgumentException(fmt::format("GpuFsaPreCompiledWorkload::Execute: The number of outputTensorInfos" + " {} does not match the number of outputs {}.", + outputTensorInfos->size(), m_Data.m_Outputs.size())); + } + // (Important) Allocate auxiliary tensor memory if there are any for(auto &data : runtime.get_auxiliary_tensors()) { @@ -85,8 +97,8 @@ void GpuFsaPreCompiledWorkload::Execute() const for (uint32_t inputSlotIdx = 0; inputSlotIdx < m_Data.m_Inputs.size(); ++inputSlotIdx) { arm_compute::CLTensor* input = new arm_compute::CLTensor{}; - input->allocator()->init(*(dynamic_cast( - workloadContext->get_tensor_info(inputIds[inputSlotIdx])))); + // inputTensorInfos is a ptr to a vector of ptrs, so we need to do a double dereference + input->allocator()->init(*((*inputTensorInfos)[inputSlotIdx])); auto* inputHandle = PolymorphicDowncast(m_Data.m_Inputs[inputSlotIdx]); input->allocator()->import_memory(inputHandle->GetTensor().cl_buffer()); inputsWeightsOutputs.emplace_back(std::move(input)); @@ -95,8 +107,8 @@ void GpuFsaPreCompiledWorkload::Execute() const for (uint32_t outputSlotIdx = 0; outputSlotIdx < m_Data.m_Outputs.size(); ++outputSlotIdx) { arm_compute::CLTensor* output = new arm_compute::CLTensor{}; - output->allocator()->init(*(dynamic_cast( - workloadContext->get_tensor_info(outputIds[outputSlotIdx])))); + // outputTensorInfos is a ptr to a vector of ptrs, so we need to do a double dereference + output->allocator()->init(*((*outputTensorInfos)[outputSlotIdx])); auto* outputHandle = PolymorphicDowncast(m_Data.m_Outputs[outputSlotIdx]); output->allocator()->import_memory(outputHandle->GetTensor().cl_buffer()); inputsWeightsOutputs.emplace_back(std::move(output)); -- cgit v1.2.1