diff options
author | Philip Hall <philip.hall@arm.com> | 2024-04-24 14:04:03 +0100 |
---|---|---|
committer | tim.hall <tim.hall@arm.com> | 2024-05-28 14:48:29 +0000 |
commit | 254b29f9deb79c07afe6a0fa15cd20d7cad99693 (patch) | |
tree | c52bd7a05ebb94f0139586c4857238025fe7cf95 | |
parent | 8d303886fc6f29234d75aa78eabe853b7604177d (diff) | |
download | ethos-u-vela-dev/ethos_u85.tar.gz |
MLBEDSW-8883: Block config selection for elementwise (branch: dev/ethos_u85)
- Elementwise operations are largely immune to block
configuration settings unless broadcasting. This
commit chooses more optimal blocks for each of the
supported broadcast dimensions.
Signed-off-by: Philip Hall <philip.hall@arm.com>
Change-Id: Idbf51b5b892aa6be93b31eb17956612b682a1492
-rw-r--r-- | ethosu/regor/architecture/ethosu85/ethos_u85.cpp | 260 | ||||
-rw-r--r-- | ethosu/regor/architecture/ethosu85/ethos_u85.hpp | 20 | ||||
-rw-r--r-- | ethosu/regor/common/shape.hpp | 21 | ||||
-rw-r--r-- | ethosu/regor/common/transpose_type.hpp | 18 |
4 files changed, 252 insertions, 67 deletions
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp index ac98d62e..3949b947 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp @@ -64,17 +64,20 @@ static const EthosU85PerfInfo s_EthosU85PerfInfo[] = { static const ArchEthosU85::AcceleratorConfig s_EthosU85Configs[] = { // Accelerator.Ethos_U85_128 - {128, 1, {Shape(1, 2, 8), Shape(1, 1, 16)}, 2, Shape(1, 2, 8), 8 * 1024, 8 * 1024, 2, 1, 0, &s_EthosU85PerfInfo[0]}, + {128, 1, {Shape(1, 2, 8), Shape(1, 1, 16)}, Shape(1, 2, 8), 2, 8192, 8192, 2048, 768, 1, 0, &s_EthosU85PerfInfo[0]}, // Accelerator.Ethos_U85_256 - {256, 1, {Shape(1, 2, 16), Shape(1, 4, 8), Shape(2, 2, 8)}, 3, Shape(2, 2, 8), 16 * 1024, 16 * 1024, 4, 1, 0, &s_EthosU85PerfInfo[0]}, + {256, 1, {Shape(1, 2, 16), Shape(1, 4, 8), Shape(2, 2, 8)}, Shape(2, 2, 8), 3, 16384, 16384, 2048, 1536, 1, 0, &s_EthosU85PerfInfo[0]}, // Accelerator.Ethos_U85_512 - {512, 2, {Shape(2, 2, 16), Shape(1, 4, 16)}, 2, Shape(2, 2, 16), 16 * 1024, 32 * 1024, 8, 1, 0, &s_EthosU85PerfInfo[1]}, - // Accelerator.Ethos_U85_1014 - {1024, 4, {Shape(2, 2, 32), Shape(1, 4, 32), Shape(2, 4, 16)}, 3, Shape(4, 2, 16), 16 * 1024, 64 * 1024, 16, 1, 1, &s_EthosU85PerfInfo[2]}, + {512, 2, {Shape(2, 2, 16), Shape(1, 4, 16)}, Shape(2, 2, 16), 2, 16384, 32768, 4096, 3072, 1, 0, &s_EthosU85PerfInfo[1]}, + // Accelerator.Ethos_U85_1024 + {1024, 4, {Shape(2, 2, 32), Shape(1, 4, 32), Shape(2, 4, 16)}, Shape(4, 2, 16), 3, 16384, 65536, 4096, 6144, 1, 1, &s_EthosU85PerfInfo[2]}, // Accelerator.Ethos_U85_2048 - {2048, 4, {Shape(2, 2, 64), Shape(1, 4, 64), Shape(4, 4, 16)}, 3, Shape(4, 4, 16), 32 * 1024, 128 * 1024, 32, 2, 1, &s_EthosU85PerfInfo[3]}, + {2048, 4, {Shape(2, 2, 64), Shape(1, 4, 64), Shape(4, 4, 16)}, Shape(4, 4, 16), 3, 32768, 131072, 8192, 12288, 2, 1, &s_EthosU85PerfInfo[3]}, }; +constexpr int CB_SLOTS = 6; +constexpr int BRICK_ELEMENTS = 16; + enum class 
ElementwiseUsage { No = 0, @@ -154,6 +157,8 @@ void ArchEthosU85::ApplyConfig(const AcceleratorConfig *cfg) // Internal memory _ifmRamSizeBytes = cfg->ifmRamSizeBytes; _accRamSizeBytes = cfg->accRamSizeBytes; + _cbRamSizeBytes = cfg->cbRamSizeBytes; + _obRamSizeBytes = cfg->obRamSizeBytes; _numAxiSramLog2 = cfg->numAxiSramLog2; _numAxiExtLog2 = cfg->numAxiExtLog2; @@ -693,22 +698,168 @@ Shape ArchEthosU85::FindUBlock(OpType opType, const ArchitectureConfigQuery &que return bestUblk; } +static int GranularScale(int range, int granule, double ratio) +{ + assert(granule > 0); + int granules = range / granule; + granules = std::max(int(granules * ratio), 1); + return granules * granule; +} + +static Shape FitVolumeByAspect( + const Shape &shape, int fitVolume, const Shape &granule, const Shape &limit, const std::array<int, 3> &priority) +{ + LOG_TRACE2("FitVolumeByAspect: {} into {} granule {}, limit {}\n", shape.ToString(), fitVolume, granule.ToString(), + limit.ToString()); + LOG_INDENT(Logging::Out); + assert(shape.Size() >= 3); + assert(priority[0] + priority[1] + priority[2] == -6); // Simple axis presence check + + // Extract axes by their priority + int primary = shape[priority[0]]; + int secondary = shape[priority[1]]; + int tertiary = shape[priority[2]]; + int pgranule = granule[priority[0]]; + int sgranule = granule[priority[1]]; + int tgranule = granule[priority[2]]; + int plimit = limit[priority[0]]; + int slimit = limit[priority[1]]; + int tlimit = limit[priority[2]]; + + // Fit a roughly aspect-correct 'shape' into 'fitVolume' + int sval, tval; + int pval = std::clamp(primary, 1, plimit); + pval = RoundAway(pval, pgranule); + + // Planar area (depends on chosen axes) + int area = std::max(fitVolume / pval, 1); + assert(secondary > 0); + double aspect = double(tertiary) / secondary; + + // Casting to int rounds down, making tval the smallest axis + tval = int(std::sqrt(area * aspect)); + tval = std::clamp(tval, 1, tlimit); + + // Divide before rounding 
tval to push sval upwards + sval = area / tval; + sval = std::clamp(sval, 1, slimit); + + // Round to granule + sval = RoundAway(sval, sgranule); + tval = RoundAway(tval, tgranule); + + Shape result(1, 1, 1); + result[priority[0]] = pval; + result[priority[1]] = sval; + result[priority[2]] = tval; + + LOG_TRACE2("Pre-fitted shape: {}\n", result.ToString()); + + // The result MUST NOT exceed the requested volume + // TODO: Crude loop - WORKS but needs improvement + int elements = pval * sval * tval; + while ( elements > fitVolume ) + { + double ratio = double(fitVolume) / elements; + if ( tval > tgranule ) + { + tval = GranularScale(tval, tgranule, ratio); + } + else if ( sval > sgranule ) + { + sval = GranularScale(sval, sgranule, ratio); + } + else if ( pval > pgranule ) + { + pval = GranularScale(pval, pgranule, ratio); + } + else break; // Give up + elements = pval * sval * tval; + } + + result[priority[0]] = pval; + result[priority[1]] = sval; + result[priority[2]] = tval; + LOG_TRACE2("Fitted Shape: {}\n", result.ToString()); + return result; +} + +Shape ArchEthosU85::FindElementwiseConfig(const ArchitectureConfigQuery &query, const FindConfigCommon &common) +{ + LOG_TRACE2("Elementwise OFM {}\n", query.ofmShape.ToString()); + LOG_INDENT(Logging::Out); + assert(query.ifmBits > 0); + const Shape ofmShape = Shape::PadAxes(Shape::RoundAway(query.ofmShape, common.ublock), 3, 1); + Shape ofmBlockLimit = Shape::Min(ofmShape, common.ofmBlockMax); + + // Default to width/depth for HCWB16 + std::array<int, 3> axisPriority{-2, -1, -3}; + + const bool isScalar = (query.ifmShape[0].Elements() == 1) || (query.ifmShape[1] && query.ifmShape[1].Elements() == 1); + // Binary elementwise, potentially broadcast + if ( !isScalar && query.ifmShape[1] ) + { + const int cbBricks = (_cbRamSizeBytes / CB_SLOTS) / (BRICK_ELEMENTS * (query.ifmBits / 8)); + unsigned broadcastMask = query.ifmShape[0].LessMask(query.ofmShape); + broadcastMask |= query.ifmShape[1].LessMask(query.ofmShape); 
+ // Broadcast in depth first + if ( broadcastMask & 1 ) + { + int hLimit = common.ublock.Height(); + int wLimit = cbBricks / common.ublock.Height(); + while ( (wLimit > common.ublock.Width()) && (wLimit > ofmBlockLimit.Width() / 2) && (hLimit < ofmBlockLimit.Height()) ) + { + wLimit = wLimit / 2; + hLimit = hLimit * 2; + } + + return Shape(hLimit, wLimit, RoundAway(ofmBlockLimit.Depth(), common.ublock.Depth())); + } + // Broadcast in width first + else if ( broadcastMask & 2 ) + { + axisPriority = {-2, -1, -3}; + } + // Broadcast in height first + else if ( broadcastMask & 4 ) + { + int cLimit = common.granule.Depth(); + int wLimit = cbBricks; + + while ( (wLimit > common.ublock.Width()) && (wLimit > ofmBlockLimit.Width() / 2) && (cLimit < ofmBlockLimit.Depth()) ) + { + wLimit = wLimit / 2; + cLimit = cLimit * 2; + } + + return Shape(ofmBlockLimit.Height(), wLimit, cLimit); + } + } + + // As long as the output buffer is kept filled we will fill the pipeline. We size for + // at least this many elements which ultimately affects HW striping. + const int minOfmElements = 2 * (_obRamSizeBytes * 8) / query.ifmBits; + // Fit the ofmShape into the available elements with granule and limit constraints. 
+ Shape ofmBlock = FitVolumeByAspect(ofmShape, minOfmElements, common.granule, ofmBlockLimit, axisPriority); + ofmBlock = Shape::RoundAway(ofmBlock, common.granule); + LOG_TRACE2("Elementwise choice: ofmBlock = {}\n", ofmBlock.ToString()); + return ofmBlock; +} + std::unique_ptr<ArchitectureOpConfig> ArchEthosU85::FindBlockConfig(OpType opType, const ArchitectureConfigQuery &query) { + constexpr int OFMSplitDepth = 16; // Specific to this architecture assert(query.ifmBits > 0 && (query.ifmBits <= 32 || (query.ifmBits == 64 && opType == OpType::Rescale))); assert(query.ofmShape.Size() > 2 && "Insufficient dimensions to search for block config"); assert(query.kernel != nullptr); - if ( !SupportsAccumulatorMode(query.accSource, query.accOutputEnabled) ) return nullptr; - - const int OFMSplitDepth = 16; // Specific to this architecture + EthosU85NpuOp npuOp = GetHWOp(opType); + assert(npuOp != EthosU85NpuOp::None); + if ( npuOp == EthosU85NpuOp::Dma ) return nullptr; // DMA ops don't use block config // Elementwise larger-volume correction const Shape &ifmShape = (query.ifmShape[1].Elements() > query.ifmShape[0].Elements()) ? 
query.ifmShape[1] : query.ifmShape[0]; - EthosU85NpuOp npuOp = GetHWOp(opType); - assert(npuOp != EthosU85NpuOp::None); - // Operator typing help bool isPooling = npuOp == EthosU85NpuOp::Pooling || npuOp == EthosU85NpuOp::ReduceSum; bool isReduceSum = npuOp == EthosU85NpuOp::ReduceSum; @@ -716,32 +867,18 @@ std::unique_ptr<ArchitectureOpConfig> ArchEthosU85::FindBlockConfig(OpType opTyp bool isElementwise = npuOp == EthosU85NpuOp::Elementwise; bool isConvolution = npuOp == EthosU85NpuOp::Convolution || npuOp == EthosU85NpuOp::Depthwise; bool isResize = npuOp == EthosU85NpuOp::Resize; - bool isDma = npuOp == EthosU85NpuOp::Dma; bool isPartKernel = isConvolution && ChooseKernelMethod(ifmShape, query.ifmBits, query.kernel); bool isEqualDepthOp = isElementwise || (isPooling && !isReduceSum) || isDepthwise || isResize; - if ( isDma ) - { - // DMA ops doesn't use block config - return nullptr; - } - - // Operator configuration to be returned - auto config = std::make_unique<EthosU85OpConfig>(); - EthosU85Traversal traversal = isDepthwise ? EthosU85Traversal::Depthwise : (isPartKernel ? 
EthosU85Traversal::PartKernel : EthosU85Traversal::DepthFirst); // Accumulator settings EthosU85Accumulator accType = EthosU85Accumulator::Acc32; - if ( query.ifmBits == 16 && (!isPooling || isReduceSum) && query.scaled ) + if ( (query.ifmBits == 16 && (!isPooling || isReduceSum) && query.scaled) || // Normal 16-bit selection + (query.ifmBits >= 32) ) // Special case for Rescale int48 { accType = EthosU85Accumulator::Acc48; } - else if ( query.ifmBits == 64 && opType == OpType::Rescale ) - { - // Special case for Rescale int48 - accType = EthosU85Accumulator::Acc48; - } int accBits = AccumulatorBits(accType); int rounding; @@ -749,7 +886,7 @@ std::unique_ptr<ArchitectureOpConfig> ArchEthosU85::FindBlockConfig(OpType opTyp int numBlocksInRam = 2; const Shape ofmUBlock = FindUBlock(opType, query); - if ( ofmUBlock == Shape() ) + if ( !ofmUBlock ) { // no valid ofm microblock found LOG_WARN("Could not find a valid OFM microblock for {} with {}-bit input.\n", OpTypeToString(opType), query.ifmBits); @@ -779,31 +916,45 @@ std::unique_ptr<ArchitectureOpConfig> ArchEthosU85::FindBlockConfig(OpType opTyp ifmBlockDepth = 64; } - // Weights fetch (for operators that have them) - int weightFetchWH = isConvolution ? query.kernel->Size().AreaXY() : 0; - - int ofmUBlockDepth = ofmUBlock.Depth(); - // When using brick format and certain transposes, there are additional constraints to the block size, so we must // extend the search space to be able to find a valid block size. 
- Shape ofmBlockMin = Shape(0, 0, 0); + Shape ofmBlockGranule = ofmUBlock; if ( query.ofmFormat == TensorFormat::NHCWB16 ) { - switch ( query.transpose ) - { - case TransposeType::NCHW: - case TransposeType::NHCW: - ofmBlockMin = ofmBlockMin.WithWidth(16); - break; - case TransposeType::NCWH: - case TransposeType::NWCH: - ofmBlockMin = ofmBlockMin.WithHeight(16); - break; - default: - break; - } + if ( (query.transpose & TransposeType::MaskC) == TransposeType::W ) ofmBlockGranule[-2] = 16; + if ( (query.transpose & TransposeType::MaskC) == TransposeType::H ) ofmBlockGranule[-3] = 16; + } + if ( query.ofmShape.Depth() >= 16 ) ofmBlockGranule[-1] = 16; + + // Operator configuration to be returned + auto config = std::make_unique<EthosU85OpConfig>(); + config->_ofmUBlock = ofmUBlock; + config->_accumulatorType = accType; + config->_accumulatorSource = query.accSource; + config->_accumulatorOutputEnabled = query.accOutputEnabled; + config->_ifmRamSizeBytes = _ifmRamSizeBytes; + config->_traversal = traversal; + + // Common constant vars + FindConfigCommon common; + common.ofmBlockMax = _ofmBlockMax.Untranspose(Reduce4To3(query.transpose)); + common.ublock = ofmUBlock; + common.granule = ofmBlockGranule; + common.accBits = AccumulatorBits(accType); + + if ( isElementwise ) + { + config->_ofmBlock = FindElementwiseConfig(query, common); + config->_ifmBlock = config->_ofmBlock; + return config; } - Shape searchSpaceStep = Shape::Max(ofmUBlock, ofmBlockMin); + + // Weights fetch (for operators that have them) + int weightFetchWH = isConvolution ? 
query.kernel->Size().AreaXY() : 0; + + int ofmUBlockDepth = ofmUBlock.Depth(); + + Shape searchSpaceStep = Shape::Max(ofmUBlock, ofmBlockGranule); Shape ofmBlockMaxTp = _ofmBlockMax.Untranspose(Reduce4To3(query.transpose)); Shape searchSpaceEnd = Shape::RoundAway(Shape::Max(Shape::Min(query.ofmShape, ofmBlockMaxTp), searchSpaceStep), ofmUBlock); @@ -891,8 +1042,7 @@ std::unique_ptr<ArchitectureOpConfig> ArchEthosU85::FindBlockConfig(OpType opTyp } // Scale relative to every output OFM element - float relativeCost = - (isElementwise || isResize) ? float(ofmElements) / (float(height) * width * depth) : (ifmFetch + weightFetch) / float(ofmElements); + float relativeCost = (isResize) ? float(ofmElements) / (float(height) * width * depth) : (ifmFetch + weightFetch) / float(ofmElements); // If the entire IFM can be encompassed by both buffers, bias to prefer this configuration if ( ifmShape.Elements() < ifmBlock.Elements() * 2 ) @@ -992,12 +1142,8 @@ int ArchEthosU85::CalcResizeMaxOfmBlockWidth(int ifmBits, int scaleN, int scaleD // the IFM block to fit in the chaining buffer assert(scaleN > 0); assert(scaleD > 0); - int numIfmCbSlots = _macs / 16; - if ( ifmBits == 16 ) - { - numIfmCbSlots /= 2; - } - int maxOfmBlkW = int(std::ceil(((numIfmCbSlots - 2) * scaleN + 1) / double(scaleD))); + const int cbBricks = (_cbRamSizeBytes / CB_SLOTS) / (BRICK_ELEMENTS * (ifmBits / 8)); + int maxOfmBlkW = int(std::ceil(((cbBricks - 2) * scaleN + 1) / double(scaleD))); maxOfmBlkW = std::max(1, std::min(maxOfmBlkW, _ofmBlockMax.Width())); return maxOfmBlkW; } diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85.hpp index 831904ec..b9986a17 100644 --- a/ethosu/regor/architecture/ethosu85/ethos_u85.hpp +++ b/ethosu/regor/architecture/ethosu85/ethos_u85.hpp @@ -135,13 +135,14 @@ public: int macs; int cores; std::array<Shape, 3> ofmUBlocks; - int nOfmUBlocks; Shape ifmUBlock; + int nOfmUBlocks; int ifmRamSizeBytes; int accRamSizeBytes; 
- int elemUnits; - int numAxiSramLog2; - int numAxiExtLog2; + int obRamSizeBytes; + int cbRamSizeBytes; + uint8_t numAxiSramLog2; + uint8_t numAxiExtLog2; const EthosU85PerfInfo *perfInfo; }; @@ -160,6 +161,8 @@ private: std::array<std::array<Shape, 3>, 3> _uBlockToIfmAuTable{}; Shape _ifmUBlock; int _ifmRamSizeBytes = 0; + int _cbRamSizeBytes = 0; + int _obRamSizeBytes = 0; int _accRamSizeBytes = 0; int _numAxiSramLog2 = 0; int _numAxiExtLog2 = 0; @@ -202,6 +205,15 @@ public: protected: void ApplyConfig(const AcceleratorConfig *cfg); + struct FindConfigCommon + { + Shape ofmBlockMax; + Shape granule; + Shape ublock; + int accBits; + }; + + Shape FindElementwiseConfig(const ArchitectureConfigQuery &query, const FindConfigCommon &common); std::unique_ptr<ArchitectureOpConfig> FindBlockConfig(OpType opType, const ArchitectureConfigQuery &query); bool TryBlockConfig(EthosU85NpuOp npuOp, const Shape &ofmBlock, const Shape &ifmBlock, const Shape &ifmShape, diff --git a/ethosu/regor/common/shape.hpp b/ethosu/regor/common/shape.hpp index 60e673aa..616e9cb8 100644 --- a/ethosu/regor/common/shape.hpp +++ b/ethosu/regor/common/shape.hpp @@ -497,6 +497,12 @@ public: return result; } + unsigned LessMask(const Shape &other) const { return MinAxisFunc<std::less<int32_t>>(*this, other); } + + unsigned GreaterMask(const Shape &other) const { return MinAxisFunc<std::greater<int32_t>>(*this, other); } + + unsigned EqualMask(const Shape &other) const { return MinAxisFunc<std::equal_to<int32_t>>(*this, other); } + bool IsValid() const { return _last >= 0; } bool IsDynamic() const { return _dynamic; } @@ -594,6 +600,21 @@ private: int ToOffset(int index) const { return (index < 0) ? 
(-index - 1) : (_last - index); } + template<typename FUNC> + static unsigned MinAxisFunc(const Shape &a, const Shape &b) + { + int size = std::min(a.Size(), b.Size()); + assert(size < 32); + auto *pa = a.Storage(); + auto *pb = b.Storage(); + unsigned axisMask = 0; + for ( int i = 0; i < size; i++ ) + { + if ( FUNC()(pa[i], pb[i]) ) axisMask |= 1 << i; + } + return axisMask; + } + // Apply a function to the minimum number of axes between two shapes. template<typename FUNC> static Shape MinFunc(const Shape &a, const Shape &b) diff --git a/ethosu/regor/common/transpose_type.hpp b/ethosu/regor/common/transpose_type.hpp index 58e4a3b1..e873d81f 100644 --- a/ethosu/regor/common/transpose_type.hpp +++ b/ethosu/regor/common/transpose_type.hpp @@ -21,6 +21,10 @@ enum class TransposeType : uint32_t { + H = 0x1, + W = 0x2, + C = 0x3, + MaskC = 0xF, NHWC = 0x0123, NWHC = 0x0213, NHCW = 0x0132, @@ -35,13 +39,15 @@ inline constexpr TransposeType operator>>(TransposeType type, uint32_t size) return TransposeType(uint32_t(type) >> size); } -inline bool IsNone(TransposeType type) +inline constexpr TransposeType operator&(TransposeType a, TransposeType b) { - for ( int p = 0; p < 32; p += 4 ) - { - if ( type == (TransposeType::None >> p) ) return true; - } - return false; + return TransposeType(uint32_t(a) & uint32_t(b)); +} + +inline constexpr bool IsNone(TransposeType type) +{ + uint32_t offset = (7u - (uint32_t(type) & 7u)) * 4; + return uint32_t(TransposeType::None) >> offset == uint32_t(type); } // Reduce a 4D transpose mask to a 3D transpose mask (f.ex. 0x0123 -> 0x012) |