about summary refs log tree commit diff
diff options
context:
space:
mode:
author	Philip Hall <philip.hall@arm.com>	2024-04-24 14:04:03 +0100
committer	tim.hall <tim.hall@arm.com>	2024-05-28 14:48:29 +0000
commit	254b29f9deb79c07afe6a0fa15cd20d7cad99693 (patch)
tree	c52bd7a05ebb94f0139586c4857238025fe7cf95
parent	8d303886fc6f29234d75aa78eabe853b7604177d (diff)
download	ethos-u-vela-dev/ethos_u85.tar.gz
MLBEDSW-8883: Block config selection for elementwise	(branch: dev/ethos_u85)

- Elementwise operations are largely immune to block configuration
  settings unless broadcasting. This commit chooses more optimal
  blocks for each of the supported broadcast dimensions.

Signed-off-by: Philip Hall <philip.hall@arm.com>
Change-Id: Idbf51b5b892aa6be93b31eb17956612b682a1492
-rw-r--r--	ethosu/regor/architecture/ethosu85/ethos_u85.cpp	260
-rw-r--r--	ethosu/regor/architecture/ethosu85/ethos_u85.hpp	20
-rw-r--r--	ethosu/regor/common/shape.hpp	21
-rw-r--r--	ethosu/regor/common/transpose_type.hpp	18
4 files changed, 252 insertions, 67 deletions
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
index ac98d62e..3949b947 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp
@@ -64,17 +64,20 @@ static const EthosU85PerfInfo s_EthosU85PerfInfo[] = {
static const ArchEthosU85::AcceleratorConfig s_EthosU85Configs[] = {
// Accelerator.Ethos_U85_128
- {128, 1, {Shape(1, 2, 8), Shape(1, 1, 16)}, 2, Shape(1, 2, 8), 8 * 1024, 8 * 1024, 2, 1, 0, &s_EthosU85PerfInfo[0]},
+ {128, 1, {Shape(1, 2, 8), Shape(1, 1, 16)}, Shape(1, 2, 8), 2, 8192, 8192, 2048, 768, 1, 0, &s_EthosU85PerfInfo[0]},
// Accelerator.Ethos_U85_256
- {256, 1, {Shape(1, 2, 16), Shape(1, 4, 8), Shape(2, 2, 8)}, 3, Shape(2, 2, 8), 16 * 1024, 16 * 1024, 4, 1, 0, &s_EthosU85PerfInfo[0]},
+ {256, 1, {Shape(1, 2, 16), Shape(1, 4, 8), Shape(2, 2, 8)}, Shape(2, 2, 8), 3, 16384, 16384, 2048, 1536, 1, 0, &s_EthosU85PerfInfo[0]},
// Accelerator.Ethos_U85_512
- {512, 2, {Shape(2, 2, 16), Shape(1, 4, 16)}, 2, Shape(2, 2, 16), 16 * 1024, 32 * 1024, 8, 1, 0, &s_EthosU85PerfInfo[1]},
- // Accelerator.Ethos_U85_1014
- {1024, 4, {Shape(2, 2, 32), Shape(1, 4, 32), Shape(2, 4, 16)}, 3, Shape(4, 2, 16), 16 * 1024, 64 * 1024, 16, 1, 1, &s_EthosU85PerfInfo[2]},
+ {512, 2, {Shape(2, 2, 16), Shape(1, 4, 16)}, Shape(2, 2, 16), 2, 16384, 32768, 4096, 3072, 1, 0, &s_EthosU85PerfInfo[1]},
+ // Accelerator.Ethos_U85_1024
+ {1024, 4, {Shape(2, 2, 32), Shape(1, 4, 32), Shape(2, 4, 16)}, Shape(4, 2, 16), 3, 16384, 65536, 4096, 6144, 1, 1, &s_EthosU85PerfInfo[2]},
// Accelerator.Ethos_U85_2048
- {2048, 4, {Shape(2, 2, 64), Shape(1, 4, 64), Shape(4, 4, 16)}, 3, Shape(4, 4, 16), 32 * 1024, 128 * 1024, 32, 2, 1, &s_EthosU85PerfInfo[3]},
+ {2048, 4, {Shape(2, 2, 64), Shape(1, 4, 64), Shape(4, 4, 16)}, Shape(4, 4, 16), 3, 32768, 131072, 8192, 12288, 2, 1, &s_EthosU85PerfInfo[3]},
};
+constexpr int CB_SLOTS = 6;
+constexpr int BRICK_ELEMENTS = 16;
+
enum class ElementwiseUsage
{
No = 0,
@@ -154,6 +157,8 @@ void ArchEthosU85::ApplyConfig(const AcceleratorConfig *cfg)
// Internal memory
_ifmRamSizeBytes = cfg->ifmRamSizeBytes;
_accRamSizeBytes = cfg->accRamSizeBytes;
+ _cbRamSizeBytes = cfg->cbRamSizeBytes;
+ _obRamSizeBytes = cfg->obRamSizeBytes;
_numAxiSramLog2 = cfg->numAxiSramLog2;
_numAxiExtLog2 = cfg->numAxiExtLog2;
@@ -693,22 +698,168 @@ Shape ArchEthosU85::FindUBlock(OpType opType, const ArchitectureConfigQuery &que
return bestUblk;
}
+static int GranularScale(int range, int granule, double ratio)
+{
+ assert(granule > 0);
+ int granules = range / granule;
+ granules = std::max(int(granules * ratio), 1);
+ return granules * granule;
+}
+
+static Shape FitVolumeByAspect(
+ const Shape &shape, int fitVolume, const Shape &granule, const Shape &limit, const std::array<int, 3> &priority)
+{
+ LOG_TRACE2("FitVolumeByAspect: {} into {} granule {}, limit {}\n", shape.ToString(), fitVolume, granule.ToString(),
+ limit.ToString());
+ LOG_INDENT(Logging::Out);
+ assert(shape.Size() >= 3);
+ assert(priority[0] + priority[1] + priority[2] == -6); // Simple axis presence check
+
+ // Extract axes by their priority
+ int primary = shape[priority[0]];
+ int secondary = shape[priority[1]];
+ int tertiary = shape[priority[2]];
+ int pgranule = granule[priority[0]];
+ int sgranule = granule[priority[1]];
+ int tgranule = granule[priority[2]];
+ int plimit = limit[priority[0]];
+ int slimit = limit[priority[1]];
+ int tlimit = limit[priority[2]];
+
+ // Fit a roughly aspect-correct 'shape' into 'fitVolume'
+ int sval, tval;
+ int pval = std::clamp(primary, 1, plimit);
+ pval = RoundAway(pval, pgranule);
+
+ // Planar area (depends on chosen axes)
+ int area = std::max(fitVolume / pval, 1);
+ assert(secondary > 0);
+ double aspect = double(tertiary) / secondary;
+
+ // Casting to int rounds down, making tval the smallest axis
+ tval = int(std::sqrt(area * aspect));
+ tval = std::clamp(tval, 1, tlimit);
+
+ // Divide before rounding tval to push sval upwards
+ sval = area / tval;
+ sval = std::clamp(sval, 1, slimit);
+
+ // Round to granule
+ sval = RoundAway(sval, sgranule);
+ tval = RoundAway(tval, tgranule);
+
+ Shape result(1, 1, 1);
+ result[priority[0]] = pval;
+ result[priority[1]] = sval;
+ result[priority[2]] = tval;
+
+ LOG_TRACE2("Pre-fitted shape: {}\n", result.ToString());
+
+ // The result MUST NOT exceed the requested volume
+ // TODO: Crude loop - WORKS but needs improvement
+ int elements = pval * sval * tval;
+ while ( elements > fitVolume )
+ {
+ double ratio = double(fitVolume) / elements;
+ if ( tval > tgranule )
+ {
+ tval = GranularScale(tval, tgranule, ratio);
+ }
+ else if ( sval > sgranule )
+ {
+ sval = GranularScale(sval, sgranule, ratio);
+ }
+ else if ( pval > pgranule )
+ {
+ pval = GranularScale(pval, pgranule, ratio);
+ }
+ else break; // Give up
+ elements = pval * sval * tval;
+ }
+
+ result[priority[0]] = pval;
+ result[priority[1]] = sval;
+ result[priority[2]] = tval;
+ LOG_TRACE2("Fitted Shape: {}\n", result.ToString());
+ return result;
+}
+
+Shape ArchEthosU85::FindElementwiseConfig(const ArchitectureConfigQuery &query, const FindConfigCommon &common)
+{
+ LOG_TRACE2("Elementwise OFM {}\n", query.ofmShape.ToString());
+ LOG_INDENT(Logging::Out);
+ assert(query.ifmBits > 0);
+ const Shape ofmShape = Shape::PadAxes(Shape::RoundAway(query.ofmShape, common.ublock), 3, 1);
+ Shape ofmBlockLimit = Shape::Min(ofmShape, common.ofmBlockMax);
+
+ // Default to width/depth for HCWB16
+ std::array<int, 3> axisPriority{-2, -1, -3};
+
+ const bool isScalar = (query.ifmShape[0].Elements() == 1) || (query.ifmShape[1] && query.ifmShape[1].Elements() == 1);
+ // Binary elementwise, potentially broadcast
+ if ( !isScalar && query.ifmShape[1] )
+ {
+ const int cbBricks = (_cbRamSizeBytes / CB_SLOTS) / (BRICK_ELEMENTS * (query.ifmBits / 8));
+ unsigned broadcastMask = query.ifmShape[0].LessMask(query.ofmShape);
+ broadcastMask |= query.ifmShape[1].LessMask(query.ofmShape);
+ // Broadcast in depth first
+ if ( broadcastMask & 1 )
+ {
+ int hLimit = common.ublock.Height();
+ int wLimit = cbBricks / common.ublock.Height();
+ while ( (wLimit > common.ublock.Width()) && (wLimit > ofmBlockLimit.Width() / 2) && (hLimit < ofmBlockLimit.Height()) )
+ {
+ wLimit = wLimit / 2;
+ hLimit = hLimit * 2;
+ }
+
+ return Shape(hLimit, wLimit, RoundAway(ofmBlockLimit.Depth(), common.ublock.Depth()));
+ }
+ // Broadcast in width first
+ else if ( broadcastMask & 2 )
+ {
+ axisPriority = {-2, -1, -3};
+ }
+ // Broadcast in height first
+ else if ( broadcastMask & 4 )
+ {
+ int cLimit = common.granule.Depth();
+ int wLimit = cbBricks;
+
+ while ( (wLimit > common.ublock.Width()) && (wLimit > ofmBlockLimit.Width() / 2) && (cLimit < ofmBlockLimit.Depth()) )
+ {
+ wLimit = wLimit / 2;
+ cLimit = cLimit * 2;
+ }
+
+ return Shape(ofmBlockLimit.Height(), wLimit, cLimit);
+ }
+ }
+
+ // As long as the output buffer is kept filled we will fill the pipeline. We size for
+ // at least this many elements which ultimately affects HW striping.
+ const int minOfmElements = 2 * (_obRamSizeBytes * 8) / query.ifmBits;
+ // Fit the ofmShape into the available elements with granule and limit constraints.
+ Shape ofmBlock = FitVolumeByAspect(ofmShape, minOfmElements, common.granule, ofmBlockLimit, axisPriority);
+ ofmBlock = Shape::RoundAway(ofmBlock, common.granule);
+ LOG_TRACE2("Elementwise choice: ofmBlock = {}\n", ofmBlock.ToString());
+ return ofmBlock;
+}
+
std::unique_ptr<ArchitectureOpConfig> ArchEthosU85::FindBlockConfig(OpType opType, const ArchitectureConfigQuery &query)
{
+ constexpr int OFMSplitDepth = 16; // Specific to this architecture
assert(query.ifmBits > 0 && (query.ifmBits <= 32 || (query.ifmBits == 64 && opType == OpType::Rescale)));
assert(query.ofmShape.Size() > 2 && "Insufficient dimensions to search for block config");
assert(query.kernel != nullptr);
- if ( !SupportsAccumulatorMode(query.accSource, query.accOutputEnabled) ) return nullptr;
-
- const int OFMSplitDepth = 16; // Specific to this architecture
+ EthosU85NpuOp npuOp = GetHWOp(opType);
+ assert(npuOp != EthosU85NpuOp::None);
+ if ( npuOp == EthosU85NpuOp::Dma ) return nullptr; // DMA ops don't use block config
// Elementwise larger-volume correction
const Shape &ifmShape = (query.ifmShape[1].Elements() > query.ifmShape[0].Elements()) ? query.ifmShape[1] : query.ifmShape[0];
- EthosU85NpuOp npuOp = GetHWOp(opType);
- assert(npuOp != EthosU85NpuOp::None);
-
// Operator typing help
bool isPooling = npuOp == EthosU85NpuOp::Pooling || npuOp == EthosU85NpuOp::ReduceSum;
bool isReduceSum = npuOp == EthosU85NpuOp::ReduceSum;
@@ -716,32 +867,18 @@ std::unique_ptr<ArchitectureOpConfig> ArchEthosU85::FindBlockConfig(OpType opTyp
bool isElementwise = npuOp == EthosU85NpuOp::Elementwise;
bool isConvolution = npuOp == EthosU85NpuOp::Convolution || npuOp == EthosU85NpuOp::Depthwise;
bool isResize = npuOp == EthosU85NpuOp::Resize;
- bool isDma = npuOp == EthosU85NpuOp::Dma;
bool isPartKernel = isConvolution && ChooseKernelMethod(ifmShape, query.ifmBits, query.kernel);
bool isEqualDepthOp = isElementwise || (isPooling && !isReduceSum) || isDepthwise || isResize;
- if ( isDma )
- {
- // DMA ops doesn't use block config
- return nullptr;
- }
-
- // Operator configuration to be returned
- auto config = std::make_unique<EthosU85OpConfig>();
-
EthosU85Traversal traversal = isDepthwise ? EthosU85Traversal::Depthwise : (isPartKernel ? EthosU85Traversal::PartKernel : EthosU85Traversal::DepthFirst);
// Accumulator settings
EthosU85Accumulator accType = EthosU85Accumulator::Acc32;
- if ( query.ifmBits == 16 && (!isPooling || isReduceSum) && query.scaled )
+ if ( (query.ifmBits == 16 && (!isPooling || isReduceSum) && query.scaled) || // Normal 16-bit selection
+ (query.ifmBits >= 32) ) // Special case for Rescale int48
{
accType = EthosU85Accumulator::Acc48;
}
- else if ( query.ifmBits == 64 && opType == OpType::Rescale )
- {
- // Special case for Rescale int48
- accType = EthosU85Accumulator::Acc48;
- }
int accBits = AccumulatorBits(accType);
int rounding;
@@ -749,7 +886,7 @@ std::unique_ptr<ArchitectureOpConfig> ArchEthosU85::FindBlockConfig(OpType opTyp
int numBlocksInRam = 2;
const Shape ofmUBlock = FindUBlock(opType, query);
- if ( ofmUBlock == Shape() )
+ if ( !ofmUBlock )
{
// no valid ofm microblock found
LOG_WARN("Could not find a valid OFM microblock for {} with {}-bit input.\n", OpTypeToString(opType), query.ifmBits);
@@ -779,31 +916,45 @@ std::unique_ptr<ArchitectureOpConfig> ArchEthosU85::FindBlockConfig(OpType opTyp
ifmBlockDepth = 64;
}
- // Weights fetch (for operators that have them)
- int weightFetchWH = isConvolution ? query.kernel->Size().AreaXY() : 0;
-
- int ofmUBlockDepth = ofmUBlock.Depth();
-
// When using brick format and certain transposes, there are additional constraints to the block size, so we must
// extend the search space to be able to find a valid block size.
- Shape ofmBlockMin = Shape(0, 0, 0);
+ Shape ofmBlockGranule = ofmUBlock;
if ( query.ofmFormat == TensorFormat::NHCWB16 )
{
- switch ( query.transpose )
- {
- case TransposeType::NCHW:
- case TransposeType::NHCW:
- ofmBlockMin = ofmBlockMin.WithWidth(16);
- break;
- case TransposeType::NCWH:
- case TransposeType::NWCH:
- ofmBlockMin = ofmBlockMin.WithHeight(16);
- break;
- default:
- break;
- }
+ if ( (query.transpose & TransposeType::MaskC) == TransposeType::W ) ofmBlockGranule[-2] = 16;
+ if ( (query.transpose & TransposeType::MaskC) == TransposeType::H ) ofmBlockGranule[-3] = 16;
+ }
+ if ( query.ofmShape.Depth() >= 16 ) ofmBlockGranule[-1] = 16;
+
+ // Operator configuration to be returned
+ auto config = std::make_unique<EthosU85OpConfig>();
+ config->_ofmUBlock = ofmUBlock;
+ config->_accumulatorType = accType;
+ config->_accumulatorSource = query.accSource;
+ config->_accumulatorOutputEnabled = query.accOutputEnabled;
+ config->_ifmRamSizeBytes = _ifmRamSizeBytes;
+ config->_traversal = traversal;
+
+ // Common constant vars
+ FindConfigCommon common;
+ common.ofmBlockMax = _ofmBlockMax.Untranspose(Reduce4To3(query.transpose));
+ common.ublock = ofmUBlock;
+ common.granule = ofmBlockGranule;
+ common.accBits = AccumulatorBits(accType);
+
+ if ( isElementwise )
+ {
+ config->_ofmBlock = FindElementwiseConfig(query, common);
+ config->_ifmBlock = config->_ofmBlock;
+ return config;
}
- Shape searchSpaceStep = Shape::Max(ofmUBlock, ofmBlockMin);
+
+ // Weights fetch (for operators that have them)
+ int weightFetchWH = isConvolution ? query.kernel->Size().AreaXY() : 0;
+
+ int ofmUBlockDepth = ofmUBlock.Depth();
+
+ Shape searchSpaceStep = Shape::Max(ofmUBlock, ofmBlockGranule);
Shape ofmBlockMaxTp = _ofmBlockMax.Untranspose(Reduce4To3(query.transpose));
Shape searchSpaceEnd = Shape::RoundAway(Shape::Max(Shape::Min(query.ofmShape, ofmBlockMaxTp), searchSpaceStep), ofmUBlock);
@@ -891,8 +1042,7 @@ std::unique_ptr<ArchitectureOpConfig> ArchEthosU85::FindBlockConfig(OpType opTyp
}
// Scale relative to every output OFM element
- float relativeCost =
- (isElementwise || isResize) ? float(ofmElements) / (float(height) * width * depth) : (ifmFetch + weightFetch) / float(ofmElements);
+ float relativeCost = (isResize) ? float(ofmElements) / (float(height) * width * depth) : (ifmFetch + weightFetch) / float(ofmElements);
// If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
if ( ifmShape.Elements() < ifmBlock.Elements() * 2 )
@@ -992,12 +1142,8 @@ int ArchEthosU85::CalcResizeMaxOfmBlockWidth(int ifmBits, int scaleN, int scaleD
// the IFM block to fit in the chaining buffer
assert(scaleN > 0);
assert(scaleD > 0);
- int numIfmCbSlots = _macs / 16;
- if ( ifmBits == 16 )
- {
- numIfmCbSlots /= 2;
- }
- int maxOfmBlkW = int(std::ceil(((numIfmCbSlots - 2) * scaleN + 1) / double(scaleD)));
+ const int cbBricks = (_cbRamSizeBytes / CB_SLOTS) / (BRICK_ELEMENTS * (ifmBits / 8));
+ int maxOfmBlkW = int(std::ceil(((cbBricks - 2) * scaleN + 1) / double(scaleD)));
maxOfmBlkW = std::max(1, std::min(maxOfmBlkW, _ofmBlockMax.Width()));
return maxOfmBlkW;
}
diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85.hpp
index 831904ec..b9986a17 100644
--- a/ethosu/regor/architecture/ethosu85/ethos_u85.hpp
+++ b/ethosu/regor/architecture/ethosu85/ethos_u85.hpp
@@ -135,13 +135,14 @@ public:
int macs;
int cores;
std::array<Shape, 3> ofmUBlocks;
- int nOfmUBlocks;
Shape ifmUBlock;
+ int nOfmUBlocks;
int ifmRamSizeBytes;
int accRamSizeBytes;
- int elemUnits;
- int numAxiSramLog2;
- int numAxiExtLog2;
+ int obRamSizeBytes;
+ int cbRamSizeBytes;
+ uint8_t numAxiSramLog2;
+ uint8_t numAxiExtLog2;
const EthosU85PerfInfo *perfInfo;
};
@@ -160,6 +161,8 @@ private:
std::array<std::array<Shape, 3>, 3> _uBlockToIfmAuTable{};
Shape _ifmUBlock;
int _ifmRamSizeBytes = 0;
+ int _cbRamSizeBytes = 0;
+ int _obRamSizeBytes = 0;
int _accRamSizeBytes = 0;
int _numAxiSramLog2 = 0;
int _numAxiExtLog2 = 0;
@@ -202,6 +205,15 @@ public:
protected:
void ApplyConfig(const AcceleratorConfig *cfg);
+ struct FindConfigCommon
+ {
+ Shape ofmBlockMax;
+ Shape granule;
+ Shape ublock;
+ int accBits;
+ };
+
+ Shape FindElementwiseConfig(const ArchitectureConfigQuery &query, const FindConfigCommon &common);
std::unique_ptr<ArchitectureOpConfig> FindBlockConfig(OpType opType, const ArchitectureConfigQuery &query);
bool TryBlockConfig(EthosU85NpuOp npuOp, const Shape &ofmBlock, const Shape &ifmBlock, const Shape &ifmShape,
diff --git a/ethosu/regor/common/shape.hpp b/ethosu/regor/common/shape.hpp
index 60e673aa..616e9cb8 100644
--- a/ethosu/regor/common/shape.hpp
+++ b/ethosu/regor/common/shape.hpp
@@ -497,6 +497,12 @@ public:
return result;
}
+ unsigned LessMask(const Shape &other) const { return MinAxisFunc<std::less<int32_t>>(*this, other); }
+
+ unsigned GreaterMask(const Shape &other) const { return MinAxisFunc<std::greater<int32_t>>(*this, other); }
+
+ unsigned EqualMask(const Shape &other) const { return MinAxisFunc<std::equal_to<int32_t>>(*this, other); }
+
bool IsValid() const { return _last >= 0; }
bool IsDynamic() const { return _dynamic; }
@@ -594,6 +600,21 @@ private:
int ToOffset(int index) const { return (index < 0) ? (-index - 1) : (_last - index); }
+ template<typename FUNC>
+ static unsigned MinAxisFunc(const Shape &a, const Shape &b)
+ {
+ int size = std::min(a.Size(), b.Size());
+ assert(size < 32);
+ auto *pa = a.Storage();
+ auto *pb = b.Storage();
+ unsigned axisMask = 0;
+ for ( int i = 0; i < size; i++ )
+ {
+ if ( FUNC()(pa[i], pb[i]) ) axisMask |= 1 << i;
+ }
+ return axisMask;
+ }
+
// Apply a function to the minimum number of axes between two shapes.
template<typename FUNC>
static Shape MinFunc(const Shape &a, const Shape &b)
diff --git a/ethosu/regor/common/transpose_type.hpp b/ethosu/regor/common/transpose_type.hpp
index 58e4a3b1..e873d81f 100644
--- a/ethosu/regor/common/transpose_type.hpp
+++ b/ethosu/regor/common/transpose_type.hpp
@@ -21,6 +21,10 @@
enum class TransposeType : uint32_t
{
+ H = 0x1,
+ W = 0x2,
+ C = 0x3,
+ MaskC = 0xF,
NHWC = 0x0123,
NWHC = 0x0213,
NHCW = 0x0132,
@@ -35,13 +39,15 @@ inline constexpr TransposeType operator>>(TransposeType type, uint32_t size)
return TransposeType(uint32_t(type) >> size);
}
-inline bool IsNone(TransposeType type)
+inline constexpr TransposeType operator&(TransposeType a, TransposeType b)
{
- for ( int p = 0; p < 32; p += 4 )
- {
- if ( type == (TransposeType::None >> p) ) return true;
- }
- return false;
+ return TransposeType(uint32_t(a) & uint32_t(b));
+}
+
+inline constexpr bool IsNone(TransposeType type)
+{
+ uint32_t offset = (7u - (uint32_t(type) & 7u)) * 4;
+ return uint32_t(TransposeType::None) >> offset == uint32_t(type);
}
// Reduce a 4D transpose mask to a 3D transpose mask (f.ex. 0x0123 -> 0x012)