From 67e11f7bce40d72e0dda97cf658a3c3ee600c1eb Mon Sep 17 00:00:00 2001
From: Mauricio Briceno
Date: Wed, 5 May 2021 12:47:28 +0200
Subject: weight_compressor: added mlw_reorder_encode

- Moves reordering to C
- Runtime for encoding weights is greatly reduced

Change-Id: Ifff01e7b1ea6d5cec68310a155c3b80aa1a38545
Signed-off-by: Mauricio Briceno
---
 ethosu/mlw_codec/mlw_codecmodule.c | 145 ++++++++++++++++++--
 ethosu/mlw_codec/mlw_encode.c      | 267 ++++++++++++++++++++++++++++++++++++-
 ethosu/vela/weight_compressor.py   | 130 ++----------------
 setup.py                           |   2 +
 4 files changed, 416 insertions(+), 128 deletions(-)

diff --git a/ethosu/mlw_codec/mlw_codecmodule.c b/ethosu/mlw_codec/mlw_codecmodule.c
index 2c2fba2c..1e13dd22 100644
--- a/ethosu/mlw_codec/mlw_codecmodule.c
+++ b/ethosu/mlw_codec/mlw_codecmodule.c
@@ -18,10 +18,137 @@
 #define PY_SSIZE_T_CLEAN
 #include <Python.h>
+#include <numpy/arrayobject.h>
 #include "mlw_decode.h"
 #include "mlw_encode.h"
 
+/* C extension wrapper for mlw_reorder_encode
+ *
+ * This method is exposed directly in python with a prototype of the
+ * form:
+ *
+ * output = mlw_codec.reorder_encode(
+ *     ifm_ublock_depth,
+ *     ofm_ublock_depth,
+ *     input,
+ *     ofm_block_depth,
+ *     is_depthwise,
+ *     is_partkernel,
+ *     ifm_bitdepth,
+ *     decomp_h,
+ *     decomp_w,
+ *     verbose=0)
+ *
+ * output: (bytearray, int) tuple of the encoded stream and the padded
+ * number of source weights
+ */
+
+static PyObject *
+method_reorder_encode (PyObject *self, PyObject *args)
+{
+    /* Objects to hold the positional arguments. */
+    int ifm_ublock_depth;
+    int ofm_ublock_depth;
+    PyObject *input_object;
+    int ofm_block_depth;
+    int is_depthwise;
+    int is_partkernel;
+    int ifm_bitdepth;
+    int decomp_h;
+    int decomp_w;
+
+    /* Object to hold the input verbosity integer, the verbose argument
+     * is optional so defaulted to 0.
+     */
+    int verbose = 0;
+
+    /* Arguments to the method are delivered as a tuple, unpack the
+     * tuple to get the individual arguments, note the last is
+     * optional.
+     */
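+    /* Format string "iiOiiiiii|i": two ints, the weights ndarray ("O")
+     * and six more ints; the "|" marks the trailing verbose int as
+     * optional.
+     */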
+    if (!PyArg_ParseTuple(args, "iiOiiiiii|i",
+                          &ifm_ublock_depth,
+                          &ofm_ublock_depth,
+                          &input_object,
+                          &ofm_block_depth,
+                          &is_depthwise,
+                          &is_partkernel,
+                          &ifm_bitdepth,
+                          &decomp_h,
+                          &decomp_w,
+                          &verbose))
+        return NULL;
+
+    PyArrayObject* input_ndarray_object = (PyArrayObject*) PyArray_FROM_OTF(
+        input_object,
+        NPY_INT64,
+        NPY_ARRAY_ALIGNED);
+    if (input_ndarray_object == NULL)
+    {
+        return NULL;
+    }
+
+    if ((int)PyArray_NDIM(input_ndarray_object) < 4)
+    {
+        PyErr_SetString(PyExc_ValueError, "Invalid input shape");
+        return NULL;
+    }
+
+    int ofm_depth = (int)PyArray_DIM(input_ndarray_object, 0);
+    int kernel_height = (int)PyArray_DIM(input_ndarray_object, 1);
+    int kernel_width = (int)PyArray_DIM(input_ndarray_object, 2);
+    int ifm_depth = (int)PyArray_DIM(input_ndarray_object, 3);
+
+    int64_t* brick_weights = (int64_t*)PyArray_DATA(input_ndarray_object);
+    int brick_strides[4];
+    for (int i = 0; i < 4; i++)
+    {
+        brick_strides[i] = (int)PyArray_STRIDE(input_ndarray_object, i);
+    }
+    if ((unsigned)PyArray_ITEMSIZE(input_ndarray_object) != sizeof(int64_t))
+    {
+        PyErr_SetString(PyExc_ValueError, "Invalid input type");
+        return NULL;
+    }
+    uint8_t* output_buffer = NULL;
+    int padded_length;
+
+    int output_length = mlw_reorder_encode(
+        ifm_ublock_depth,
+        ofm_ublock_depth,
+        ofm_depth,
+        kernel_height,
+        kernel_width,
+        ifm_depth,
+        brick_strides,
+        brick_weights,
+        ofm_block_depth,
+        is_depthwise,
+        is_partkernel,
+        ifm_bitdepth,
+        decomp_h,
+        decomp_w,
+        &output_buffer,
+        &padded_length,
+        verbose);
+
+    if (output_buffer == NULL)
+    {
+        return PyErr_NoMemory();
+    }
+
+    PyObject *output_byte_array = PyByteArray_FromStringAndSize((char*)output_buffer, output_length);
+    PyObject *padded_length_obj = Py_BuildValue("i", padded_length);
+
+    /* Discard the output buffer */
+    mlw_free_outbuf(output_buffer);
+
+    PyObject* ret = PyTuple_Pack(2, output_byte_array, padded_length_obj);
+    Py_DECREF(output_byte_array);
+    Py_DECREF(padded_length_obj);
+    return ret;
+}
+
 /* C extension wrapper for mlw_encode
  *
  * This method is exposed directly in python with the arguments with a
@@ -63,6 +190,7 @@ method_encode (PyObject *self, PyObject *args)
      * for that purpose.
      */
     int16_t *input_buffer = (int16_t *) malloc(sizeof(int16_t *) * input_length);
+    uint8_t *output_buffer = NULL;
 
     if (input_buffer == NULL)
         return PyErr_NoMemory();
@@ -84,20 +212,13 @@ method_encode (PyObject *self, PyObject *args)
         return NULL;
     }
 
-    /* We don't know the output length required, we guess worst case,
-     * the mlw_encode call will do a resize (downwards) anyway.
-     */
-    uint8_t *output_buffer = (uint8_t *) malloc(input_length);
-    if (output_buffer == NULL)
-        return PyErr_NoMemory();
-
     int output_length = mlw_encode(input_buffer, input_length, &output_buffer, verbose);
 
     PyObject *output_byte_array = PyByteArray_FromStringAndSize ((char *) output_buffer, output_length);
 
     /* Discard the temporary input and output buffers.
      */
     free (input_buffer);
-    free (output_buffer);
+    mlw_free_outbuf(output_buffer);
 
     return output_byte_array;
 }
@@ -163,6 +284,7 @@ method_decode(PyObject *self, PyObject *args)
 
 static PyMethodDef mlw_methods[] = {
     {"decode", method_decode, METH_VARARGS, "Python interface for decode"},
     {"encode", method_encode, METH_VARARGS, "Python interface for encode"},
+    {"reorder_encode", method_reorder_encode, METH_VARARGS, "Python interface for reorder and encode"},
     {NULL, NULL, 0, NULL}
 };
 
@@ -177,6 +299,9 @@ static struct PyModuleDef mlw_codecmodule = {
     mlw_methods
 };
 
-PyMODINIT_FUNC PyInit_mlw_codec(void) {
-    return PyModule_Create(&mlw_codecmodule);
+PyMODINIT_FUNC PyInit_mlw_codec(void)
+{
+    PyObject* ret = PyModule_Create(&mlw_codecmodule);
+    import_array();
+    return ret;
 }
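Note: the list-based encode/decode entry points keep their python signatures; only the output-buffer ownership moves into mlw_encode/mlw_free_outbuf. A minimal round-trip sketch (an illustration, not part of the patch; it assumes the extension is built and that decode returns the original weight list):

    from ethosu import mlw_codec

    weights = [0, 1, -1, 127, -255, 255]    # 9-bit signed range [-255, 255]
    compressed = mlw_codec.encode(weights)  # bytearray
    assert list(mlw_codec.decode(compressed)) == weights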
diff --git a/ethosu/mlw_codec/mlw_encode.c b/ethosu/mlw_codec/mlw_encode.c
index 04afa3ee..62e8360e 100644
--- a/ethosu/mlw_codec/mlw_encode.c
+++ b/ethosu/mlw_codec/mlw_encode.c
@@ -819,12 +819,13 @@ static int encode_section( const int16_t *inbuf,
 // Encode the given weight stream
 // inbuf          uncompressed 9bit signed weights
 // inbuf_size     number of weights
-// outbuf         compressed bitstream, buffer is malloced
+// outbuf         compressed bitstream, buffer is malloced within this function
 // verbose        if non-zero, printf log
 // Return value is the size in bytes of the compressed output
 // Return -1 if error
 int mlw_encode( int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose) {
     int i;
+#ifndef NDEBUG
     // Range check
     for(i=0; i<inbuf_size; i++) {
         if (inbuf[i]<-255 || inbuf[i]>255) {
@@ -832,8 +833,10 @@ int mlw_encode( int16_t *inbuf, int inbuf_size, uint8_t **outbuf, int verbose) {
             return -1;
         }
     }
+#endif
 
     int bitbuf_size = inbuf_size*2+1024;
+    assert(*outbuf == NULL);
     *outbuf = malloc( bitbuf_size );
 
     // Analyse input data to find palette re-programming points
@@ -882,3 +885,265 @@ void mlw_free_outbuf( uint8_t *outbuf ) {
     if (outbuf)
         free(outbuf);
 }
+
+static int round_up_divide(int num, int den)
+{
+    return (num + den - 1) / den;
+}
+
+static int round_up(int num, int den)
+{
+    return round_up_divide(num, den) * den;
+}
+
+static int get_weight_cnt(
+    int ifm_ublock_depth,
+    int ofm_ublock_depth,
+    int ofm_depth,
+    int kernel_height,
+    int kernel_width,
+    int ifm_depth,
+    int ofm_block_depth,
+    int is_depthwise,
+    int is_partkernel,
+    int ifm_bitdepth,
+    int decomp_h,
+    int decomp_w)
+{
+    int ifm_block_depth = is_partkernel || ifm_bitdepth == 16 ? 16 : 32;
+    int subkernel_elements = decomp_w * decomp_h;
+    if (is_partkernel)
+    {
+        if (ifm_bitdepth == 16 && subkernel_elements % 2 != 0)
+        {
+            subkernel_elements = round_up(subkernel_elements, 2);
+        }
+        else if (ifm_bitdepth == 8 && subkernel_elements % 4 != 0)
+        {
+            subkernel_elements = round_up(subkernel_elements, 4);
+        }
+    }
+    else if (is_depthwise)
+    {
+        subkernel_elements = round_up(subkernel_elements, 4);
+    }
+    int clipped_ifm_block_depth = is_depthwise ? ifm_ublock_depth : ifm_block_depth;
+    int ifm_block_depth_outer = is_partkernel ? clipped_ifm_block_depth : 1;
+    int ifm_block_depth_inner = is_partkernel ? 1 : clipped_ifm_block_depth;
+
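+    /* input_length is the product of every loop extent in reorder(),
+     * using unclipped block depths and padded subkernel sizes, i.e. an
+     * upper bound on the number of weights reorder() will emit.
+     */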
+    int input_length = 1;
+    input_length *= is_depthwise ? 1 : ifm_ublock_depth;
+    input_length *= ofm_ublock_depth;
+    input_length *= round_up_divide(ifm_block_depth_inner, ifm_ublock_depth);
+    input_length *= subkernel_elements;
+    input_length *= round_up_divide(ofm_block_depth, ofm_ublock_depth);
+    input_length *= round_up_divide(ifm_block_depth_outer, ifm_ublock_depth);
+    input_length *= round_up_divide(kernel_width, decomp_w);
+    input_length *= round_up_divide(kernel_height, decomp_h);
+    input_length *= round_up_divide(is_depthwise ? 1 : ifm_depth, ifm_block_depth);
+    input_length *= round_up_divide(ofm_depth, ofm_block_depth);
+
+    return input_length;
+}
+
+struct brick_buf_s
+{
+    uint8_t* buf;
+    int* strides;
+};
+typedef struct brick_buf_s brick_buf_t;
+
+static int16_t get_brick_weight(brick_buf_t* buf, int ofm_z, int wy, int wx, int ifm_z)
+{
+    uint8_t* p = buf->buf;
+
+    p += ofm_z * buf->strides[0];
+    p += wy * buf->strides[1];
+    p += wx * buf->strides[2];
+    p += ifm_z * buf->strides[3];
+
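+    /* Strides are byte strides (as reported by PyArray_STRIDE) into an
+     * int64 array. The 9-bit weights fit in 16 bits, so reading the
+     * first two bytes picks up the low-order half-word; this assumes a
+     * little-endian host.
+     */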
+    return *(int16_t*)p;
+}
+
+static int reorder(
+    int ifm_ublock_depth,
+    int ofm_ublock_depth,
+    int ofm_depth,
+    int kernel_height,
+    int kernel_width,
+    int ifm_depth,
+    int* strides,
+    void* inbuf,
+    int ofm_block_depth,
+    int is_depthwise,
+    int is_partkernel,
+    int ifm_bitdepth,
+    int decomp_h,
+    int decomp_w,
+    int16_t* weights)
+{
+    brick_buf_t brick_buf;
+    brick_buf.buf = inbuf;
+    brick_buf.strides = strides;
+
+    int ifm_block_depth = is_partkernel || ifm_bitdepth == 16 ? 16 : 32;
+    int weight_cnt = 0;
+    for (int ofm_block_z = 0; ofm_block_z < ofm_depth; ofm_block_z += ofm_block_depth)
+    {
+        int clipped_ofm_block_depth = min(ofm_block_depth, ofm_depth - ofm_block_z);
+        // IFM blocks required for the brick
+        for (int ifm_block_z = 0; ifm_block_z < (is_depthwise ? 1 : ifm_depth); ifm_block_z += ifm_block_depth)
+        {
+            int clipped_ifm_block_depth;
+            if (is_depthwise)
+            {
+                clipped_ifm_block_depth = ifm_ublock_depth;
+            }
+            else
+            {
+                clipped_ifm_block_depth = is_partkernel ?
+                    min(ifm_block_depth, ifm_depth - ifm_block_z) : ifm_block_depth;
+            }
+            // Weight decomposition
+            // Subkernel Splitting (H)
+            for (int subkernel_y = 0; subkernel_y < kernel_height; subkernel_y += decomp_h)
+            {
+                int sub_height = min(kernel_height - subkernel_y, decomp_h);
+                // Subkernel splitting (W)
+                for (int subkernel_x = 0; subkernel_x < kernel_width; subkernel_x += decomp_w)
+                {
+                    int sub_width = min(kernel_width - subkernel_x, decomp_w);
+                    int subkernel_elements = sub_width * sub_height;
+                    // Part kernel first works across the kernel H/W and needs padding
+                    if (is_partkernel)
+                    {
+                        if (ifm_bitdepth == 16 && subkernel_elements % 2 != 0)
+                        {
+                            subkernel_elements = round_up(subkernel_elements, 2);
+                        }
+                        else if (ifm_bitdepth == 8 && subkernel_elements % 4 != 0)
+                        {
+                            subkernel_elements = round_up(subkernel_elements, 4);
+                        }
+                    }
+                    else if (is_depthwise)
+                    {
+                        subkernel_elements = round_up(subkernel_elements, 4);
+                    }
+                    int ifm_block_depth_outer = is_partkernel ? clipped_ifm_block_depth : 1;
+                    int ifm_block_depth_inner = is_partkernel ? 1 : clipped_ifm_block_depth;
+                    for (int ifm_ublk_outer = 0; ifm_ublk_outer < ifm_block_depth_outer; ifm_ublk_outer += ifm_ublock_depth)
+                    {
+                        // OFM Ublocks in OFM-block over depth
+                        for (int ofm_ublk = 0; ofm_ublk < clipped_ofm_block_depth; ofm_ublk += ofm_ublock_depth)
+                        {
+                            // HW Kernel element traversal - cannot be a H/W loop due to element
+                            // padding requirement on depthwise/part-kernel configurations
+                            for (int element = 0; element < subkernel_elements; element++)
+                            {
+                                int kx = element % sub_width;
+                                int ky = element / sub_width;
+                                // IFM Ublocks in IFM-block over depth (only 1 ublock if depthwise)
+                                // In case of part-kernel-first, IFM Ublock traversal has already been handled
+                                // and this loop is ignored.
+                                for (int ifm_ublk_inner = 0; ifm_ublk_inner < ifm_block_depth_inner; ifm_ublk_inner += ifm_ublock_depth)
+                                {
+                                    // Feed OFM ublock elements
+                                    for (int ofm_ublock_z = 0; ofm_ublock_z < ofm_ublock_depth; ofm_ublock_z++)
+                                    {
+                                        // Source IFM ublock elements (only 1 element deep if depthwise)
+                                        for (int ifm_ublock_z = 0; ifm_ublock_z < (is_depthwise ? 1 : ifm_ublock_depth); ifm_ublock_z++)
+                                        {
+                                            // Source position within the current subkernel
+                                            int wx = subkernel_x + kx;
+                                            int wy = subkernel_y + ky;
+                                            // Source IFM/OFM slices
+                                            int ifm_ublk = ifm_ublk_inner + ifm_ublk_outer;
+                                            int ifm_z = ifm_block_z + ifm_ublk + ifm_ublock_z;
+                                            int ofm_z = ofm_block_z + ofm_ublk + ofm_ublock_z;
+                                            if ((ifm_z < ifm_depth) && (ofm_z < ofm_depth) && (ky < sub_height))
+                                            {
+                                                weights[weight_cnt] = get_brick_weight(&brick_buf, ofm_z, wy, wx, ifm_z);
+                                            }
+                                            weight_cnt++;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return weight_cnt;
+}
+
+// Reorder and encode the given weight stream
+// Return value is the size in bytes of the compressed output
+// Return -1 if error
+int mlw_reorder_encode(
+    int ifm_ublock_depth,
+    int ofm_ublock_depth,
+    int ofm_depth,
+    int kernel_height,
+    int kernel_width,
+    int ifm_depth,
+    int* brick_strides,
+    void* inbuf,
+    int ofm_block_depth,
+    int is_depthwise,
+    int is_partkernel,
+    int ifm_bitdepth,
+    int decomp_h,
+    int decomp_w,
+    uint8_t **outbuf, // *outbuf must be freed by caller
+    int* padded_length,
+    int verbose)
+{
+    /* Get an upper bound of the weight count */
+    int input_length = get_weight_cnt(
+        ifm_ublock_depth,
+        ofm_ublock_depth,
+        ofm_depth,
+        kernel_height,
+        kernel_width,
+        ifm_depth,
+        ofm_block_depth,
+        is_depthwise,
+        is_partkernel,
+        ifm_bitdepth,
+        decomp_h,
+        decomp_w);
+
+    int16_t* weights = (int16_t*)calloc(input_length, sizeof(int16_t));
+    if (weights == NULL)
+    {
+        return -1;
+    }
+
+    /* Reorder weights and update input_length */
+    input_length = reorder(
+        ifm_ublock_depth,
+        ofm_ublock_depth,
+        ofm_depth,
+        kernel_height,
+        kernel_width,
+        ifm_depth,
+        brick_strides,
+        inbuf,
+        ofm_block_depth,
+        is_depthwise,
+        is_partkernel,
+        ifm_bitdepth,
+        decomp_h,
+        decomp_w,
+        weights);
+
+    int output_length = mlw_encode(weights, input_length, outbuf, verbose);
+    free(weights);
+    *padded_length = input_length;
+
+    return output_length;
+}
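Note: the padding arithmetic in get_weight_cnt/reorder can be sanity-checked with a few lines of python (an illustration, not part of the patch):

    def round_up_divide(num, den):
        return (num + den - 1) // den

    def round_up(num, den):
        return round_up_divide(num, den) * den

    # Depthwise: a 3x3 subkernel (9 elements) pads up to a multiple of 4.
    assert round_up(3 * 3, 4) == 12
    # Part-kernel-first with 16-bit IFM pads up to a multiple of 2.
    assert round_up(3 * 3, 2) == 10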
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index 7ce237ca..9a1d5a16 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 # Description:
 # Compresses and pads the weights. It also calculates the scales and packs with the biases.
-import math
 from collections import namedtuple
 from typing import Tuple
 
@@ -93,18 +92,20 @@ def encode_weights(
     ifm_ublock = ArchitectureFeatures.accelerator_configs[accelerator].ifm_ublock
     ofm_ublock = ArchitectureFeatures.accelerator_configs[accelerator].ofm_ublock
 
-    raw_stream = generate_brick(
-        ifm_ublock=ifm_ublock,
-        ofm_ublock=ofm_ublock,
-        brick_weights=weights_volume,
-        ofm_block_depth=ofm_block_depth,
-        is_depthwise=is_depthwise,
-        is_partkernel=block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST,
-        ifm_bitdepth=ifm_bitdepth,
-        dilation=dilation_xy,
+    decomp_h = ArchitectureFeatures.SubKernelMax.height // dilation_xy[0]
+    decomp_w = ArchitectureFeatures.SubKernelMax.width // dilation_xy[1]
+
+    return mlw_codec.reorder_encode(
+        ifm_ublock.depth,
+        ofm_ublock.depth,
+        weights_volume,
+        ofm_block_depth,
+        is_depthwise,
+        block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST,
+        ifm_bitdepth,
+        decomp_h,
+        decomp_w,
     )
-    encoded_stream = encode(raw_stream)
-    return encoded_stream, len(raw_stream)
 
 
 def encode_bias(bias: np.int64, scale: int, shift: int):
@@ -180,111 +181,6 @@ class CompressedWeightCache:
         self.cache[wcc] = (tens_clone, unencoded_size)
 
 
-def encode(weight_stream):
-    if len(weight_stream) == 0:
-        return []
-    assert np.amin(weight_stream) >= -255
-    assert np.amax(weight_stream) <= 255
-
-    # Encode flattened signed weight stream
-    compressed = mlw_codec.encode(weight_stream)
-
-    # pad with 0xFF as needed so the length of the weight stream
-    # is a multiple of 16
-
-    while (len(compressed) % 16) != 0:
-        compressed.append(0xFF)
-
-    return compressed
-
-
-def generate_brick(
-    ifm_ublock, ofm_ublock, brick_weights, ofm_block_depth, is_depthwise, is_partkernel, ifm_bitdepth, dilation
-):
-
-    decomp_h = ArchitectureFeatures.SubKernelMax.height // dilation[0]
-    decomp_w = ArchitectureFeatures.SubKernelMax.width // dilation[1]
-    # Expect weights formatted OHWI
-    ofm_depth = brick_weights.shape[-4]
-    ifm_depth = brick_weights.shape[-1]
-    kernel_width = brick_weights.shape[-2]
-    kernel_height = brick_weights.shape[-3]
-    # IFM block depth
-    if is_partkernel or (ifm_bitdepth == 16):
-        # IFM block depth is always 16 for part-kernel-first
-        ifm_block_depth = 16
-    elif ifm_bitdepth == 8:
-        ifm_block_depth = 32
-    else:
-        assert False
-
-    stream = []
-
-    # Top level striping - OFM blocks in the entire brick's depth
-    for ofm_block_z in range(0, ofm_depth, ofm_block_depth):
-        clipped_ofm_block_depth = min(ofm_block_depth, ofm_depth - ofm_block_z)
-        # IFM blocks required for the brick
-        for ifm_block_z in range(0, (1 if is_depthwise else ifm_depth), ifm_block_depth):
-            if is_depthwise:
-                clipped_ifm_block_depth = ifm_ublock.depth
-            else:
-                clipped_ifm_block_depth = (
-                    min(ifm_block_depth, ifm_depth - ifm_block_z) if is_partkernel else ifm_block_depth
-                )
-            # Weight decomposition
-            # Subkernel Splitting (H)
-            for subkernel_y in range(0, kernel_height, decomp_h):
-                sub_height = min(kernel_height - subkernel_y, decomp_h)
-                # Subkernel splitting (W)
-                for subkernel_x in range(0, kernel_width, decomp_w):
-                    sub_width = min(kernel_width - subkernel_x, decomp_w)
-                    subkernel_elements = sub_width * sub_height
-                    # Part kernel first works across the kernel H/W and needs padding
-                    if is_partkernel:
-                        if ifm_bitdepth == 16 and subkernel_elements % 2 != 0:
-                            subkernel_elements = int(math.ceil(subkernel_elements / 2) * 2)
-                        elif ifm_bitdepth == 8 and subkernel_elements % 4 != 0:
-                            subkernel_elements = int(math.ceil(subkernel_elements / 4) * 4)
-
-                    # Depthwise Conv requires multiple of 4 kernel elements in its weight block
-                    # this is different from normal conv which is considered "weights depth-first"
-                    elif is_depthwise:
-                        subkernel_elements = int(math.ceil(subkernel_elements / 4.0) * 4)
-
-                    ifm_block_depth_outer = clipped_ifm_block_depth if is_partkernel else 1
-                    ifm_block_depth_inner = 1 if is_partkernel else clipped_ifm_block_depth
-                    # IFM Ublocks in IFM-block over depth for part-kernel-first mode
-                    # For depth-first IFM Ublocks are traversed after subkernel elements so this loop is ignored.
-                    for ifm_ublk_outer in range(0, ifm_block_depth_outer, ifm_ublock.depth):
-                        # OFM Ublocks in OFM-block over depth
-                        for ofm_ublk in range(0, clipped_ofm_block_depth, ofm_ublock.depth):
-                            # HW Kernel element traversal - cannot be a H/W loop due to element
-                            # padding requirement on depthwise/part-kernel configurations
-                            for element in range(subkernel_elements):
-                                kx = element % sub_width
-                                ky = element // sub_width
-                                # IFM Ublocks in IFM-block over depth (only 1 ublock if depthwise)
-                                # In case of part-kernel-first IFM Ublock traversal have already been handled
-                                # and this loop is ignored.
-                                for ifm_ublk_inner in range(0, ifm_block_depth_inner, ifm_ublock.depth):
-                                    # Feed OFM ublock elements
-                                    for ofm_ublock_z in range(ofm_ublock.depth):
-                                        # Source IFM ublock elements (only 1 element deep if depthwise)
-                                        for ifm_ublock_z in range(1 if is_depthwise else ifm_ublock.depth):
-                                            # Source position within the current subkernel
-                                            wx = subkernel_x + kx
-                                            wy = subkernel_y + ky
-                                            # Source IFM/OFM slices
-                                            ifm_ublk = ifm_ublk_inner + ifm_ublk_outer
-                                            ifm_z = ifm_block_z + ifm_ublk + ifm_ublock_z
-                                            ofm_z = ofm_block_z + ofm_ublk + ofm_ublock_z
-                                            if (ifm_z >= ifm_depth) or (ofm_z >= ofm_depth) or (ky >= sub_height):
-                                                stream.append(0)
-                                            else:
-                                                stream.append(brick_weights[ofm_z][wy][wx][ifm_z])
-    return stream
-
-
 def core_deinterleave(hwio, core, ncores):
     # Put weights back into OHWI
     ohwi = np.transpose(hwio, (3, 0, 1, 2))
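Note: encode_weights now returns the (bytearray, padded_length) tuple straight from the C codec. A minimal usage sketch of the new entry point (an illustration, not part of the patch; the ublock depths and decomposition limits below are placeholder values that in practice come from the accelerator configuration and SubKernelMax):

    import numpy as np
    from ethosu import mlw_codec

    ohwi = np.zeros((16, 3, 3, 16), dtype=np.int64)  # OHWI weight volume

    encoded, padded_length = mlw_codec.reorder_encode(
        8,     # ifm_ublock_depth (placeholder)
        8,     # ofm_ublock_depth (placeholder)
        ohwi,  # weights as an int64 ndarray
        16,    # ofm_block_depth
        0,     # is_depthwise
        0,     # is_partkernel (depth-first traversal)
        8,     # ifm_bitdepth
        8,     # decomp_h (placeholder)
        8,     # decomp_w (placeholder)
    )
    assert isinstance(encoded, bytearray) and padded_length > 0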
diff --git a/setup.py b/setup.py
index 8cf61951..d2137437 100644
--- a/setup.py
+++ b/setup.py
@@ -18,6 +18,7 @@
 import os
 import re
 
+import numpy as np
 from setuptools import Extension
 from setuptools import find_namespace_packages
 from setuptools import setup
@@ -42,6 +43,7 @@ with open(os.path.join(this_directory, "README.md"), encoding="utf-8") as f:
 mlw_module = Extension(
     "ethosu.mlw_codec",
     ["ethosu/mlw_codec/mlw_encode.c", "ethosu/mlw_codec/mlw_decode.c", "ethosu/mlw_codec/mlw_codecmodule.c"],
+    include_dirs=[np.get_include()],
 )
 
 setup(
-- 
cgit v1.2.1
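Note: get_brick_weight's byte-offset arithmetic matches numpy's stride convention, which is why the wrapper can pass PyArray_STRIDE values through unchanged. A quick illustration (not part of the patch):

    import numpy as np

    w = np.arange(16 * 3 * 3 * 8, dtype=np.int64).reshape(16, 3, 3, 8)  # OHWI
    s = w.strides  # byte strides, here (576, 192, 64, 8)
    ofm_z, wy, wx, ifm_z = 2, 1, 0, 3
    byte_offset = ofm_z * s[0] + wy * s[1] + wx * s[2] + ifm_z * s[3]
    assert byte_offset == w.itemsize * np.ravel_multi_index((ofm_z, wy, wx, ifm_z), w.shape)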