From ea857273d8b4a94fb7f1e63ce9068a60259fb9d3 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 22 Jan 2021 05:47:37 +0000 Subject: Compress OpenCL kernel files using zlib for Android Kernel files are embedded into the binary as the default option when building which leads to binary size bloating. Add `compress_kernels` option and utilize zlib for further compressing the text kernel files and reduce the overall binary size. We use a base64 encoding/decoding to ensure that the strings can be easily embedded. This adds to the binary size but still the overall reduction is significant. Maximum compression level 9 is used. Option is currently restricted to Android builds as android toolchain provides a zlib library. Initial experimentations indicate a binary size reduction of 50% Resolves: COMPMID-4017 Signed-off-by: Georgios Pinitas Change-Id: Iee81b8c00391b26a5f41642699692928a4d6bd6e Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4958 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gian Marco Iodice --- src/core/CL/CLKernelLibrary.cpp | 162 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 159 insertions(+), 3 deletions(-) (limited to 'src/core/CL/CLKernelLibrary.cpp') diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index e5cf8c6903..cf1c52e463 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -29,10 +29,151 @@ #include "support/StringSupport.h" #include +#include #include #include #include +#ifdef ARM_COMPUTE_COMPRESSED_KERNELS +#include + +namespace +{ +/* Decoding table */ +constexpr std::array b64_invtab = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0, + 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/** Decode a base64 encoded string + * + * @param[in] str Base64 encoded string to decode + * + * @return The decode string in case of a valid, non-empty string otherwise an empty string + */ +std::string decode_base64(const std::string &str) +{ + constexpr const char pad_char = '='; + + // Handle empty string + if(str.empty()) + { + return {}; + } + + // Base64 encoded string has size multiple of 4 + if(str.length() % 4) + { + return {}; + } + + // + // Check encoded string padding + std::size_t padding = (str.rbegin()[0] == pad_char) + (str.rbegin()[1] == pad_char); + const int str_len = str.size(); + + // Reserve memory for the decoded string + // Note each 4 consecutive elements of 6-bit encode 3 bytes + std::string dec_b64; + dec_b64.reserve(((str_len / 4) * 3)); + + // Block decoding function (exclude padding) + int c = 0; + const int end = str_len - 4 - padding; + for(; c <= end; c += 4) + { + const int byte0 = b64_invtab[str[c]]; + const int byte1 = b64_invtab[str[c + 1]]; + const int byte2 = b64_invtab[str[c + 2]]; + const int byte3 = b64_invtab[str[c + 3]]; + + dec_b64.push_back((byte0 << 2) | (byte1 >> 4)); + dec_b64.push_back((byte1 << 4) | (byte2 >> 2)); + dec_b64.push_back((byte2 << 6) | (byte3)); + } + + // Last step that might contain padding symbols + if(padding == 1) + { + const int byte0 = b64_invtab[str[c]]; + const int byte1 = b64_invtab[str[c + 1]]; + const int byte2 = b64_invtab[str[c + 2]]; + + dec_b64.push_back((byte0 << 2) | (byte1 >> 4)); + dec_b64.push_back((byte1 << 4) | (byte2 >> 2)); + } + else if(padding == 2) + { + const int byte0 = b64_invtab[str[c]]; + const int byte1 = b64_invtab[str[c + 1]]; + + dec_b64.push_back((byte0 << 2) | (byte1 >> 4)); + } + + return dec_b64; +} + +/** Decompress a zlib compressed string + * + * @param[in] str ZLib compressed string + * + * @return The decompressed string if successful, otherwise false. + */ +std::string decompress_zlib(const std::string &str) +{ + // Create and initialize decompression stream + z_stream ds{}; + if(inflateInit(&ds) != Z_OK) + { + return std::string(); + } + ds.avail_in = str.size(); + ds.next_in = (Bytef *)str.data(); + + // Roll-over the string using a buffer and decompress + int status = Z_OK; + char roll_buff[16384]; + std::string inflated_str; + do + { + ds.avail_out = sizeof(roll_buff); + ds.next_out = reinterpret_cast(roll_buff); + + status = inflate(&ds, 0); + if(inflated_str.size() < ds.total_out) + { + inflated_str.append(roll_buff, ds.total_out - inflated_str.size()); + } + } + while(status == Z_OK); + + // Finalize decompression stream + inflateEnd(&ds); + if(status != Z_STREAM_END) + { + return std::string(); + } + + return inflated_str; +} +} // namespace +#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ + using namespace arm_compute; const std::map CLKernelLibrary::_kernel_program_map = { @@ -970,7 +1111,7 @@ const std::map CLKernelLibrary::_program_source_map = }; CLKernelLibrary::CLKernelLibrary() - : _compile_context(), _kernel_path() + : _compile_context(), _kernel_path(), _decompressed_source_map() { opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the CLKernelLibrary is built } @@ -1068,14 +1209,29 @@ bool CLKernelLibrary::int64_base_atomics_supported() const std::pair CLKernelLibrary::get_program(const std::string &program_name) const { #ifdef EMBEDDED_KERNELS - const auto program_source_it = _program_source_map.find(program_name); +#ifdef ARM_COMPUTE_COMPRESSED_KERNELS + const auto inflatted_program_source_it = _decompressed_source_map.find(program_name); + if(inflatted_program_source_it != _decompressed_source_map.end()) + { + return std::make_pair(inflatted_program_source_it->second, false); + } +#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ + const auto program_source_it = _program_source_map.find(program_name); if(program_source_it == _program_source_map.end()) { ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str()); } + std::string program_source = program_source_it->second; + +#ifdef ARM_COMPUTE_COMPRESSED_KERNELS + std::string decompressed_program_source = decompress_zlib(decode_base64(program_source_it->second)); + ARM_COMPUTE_ERROR_ON_MSG(decompressed_program_source.empty(), "Cannot de-compress requested program"); + _decompressed_source_map.insert(std::make_pair(program_name, decompressed_program_source)); + program_source = std::move(decompressed_program_source); +#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ - return std::make_pair(program_source_it->second, false); + return std::make_pair(program_source, false); #else /* EMBEDDED_KERNELS */ // Check for binary std::string source_name = _kernel_path + program_name; -- cgit v1.2.1