/* * Copyright (c) 2022-2023 Arm Limited. * * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* Depthwise kernel drivers commonly require a per-thread blob of working space * in which to store parameters required by the depthwise implementations. The * composition of this working space varies with the driver, kernel, and data * types -- but the tasks of requesting sufficient space, allocating buffer * space, and performing initialisation of the working space are common. * * The classes in this file consist of a number of working space "Elements" * (which are logical units of functionality) and a Workspace type which allows * for compile time composition of elements into a single working space type. * * Creating a workspace * ==================== * * A new workspace type can be created by combining Elements as an argument to * the Workspace class. 
For instance: * * Workspace< * depthwise_depthfirst::InputArrayElement<float>, * InputBufferElement<float>, * OutputArrayElement<float> * > * * Creates a new Workspace consisting of the given elements. The workspace type * contained within this class (`Workspace<...>::WorkspaceType`) is equivalent to: * * struct WorkspaceType * { * const float **inptr_array; // From InputArrayElement * float *input_buffer; // From InputBufferElement * float **outptr_array; // From OutputArrayElement * float *output_buffer; // From OutputArrayElement * }; * * Calling `Workspace<...>::get_sizeof_workspace(...)` will return the amount * of space required to store the above struct and the elements contained * within it. Once this space has been allocated, the workspace can be * initialised by calling `Workspace<...>::initialise` with a pointer to the * buffer and the same arguments. This will place a struct of type * `Workspace<...>::WorkspaceType` at the start of the buffer, and share the * remaining space between the specified elements. As this is all done at * compile time, later code can access elements from the `WorkspaceType` by * name. * * Writing a new element * ===================== * * Each Element must provide: * - A struct called "Workspace" containing the variables contained within * this portion of the workspace. * - A static method called `get_element_size` which returns the amount of * buffer space required by this element of the workspace (NOT including the * size of the Workspace struct). For example, an element which stores a * vector of pointers will return the amount of space required to store the * vector. * - A static method called `initialise` which accepts a pointer to a struct * which will be composed of the Element's `Workspace` struct (along with * other elements), a pointer to the start of the buffer allocated for this * portion of the workspace, and arguments to be used to initialise the * workspace. 
The Element should consume as much of the buffer as it * requires, initialise the Workspace, and then return the pointer to the * next free byte of the buffer. * * See the below elements for an example of how this should work. */ #pragma once #include "depthwise.hpp" #include "depthfirst_driver.hpp" #include "utils.hpp" namespace arm_conv { namespace depthwise { namespace { // anonymous because we expect this to appear in several compilation units /* Arguments to use to size and initialise a workspace. * * The struct stores only a pointer to the strategy and references to the * depthwise arguments and the output stage, so the referenced objects must * outlive it. * * NOTE(review): when the constructor's `os = {}` default is used, the * `output_stage` reference is bound to a temporary created for the call - * presumably nothing reads `output_stage` afterwards in that case; confirm * against callers. * NOTE(review): no template parameter list is visible after `template` in * this copy of the file (`StratType` and `OutputStage` are presumably its * parameters) - confirm against the upstream header. */ template struct WorkspaceArgs { const StratType *strategy; const DepthwiseArgs &depthwise_args; const OutputStage &output_stage; WorkspaceArgs(const StratType *strat, const DepthwiseArgs &dwargs, const OutputStage &os = {}) : strategy(strat), depthwise_args(dwargs), output_stage(os) { } }; /* Sometimes we use templated structs to fill in workspace types, the Empty * element can be useful for when a blank element is required for some sets of * parameters. */ struct EmptyElement { /* Contributes no members to the composed workspace struct. */ struct Workspace {}; /* Requests no buffer space. */ template static size_t get_element_size(const WorkspaceArgs &) { return 0; } /* Consumes no buffer space: returns `buffer` unchanged as the next free byte. */ template static void *initialise(WorkspaceType *, void *buffer, const WorkspaceArgs &) { return buffer; } }; /* Store fused activations for a kernel. * * Activations are set based on the DepthwiseArgs. 
*/ template class ActivationsElement { public: /* Lower and upper activation bounds; both are written by initialise(). */ struct Workspace { T activation_min, activation_max; }; /* The bounds are stored in the Workspace struct itself, so no additional buffer space is requested. */ template static size_t get_element_size(const WorkspaceArgs &) { return 0; } /* Sets the bounds from `args.depthwise_args.activation`. * * Both bounds start at -infinity / +infinity. BoundedReLU sets the maximum to * `activation.param1` and then deliberately falls through to the ReLU case, * which sets the minimum to 0; any other activation type leaves the infinite * bounds in place. Returns `buffer` unchanged because no buffer space is * consumed. * * NOTE(review): the template arguments of `static_cast` and * `std::numeric_limits` are not visible in this copy of the file - confirm * against the upstream header. */ template static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs &args) { ws->activation_min = static_cast(-std::numeric_limits::infinity()); ws->activation_max = static_cast(std::numeric_limits::infinity()); switch (args.depthwise_args.activation.type) { case arm_gemm::Activation::Type::BoundedReLU: ws->activation_max = static_cast(args.depthwise_args.activation.param1); // Fall through case arm_gemm::Activation::Type::ReLU: ws->activation_min = static_cast(0); break; default: break; } return buffer; } }; /* Activation clamps are contained within `arm_gemm::Requantize32`, so if the * output stage is one of these we substitute in an empty workspace element. * * NOTE(review): the specialisation's argument list is not visible in this * copy of the file; from the sentence above it presumably selects * `arm_gemm::Requantize32` as the output stage - confirm. */ template class ActivationsElement : public EmptyElement { }; /* Get the value with which to fill an input buffer. This defaults to `0` * (which we return as a `char` since it gets used by `memset`). */ template char get_input_buffer_fill_value(const OutputStage &) { return 0; } /* In the case of kernels operating on quantized data, we need to fill the * input buffer with the zero offset of the input tensor. * * The specialisation is declared before it is defined so that * `__attribute__ ((unused))` can be attached to the declaration - presumably * to silence unused-function warnings in translation units that never call * it. The function returns `char`, so `qp.a_offset` is implicitly converted * to `char` on return. NOTE(review): presumably `a_offset` always fits in a * single byte - confirm. */ template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp) __attribute__ ((unused)); template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp) { return qp.a_offset; } /* Container for a vector of padding values which can be safely consumed by the * depthwise kernel. The padding values are initialised to either `0` or the * zero offset of the input tensor (if quantized). 
*/ template class InputBufferElement { public: /* Pointer to the padding vector; set by initialise(). */ struct Workspace { T *input_buffer; }; /* Bytes required: sizeof(T) * input_channels * channel_multiplier. */ template static size_t get_element_size(const WorkspaceArgs &args) { return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier; } /* Points `input_buffer` at `buffer`, fills get_element_size(args) bytes with * the value returned by get_input_buffer_fill_value (memset writes that value * into every byte), and returns the first byte after the filled region. * * NOTE(review): the `reinterpret_cast` target types are not visible in this * copy of the file - confirm against the upstream header. */ template static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs &args) { ws->input_buffer = reinterpret_cast(buffer); memset(ws->input_buffer, get_input_buffer_fill_value(args.output_stage), get_element_size(args)); return reinterpret_cast(buffer) + get_element_size(args); } }; /* Container for an array of output pointers, and a buffer which can be used as * a destination for unnecessary writes. */ template class OutputArrayElement { public: /* Both pointers are set by initialise(); the memory they point to is not written here. */ struct Workspace { T **outptr_array; T *output_buffer; }; /* Bytes required: the pointer array plus the output buffer. */ template static size_t get_element_size(const WorkspaceArgs &args) { return sizeof_outptr_array(args) + sizeof_output_buffer(args); } /* Places the pointer array at `buffer` and the output buffer directly after * it, then returns the first byte after both regions. */ template static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs &args) { char *buffer_bytes = reinterpret_cast(buffer); ws->outptr_array = reinterpret_cast(buffer_bytes); buffer_bytes += sizeof_outptr_array(args); ws->output_buffer = reinterpret_cast(buffer_bytes); buffer_bytes += sizeof_output_buffer(args); return buffer_bytes; } protected: /* Size of the pointer array: get_output_rows() * get_output_cols() entries * taken from the strategy. NOTE(review): `outptr_array` holds `T *` entries * but the entry size is computed as `sizeof(T **)`; presumably all object * pointers have the same size on the supported targets - confirm. */ template static size_t sizeof_outptr_array(const WorkspaceArgs &args) { return sizeof(T **) * args.strategy->get_output_rows() * args.strategy->get_output_cols(); } /* Size of the output buffer: sizeof(T) * input_channels * channel_multiplier. */ template static size_t sizeof_output_buffer(const WorkspaceArgs &args) { return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier; } }; /* Intermediate array to store results of premultiplication. * Used as input to the kernel instead of the original input array. 
*/ template class IntermediateBufferElement { public: /* Set by initialise(); the buffer contents are not written here. */ struct Workspace { T *intermediate_buffer; }; /* Bytes required: sizeof(T) * (input_cols + kernel_cols) * (strategy input * rows + kernel_rows) * (input_channels * channel_multiplier). */ template static size_t get_element_size(const WorkspaceArgs &args) { auto cols = args.depthwise_args.input_cols + args.depthwise_args.kernel_cols; auto rows = args.strategy->get_input_rows() + args.depthwise_args.kernel_rows; auto channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier; return sizeof(T) * cols * rows * channels; } /* Points `intermediate_buffer` at `buffer` and returns the first byte after * the get_element_size(args) bytes reserved for it. */ template static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs &args) { ws->intermediate_buffer = reinterpret_cast(buffer); return reinterpret_cast(buffer) + get_element_size(args); } }; /* Container for requantization parameters. * * This removes the distinction between per-layer and per-channel * requantization parameters by providing a vector of requantization parameters * regardless of whether per-layer or per-channel is selected. */ class RequantizationParametersElement { public: /* Each pointer refers either to the array supplied by the output stage or to * a vector built inside the workspace buffer by initialise(). */ struct Workspace { const int32_t *bias, *requant_muls, *requant_shifts; }; /* Bytes required: the sum of the three sizeof_* helpers, each of which is 0 * when the output stage already supplies the corresponding array. */ template static size_t get_element_size(const WorkspaceArgs &args) { return sizeof_bias(args) + sizeof_requant_muls(args) + sizeof_requant_shifts(args); } /* Takes the bias, per-channel multiplier and per-channel right-shift pointers * from the output stage. For each one that is null, a vector with * input_channels * channel_multiplier entries is carved out of the buffer * instead: the bias vector is zero-filled, and the multiplier and shift * vectors are filled with the per-layer multiplier and per-layer right shift. * Returns the first byte after the space consumed. * * NOTE(review): the `reinterpret_cast` target types are not visible in this * copy of the file - confirm against the upstream header. */ template static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs &args) { const auto n_output_channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier; char *buffer_bytes = reinterpret_cast(buffer); ws->bias = args.output_stage.bias; ws->requant_muls = args.output_stage.per_channel_muls; ws->requant_shifts = args.output_stage.per_channel_right_shifts; if (ws->bias == nullptr) { ws->bias = reinterpret_cast(buffer_bytes); memset(buffer_bytes, 0, sizeof_bias(args)); buffer_bytes += sizeof_bias(args); } if (ws->requant_muls == nullptr) { ws->requant_muls = reinterpret_cast(buffer_bytes); auto muls = reinterpret_cast(buffer_bytes); buffer_bytes += sizeof_requant_muls(args); for (auto n = 0u; n < n_output_channels; n++) { muls[n] = 
args.output_stage.per_layer_mul; } } if (ws->requant_shifts == nullptr) { ws->requant_shifts = reinterpret_cast(buffer_bytes); auto shifts = reinterpret_cast(buffer_bytes); buffer_bytes += sizeof_requant_shifts(args); for (auto n = 0u; n < n_output_channels; n++) { shifts[n] = args.output_stage.per_layer_right_shift; } } return buffer_bytes; } protected: /* Each helper below returns 0 when the output stage supplies the * corresponding array, and otherwise sizeof(int32_t) * channel_multiplier * * input_channels. These are the same conditions under which initialise() * consumes buffer space. */ template static size_t sizeof_bias(const WorkspaceArgs &args) { return args.output_stage.bias != nullptr ? 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels; } template static size_t sizeof_requant_muls(const WorkspaceArgs &args) { return args.output_stage.per_channel_muls != nullptr ? 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels; } template static size_t sizeof_requant_shifts(const WorkspaceArgs &args) { return args.output_stage.per_channel_right_shifts != nullptr ? 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels; } }; /* Compile-time composition of workspace elements into one workspace type. * * The class is declared once and then defined for a first `Element` followed * by the remaining elements; the `Workspace<>` case further down ends the * recursion. * * NOTE(review): the parameter-pack and specialisation argument lists are not * visible in this copy of the file - confirm against the upstream header. * NOTE(review): no alignment padding is inserted between elements; each * element's region starts at the byte the previous element returned - * presumably element order and sizes keep the stored pointers suitably * aligned; confirm. */ template class Workspace; template class Workspace { public: /* Inherits the first Element's Workspace struct and the WorkspaceType * composed from the remaining elements. */ struct WorkspaceType : Element::Workspace, Workspace::WorkspaceType { }; /* Initialises a workspace in `buffer`: the WorkspaceType struct is taken to * sit at the start of the buffer, and the space for the elements begins * immediately after it (`ws + 1`). */ template static void initialise(void *buffer, const WorkspaceArgs &args) { // Allocate sufficient space for the struct, then initialise each of the // elements in turn. 
auto ws = reinterpret_cast(buffer); initialise_elements(ws, ws + 1, args); } /* Total bytes required: sizeof(WorkspaceType) plus the buffer space requested by every element. */ template static size_t get_sizeof_workspace(const WorkspaceArgs &args) { return sizeof(WorkspaceType) + get_element_sizes(args); } /* Buffer space requested by this Element plus that of the remaining elements. */ template static inline size_t get_element_sizes(const WorkspaceArgs &args) { return Element::get_element_size(args) + Workspace::get_element_sizes(args); } /* Lets this Element initialise its part of `ws` from `buffer`, then hands the * pointer it returns (the next free byte) to the remaining elements. */ template static void initialise_elements(WorkspaceType *ws, void *buffer, const WorkspaceArgs &args) { buffer = Element::initialise(ws, buffer, args); // Get the next buffer Workspace::initialise_elements(ws, buffer, args); } }; /* Empty workspace: an empty WorkspaceType, no buffer space requested, and * nothing to initialise. */ template <> class Workspace<> { public: struct WorkspaceType { }; template static inline size_t get_element_sizes(const WorkspaceArgs &) { return 0; } template static void initialise_elements(WorkspaceType *, void *, const WorkspaceArgs &) { } }; } // namespace {anonymous} } // namespace depthwise } // namespace arm_conv