/*
 * Copyright (c) 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef CKW_PROTOTYPE_SRC_PROTOTYPE_H
#define CKW_PROTOTYPE_SRC_PROTOTYPE_H

#include "ckw/Error.h"
#include "ckw/TensorInfo.h"
#include "ckw/types/ConvertPolicy.h"
#include "ckw/types/DataType.h"
#include "ckw/types/Functions.h"
#include "ckw/types/GpuTargetLanguage.h"
#include "ckw/types/Operators.h"
#include "ckw/types/TensorSamplerTypes.h"

#include <algorithm>
#include <cassert> // assert (to be removed)
#include <cstdint> // int32_t
#include <iostream> // cout (to be removed)
#include <map>
#include <memory>
#include <string>
#include <vector>

namespace ckw
{
namespace prototype
{

// Dummy data structure for Size2D
using Size2D = std::vector<int32_t>;

// Dummy Status
using Status = void;

enum class ComponentType : int32_t
{
    Complex   = 0,
    Simple    = 1,
    Unfusable = 2
};

enum class GpuCompilationSpeed
{
    Fast = 0x00, // fast compilation may increase the latency of the network
    Slow = 0x01  // slow compilation may decrease the latency of the network
};

enum class GpuExtensions
{
    Fp16,
    Dot8,
    Mmul,
    FastMath
};

struct TensorInfo
{
    TensorShape      shape{{0}};
    DataType         data_type{DataType::Unknown};
    TensorDataLayout data_layout{TensorDataLayout::Nhwc};
    int32_t          id{-1};
};

struct ComponentAttribute
{
    GpuCompilationSpeed compilation_speed{GpuCompilationSpeed::Fast};
    bool                overwrite_tile{true};
};

inline std::string data_type_to_cl_type(DataType dt)
{
    switch (dt)
    {
        case DataType::Fp32:   return "float";
        case DataType::Fp16:   return "half";
        case DataType::Int8:   return "char";
        case DataType::Uint8:  return "uchar";
        case DataType::Uint16: return "ushort";
        case DataType::Int16:  return "short";
        case DataType::Uint32: return "uint";
        case DataType::Int32:  return "int";
        case DataType::Bool:   return "bool";
        default:
            assert(false);
            return "";
    }
}

inline int32_t width_to_cl_vector_size(int32_t width)
{
    switch (width)
    {
        case 1: return 1;
        case 2: return 2;
        case 3: return 3;
        case 4: return 4;
        case 5:
        case 6:
        case 7:
        case 8: return 8;
        case 9:
        case 10:
        case 11:
        case 12:
        case 13:
        case 14:
        case 15:
        case 16: return 16;
        default:
            assert(false);
            return 0;
    }
}

inline std::string get_cl_data_type(DataType dt, int32_t width)
{
    std::string data_type;
    int32_t     w = width_to_cl_vector_size(width);
    data_type += data_type_to_cl_type(dt);
    if (w != 1)
    {
        data_type += std::to_string(w);
    }
    return data_type;
}

inline std::string to_opencl_store(int32_t vector_length)
{
    if (vector_length != 1)
    {
        return "vstore" + std::to_string(vector_length) + "(";
    }
    else
    {
        return "*(";
    }
}
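// Illustrative examples of the helpers above (the values follow directly from the definitions):
//   get_cl_data_type(DataType::Fp16, 3) returns "half3", while get_cl_data_type(DataType::Fp32, 7)
//   returns "float8", because the requested width is rounded up to the next valid OpenCL vector size.
//   to_opencl_store(4) returns "vstore4(" and to_opencl_store(1) returns "*(".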
struct TileInfo { TileInfo() { } TileInfo(DataType dt) : dt(dt), w(1), h(1) { } TileInfo(DataType dt, int32_t width) : dt(dt), w(width), h(1) { } TileInfo(DataType dt, int32_t width, int32_t height) : dt(dt), w(width), h(height) { } DataType dt{DataType::Unknown}; // Data type of the tile int32_t w{0}; // Width (i.e. c0 - portion of the channels) int32_t h{0}; // Height (i.e. s0 - portion of the spatial dimensions) }; inline std::ostream &operator<<(std::ostream &o, const TileInfo &a) { o << a.w << " x " << a.h; return o; } struct DataTypeAsString { std::string str{""}; DataType dt{DataType::Unknown}; int32_t size{1}; }; struct ValueAsString { std::string str{""}; DataTypeAsString type{}; }; // https://stackoverflow.com/questions/51515378/storing-and-accessing-tile-properties-in-c // A Tile is a collection of variables used to express a 2D data. class IScalarTile { public: virtual ~IScalarTile() = default; /** Method to get the scalar variable from a tile * @param[in] x X coordinate on the width of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge * * @return the scalar variable as a string */ virtual ValueAsString scalar(int32_t x, int32_t y) const = 0; /** Method to get the list of underlying variable names used by the tile * * @return the list of variable names */ virtual std::vector underlying_source_variables() const = 0; /** Method to get the name of the tile. * * @return the name of the tile */ std::string name() const { return _basename; } /** Method to get the tile format * * @return the format */ TileInfo format() const { return _format; } /** Method to know whether the tile is assignable or not (constant) * * @return true if the tile is assignable */ virtual bool is_assignable() const = 0; /** Method to know whether the tile needs to be declared * * @return true if the tile needs to be declared in the code before being used */ virtual bool need_declaration() const = 0; protected: TileInfo _format{}; // Tile format std::string _basename{""}; // Tile name }; // A tile is a collection of variables used to express a 2D data. The variables are vectors in the GPU context. // The vector size is given by the width of the tile. The number of vectors height by depth defines the number of vectors class IVectorTile : public IScalarTile { public: virtual ~IVectorTile() = default; /** Method to get the vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars. * The user can query the list of supported width for the vectors through preferred_vector_sizes(). * * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge * * @return the vector variable as a string */ virtual ValueAsString vector(int32_t y) const = 0; /** Method to get a vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars. * * @return the vector variable as a string */ virtual ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const = 0; /** Method to get the preferred vector sizes. 
* * @return a vector with the preferred vector sizes */ //virtual std::vector preferred_vector_sizes() const = 0; }; class ClTile : public IVectorTile { public: ClTile(const std::string &name, TileInfo format) { _format = format; _basename = name; } ValueAsString scalar(int32_t x, int32_t y) const override { x = std::max(std::min(x, _format.w - 1), static_cast(0)); y = std::max(std::min(y, _format.h - 1), static_cast(0)); ValueAsString t; t.str = build_variable_name(y); t.type.str = get_cl_data_type(_format.dt, 1); t.type.dt = _format.dt; t.type.size = 1; // Check required because if the width has only one element, we cannot use .s0 if (_format.w != 1) { // Automatic broadcasting t.str += ".s" + std::to_string(x); } return t; } ValueAsString vector(int32_t y) const override { y = std::max(std::min(y, _format.h - 1), static_cast(0)); ValueAsString t; t.str = build_variable_name(y); t.type.str = get_cl_data_type(_format.dt, _format.w); t.type.dt = _format.dt; t.type.size = _format.w; return t; } ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override { y = std::max(std::min(y, _format.h - 1), static_cast(0)); ValueAsString t; t.str = build_variable_name(y); t.type.str = get_cl_data_type(_format.dt, width); t.type.dt = _format.dt; t.type.size = width; if (_format.w != 1) { t.str += ".s"; for (int i = 0; i < width; ++i) { t.str += to_scalar_hex(x_start + i); } } return t; } std::vector underlying_source_variables() const override { std::vector vars; for (int32_t y = 0; y < _format.h; ++y) { ValueAsString t; t.str = build_variable_name(y); t.type.str = get_cl_data_type(_format.dt, _format.w); t.type.dt = _format.dt; t.type.size = _format.w; vars.push_back(t); } return vars; } bool is_assignable() const override { return true; } bool need_declaration() const override { return true; } private: std::string build_variable_name(int32_t y) const { std::string var_name = _basename; if (_format.h == 1) { return var_name; } else { var_name += "_"; var_name += std::to_string(y); } return var_name; } std::string to_scalar_hex(int32_t x) const { switch (x) { case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: case 8: case 9: return std::to_string(x); case 10: return "A"; case 11: return "B"; case 12: return "C"; case 13: return "D"; case 14: return "E"; case 15: return "F"; default: std::cout << "Unsupported hexadecimal value" << std::endl; assert(false); return ""; } } }; // Unique features: It contains values in the form of string. The name used for this object is misleading since the variables can change the value over time. 
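// Illustrative example: ClConstantTile({{"1.0f", "2.0f"}}, DataType::Fp32) builds a 1x2 fp32 constant tile
// whose scalar(0, 0) returns "1.0f" and whose vector(0) returns "((float2)(1.0f, 2.0f))".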
class ClConstantTile : public IVectorTile { public: ClConstantTile(const std::vector> &in, DataType dt) { _format.w = in[0].size(); _format.h = in.size(); _format.dt = dt; _data = std::vector>(_format.h, std::vector(_format.w)); for (int32_t y = 0; y < _format.h; ++y) { for (int32_t x = 0; x < _format.w; ++x) { _data[y][x] = in[y][x]; } } } ValueAsString scalar(int32_t x, int32_t y) const override { x = std::max(std::min(x, _format.w - 1), static_cast(0)); y = std::max(std::min(y, _format.h - 1), static_cast(0)); ValueAsString t; t.str = _data[y][x]; t.type.str = get_cl_data_type(_format.dt, 1); t.type.dt = _format.dt; t.type.size = 1; return t; } ValueAsString vector(int32_t y) const override { y = std::max(std::min(y, _format.h - 1), static_cast(0)); return vector(0, _format.w, y); } ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override { y = std::max(std::min(y, _format.h - 1), static_cast(0)); ValueAsString t; t.str = ""; t.type.str = get_cl_data_type(_format.dt, width); t.type.dt = _format.dt; t.type.size = width; if (width > 1) { t.str += "((" + get_cl_data_type(_format.dt, width) + ")("; } int32_t x = x_start; for (; x < width - 1; ++x) { t.str += scalar(x, y).str; t.str += ", "; } t.str += scalar(x, y).str; if (width > 1) { t.str += "))"; } return t; } std::vector underlying_source_variables() const override { std::vector vars; for (int32_t y = 0; y < _format.h; ++y) { for (int32_t x = 0; x < _format.w; ++x) { ValueAsString t; t.str = _data[y][x]; t.type.str = get_cl_data_type(_format.dt, 1); t.type.dt = _format.dt; t.type.size = 1; vars.push_back(t); } } return vars; } bool is_assignable() const override { return false; } bool need_declaration() const override { return false; } private: std::vector> _data{}; }; enum class TensorComponentIndex : int32_t { IndexMask = 0x0000000f, }; enum class TensorComponentGroup : int32_t { OffsetFirstElement = 0x00000100, Stride = 0x00001000, Dimension = 0x00010000, FoldedDimension = 0x00100000, Constant = 0x01000000 }; inline std::string to_string(TensorComponentType x) { switch (x) { case TensorComponentType::Unknown: return "Unknown"; case TensorComponentType::OffsetFirstElement: return "OffsetFirstElement"; case TensorComponentType::Stride1: return "Stride1"; case TensorComponentType::Stride2: return "Stride2"; case TensorComponentType::Stride3: return "Stride3"; case TensorComponentType::Stride4: return "Stride4"; case TensorComponentType::Dim0: return "Dim0"; case TensorComponentType::Dim1: return "Dim1"; case TensorComponentType::Dim2: return "Dim2"; case TensorComponentType::Dim3: return "Dim3"; case TensorComponentType::Dim4: return "Dim4"; case TensorComponentType::Dim1xDim2: return "Dim1xDim2"; case TensorComponentType::Dim1xDim2xDim3: return "Dim1xDim2xDim3"; default: assert(false); return ""; } } class ITensorArgument { public: virtual ~ITensorArgument() = default; /** Method to get the tensor component as a string * * @param[in] x tensor component to query * * @return the tensor component as a string */ virtual std::string component(TensorComponentType x) = 0; /** Method to get the tensor component type declaration as a string * * @return the tensor component type declaration as a string */ virtual std::string component_type_declaration() const = 0; /** Method to get the tensor component data type * * @return the tensor component data type */ virtual DataType component_data_type() const = 0; /** Method to get the tensor component declarations * * @return a vector containing the tensor component declarations 
*/ virtual std::vector component_declarations() const = 0; /** Method to get the name of the tensor argument. * * @return the name of the tensor argument */ std::string name() const { return _basename; } /** Method to get the tensor format * * @return the format */ TensorInfo format() const { return _format; } protected: TensorInfo _format{}; std::string _basename{}; }; enum class GpuTensorStorage : int32_t { Unknown = 0x0000, BufferUint8Ptr = 0x0012, Image2dReadOnly = 0x0020, Image2dWriteOnly = 0x0021, Image3dReadOnly = 0x0030, Image3dWriteOnly = 0x0031 }; inline GpuTensorStorage to_gpu_tensor_storage(TensorStorageType s) { switch (s) { case TensorStorageType::Unknown: return GpuTensorStorage::Unknown; case TensorStorageType::BufferUint8Ptr: return GpuTensorStorage::BufferUint8Ptr; case TensorStorageType::Texture2dReadOnly: return GpuTensorStorage::Image2dReadOnly; case TensorStorageType::Texture2dWriteOnly: return GpuTensorStorage::Image2dWriteOnly; default: assert(false); return GpuTensorStorage::Unknown; } } inline TensorStorageType to_tensor_storage(GpuTensorStorage s) { switch (s) { case GpuTensorStorage::Unknown: return TensorStorageType::Unknown; case GpuTensorStorage::BufferUint8Ptr: return TensorStorageType::BufferUint8Ptr; case GpuTensorStorage::Image2dReadOnly: return TensorStorageType::Texture2dReadOnly; case GpuTensorStorage::Image2dWriteOnly: return TensorStorageType::Texture2dWriteOnly; default: assert(false); return TensorStorageType::Unknown; } } class IGpuTensorArgument : public ITensorArgument { public: virtual ~IGpuTensorArgument() = default; /** Method to get the tensor storage, which is the underlying storage used to keep the data memory * * @param[in] x tensor storage to query * * @return the tensor storage as a string */ virtual std::string storage(GpuTensorStorage x) = 0; /** Method to get the tensor storage type declaration as a string * * @param[in] x tensor component to query * * @return the tensor storage type declaration as a string */ virtual std::string storage_type_declaration(GpuTensorStorage x) const = 0; /** Method to get the tensor storage declarations * * @return a vector containing the tensor storage declarations */ virtual std::vector storage_declarations() const = 0; }; class ClTensorArgument : public IGpuTensorArgument { public: ClTensorArgument(const std::string &name, const TensorInfo &x, bool return_by_value_when_possible) { _basename = name; _format = x; _return_by_value_when_possible = return_by_value_when_possible; } // Methods to override std::string component(TensorComponentType x) override { if ((static_cast(x) & static_cast(TensorComponentGroup::Constant))) { int32_t idx = static_cast(x) & static_cast(TensorComponentIndex::IndexMask); return std::to_string(idx - 1); } if (_return_by_value_when_possible) { if ((static_cast(x) & static_cast(TensorComponentGroup::Dimension))) { int32_t idx = static_cast(x) & static_cast(TensorComponentIndex::IndexMask); return std::to_string(_format.shape[idx]); } if ((static_cast(x) & static_cast(TensorComponentGroup::FoldedDimension))) { switch (x) { case TensorComponentType::Dim1xDim2: return std::to_string(_format.shape[1] * _format.shape[2]); case TensorComponentType::Dim1xDim2xDim3: return std::to_string(_format.shape[1] * _format.shape[2] * _format.shape[2]); default: std::cout << "Unsupported folded dimension" << std::endl; assert(false); } } } if (std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end()) { _components_required.push_back(x); } return 
build_component_name(x); } std::string component_type_declaration() const override { return "int"; }; DataType component_data_type() const override { return DataType::Int32; } std::string storage(GpuTensorStorage x) override { if (std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end()) { _storage_required.push_back(x); } return build_storage_name(x); } std::string storage_type_declaration(GpuTensorStorage x) const override { switch (x) { case GpuTensorStorage::BufferUint8Ptr: return "__global uchar*"; case GpuTensorStorage::Image2dReadOnly: return "__read_only image2d_t"; case GpuTensorStorage::Image2dWriteOnly: return "__write_only image2d_t"; case GpuTensorStorage::Image3dReadOnly: return "__read_only image3d_t "; case GpuTensorStorage::Image3dWriteOnly: return "__write_only image3d_t "; default: std::cout << "Unsupported storage" << std::endl; assert(false); return ""; } }; std::vector storage_declarations() const override { return _storage_required; } std::vector component_declarations() const override { return _components_required; } private: std::string build_storage_name(GpuTensorStorage x) const { std::string var_name = _basename; switch (x) { case GpuTensorStorage::BufferUint8Ptr: return var_name + "_ptr"; case GpuTensorStorage::Image2dReadOnly: case GpuTensorStorage::Image2dWriteOnly: return var_name + "_img2d"; case GpuTensorStorage::Image3dReadOnly: case GpuTensorStorage::Image3dWriteOnly: return var_name + "_img3d"; default: std::cout << "Unsupported storage" << std::endl; assert(false); } return var_name; } std::string build_component_name(TensorComponentType x) const { std::string var_name = _basename; switch (x) { case TensorComponentType::OffsetFirstElement: return var_name + "_offset_first_element"; case TensorComponentType::Stride1: return var_name + "_stride1"; case TensorComponentType::Stride2: return var_name + "_stride2"; case TensorComponentType::Stride3: return var_name + "_stride3"; case TensorComponentType::Dim0: return var_name + "_dim0"; case TensorComponentType::Dim1: return var_name + "_dim1"; case TensorComponentType::Dim2: return var_name + "_dim2"; case TensorComponentType::Dim3: return var_name + "_dim3"; case TensorComponentType::Dim1xDim2: return var_name + "_dim1xdim2"; case TensorComponentType::Dim1xDim2xDim3: return var_name + "_dim1xdim2xdim3"; default: std::cout << "Unsupported component" << std::endl; assert(false); } return var_name; } bool _return_by_value_when_possible{false}; std::vector _storage_required{}; std::vector _components_required{}; }; /** * @brief Data structure that contains the declared tiles by the components. * The registry is a linear data structure that follows the similar principle of the stack. The user can use the @p increment_registry_level() method to * increase the level of the stack (0 when it starts). When the user uses the @p decrement_registry_level() method, the registry decreases the level of the stack * and remove (pop) all the tiles from the level above. * When a tile is declared on the level 0, it is a global tile. A global tile is visible in all parts of the code. * Since different components may use the same name to define a tile, the registry adopts the IdSpace concept, an @p id to prevent name collisions * when declaring tiles among different components. 
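 *
 * A minimal usage sketch (illustrative only; the tile names "acc" and "tmp" are hypothetical):
 *
 *   GpuTileRegistry registry(GpuTargetLanguage::OpenCL);
 *   registry.set_IdSpace(0);
 *   registry.insert("acc", TileInfo(DataType::Fp32, 4, 8)); // level 0: global tile
 *   registry.increment_registry_level();
 *   registry.insert("tmp", TileInfo(DataType::Fp32, 4, 1)); // level 1: local tile
 *   registry.decrement_registry_level();                    // removes "tmp", keeps "acc"
 *   IVectorTile *acc = registry["acc"];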
* */ class GpuTileRegistry { public: enum class RegistryTileType { Tile, Link }; using RegistryIdSpace = int32_t; using RegistryLevel = int32_t; using RegistryTileName = std::string; struct RegistryTileTableEntry { RegistryLevel registry_level{0}; std::unique_ptr tile_object{nullptr}; }; struct RegistryTileTypeTableEntry { RegistryTileType tile_type{RegistryTileType::Tile}; RegistryTileName tile_name{}; RegistryIdSpace registry_idspace{0}; RegistryLevel registry_level{0}; }; using RegistryTileTable = std::map>; using RegistryTileTypeTable = std::map>; /** * @brief Construct a new Gpu Tile Registry object * */ GpuTileRegistry() { _language = GpuTargetLanguage::Unknown; } /** * @brief Construct a new Gpu Tile Registry object providing the Gpu programming language * * @param[in] language Gpu programming language to use */ GpuTileRegistry(GpuTargetLanguage language) { _language = language; } /** * @brief Default destructor. Destroy the Gpu Tile Registry object * */ ~GpuTileRegistry() = default; /** * @brief Set the working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles. * Therefore, the IdSpace should be set before declaring any tiles. * * @param[in] id The IdSpace id */ void set_IdSpace(int32_t id) { _IdSpace = id; } /** * @brief Get the current working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles * * @return The IdSpace id */ int32_t IdSpace() const { return _IdSpace; } /** * @brief Gets all the IdSpace declarations defined in the tile registry. * * @return all the IdSpace declarations defined in the tile registry as std::vector. It returns an empty vector if there are no IdSpace declarations. */ std::vector IdSpace_declarations() const { std::vector x; auto it = _frags.begin(); while (it != _frags.end()) { x.push_back(it->first); it++; } return x; } /** * @brief Declare a tile from a previously created tile */ void insert(const std::string &name, const IVectorTile *frag) { assert(_language == GpuTargetLanguage::OpenCL); const int32_t key_IdSpace = _IdSpace; const std::string key_var_name = name; const std::string var_name = frag->name(); TileInfo format = frag->format(); // First check whether a tile with the same name exists IVectorTile *result = (*this)[key_var_name]; assert(result == nullptr); if (result == nullptr) { std::unique_ptr tile = std::make_unique(var_name, format); _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); _frags[key_IdSpace][key_var_name].registry_level = _registry_level; _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Link; _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name; _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace; _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level; } } /** * @brief Declare a tile with TileInfo. The tile will be stored in the IdSpace set with @p set_IdSpace() * * @note The reference name used for declaring the tile should not be previously used in the IdSpace * * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry. 
 * @param[in] format Tile format to use
 */
void insert(const std::string &name, const TileInfo &format)
{
    assert(_language == GpuTargetLanguage::OpenCL);
    const int32_t     key_IdSpace  = _IdSpace;
    const std::string key_var_name = name;
    const std::string var_name     = generate_tile_name(name);

    // First check whether a tile with the same name exists
    IVectorTile *result = (*this)[key_var_name];
    assert(result == nullptr);
    if (result == nullptr)
    {
        std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);

        _frags[key_IdSpace][key_var_name].tile_object    = std::move(tile);
        _frags[key_IdSpace][key_var_name].registry_level = _registry_level;

        _frag_types[key_IdSpace][key_var_name].tile_type        = RegistryTileType::Tile;
        _frag_types[key_IdSpace][key_var_name].tile_name        = key_var_name;
        _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
        _frag_types[key_IdSpace][key_var_name].registry_level   = _registry_level;
    }
}
/**
 * @brief Declare a constant tile. The content of the tile is passed as a vector of std::string
 *
 * @note The reference name used for declaring the tile should not be previously used in the IdSpace
 *
 * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
 * @param[in] in   A 3D std::vector of std::string. From the 3D std::vector we can know the dimensions for the tile
 * @param[in] dt   The data type for the elements stored in the 3D std::vector as std::string. It is the user's responsibility to ensure
 *                 that the data type is aligned with the content of the std::string.
 */
void insert(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt)
{
    assert(_language == GpuTargetLanguage::OpenCL);
    const int32_t     key_IdSpace  = _IdSpace;
    const std::string key_var_name = name;

    // First check whether a tile with the same name exists
    IVectorTile *result = (*this)[key_var_name];
    assert(result == nullptr);
    if (result == nullptr)
    {
        std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);

        _frags[key_IdSpace][key_var_name].tile_object    = std::move(tile);
        _frags[key_IdSpace][key_var_name].registry_level = _registry_level;

        _frag_types[key_IdSpace][key_var_name].tile_type        = RegistryTileType::Tile;
        _frag_types[key_IdSpace][key_var_name].tile_name        = key_var_name;
        _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
        _frag_types[key_IdSpace][key_var_name].registry_level   = _registry_level;
    }
}
/**
 * @brief Declare an anonymous constant tile. The content of the tile is passed as a vector of std::string
 *
 * @note This method can be used to declare temporary tiles that need to be accessed only once.
 *
 * @param[in] in A 3D std::vector of std::string. From the 3D std::vector we can know the dimensions for the tile
 * @param[in] dt The data type for the elements stored in the 3D std::vector as std::string. It is the user's responsibility to ensure
 *               that the data type is aligned with the content of the std::string.
* * @return IVectorTile* the anonymous constant tile */ IVectorTile *insert(const std::vector> &in, DataType dt) { assert(_language == GpuTargetLanguage::OpenCL); const int32_t key_IdSpace = _IdSpace; const std::string key_var_name = "_" + std::to_string(_anonymous_frag_count++); // First check whether a tile with the same name exists IVectorTile *result = (*this)[key_var_name]; assert(result == nullptr); if (result == nullptr) { std::unique_ptr tile = std::make_unique(in, dt); _frags[key_IdSpace][key_var_name].tile_object = std::move(tile); _frags[key_IdSpace][key_var_name].registry_level = _registry_level; _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile; _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name; _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace; _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level; } return (*this)[key_var_name]; } /** * @brief Get the tile from the registry. This method searches the tile in the IdSpace provided by the user * * @param[in] name The name of the tile to retrieve * @param[in] IdSpace The IdSpace id where to search the tile * * @return IVectorTile* The tile */ IVectorTile *get(const std::string &name, int32_t IdSpace) { const int32_t key_IdSpace = IdSpace; const std::string key_var_name = name; IVectorTile *result = nullptr; auto search_IdSpace = _frags.find(key_IdSpace); if (search_IdSpace != _frags.end()) { auto search_tile = _frags[key_IdSpace].find(key_var_name); if (search_tile != _frags[key_IdSpace].end()) { result = search_tile->second.tile_object.get(); assert(result != nullptr); } } return result; } /** * @brief Get the tile from the registry. This method searches the tile in the IdSpace set with @p set_IdSpace() * * @param[in] name The name of the tile to retrieve * * @return IVectorTile* The tile */ IVectorTile *operator[](const std::string &name) { return get(name, _IdSpace); } /** * @brief Check whether the tile in the in the IdSpace provided by the user exists * * @param[in] name Name of the tile to search for * @param[in] IdSpace The IdSpace id where to search the tile * * @return true if the tile exists * @return false if the tile does not exist */ bool has_tile(const std::string &name, int32_t IdSpace) const { const int32_t key_IdSpace = IdSpace; const std::string key_var_name = name; // IVectorTile* result = nullptr; auto search_IdSpace = _frags.find(key_IdSpace); return search_IdSpace != _frags.end(); } /** * @brief Check whether the tile within the current IdSpace exists * * @param[in] name Name of the tile to search for * * @return true if the tile exists * @return false if the tile does not exist */ bool has_tile(const std::string &name) const { return has_tile(name, _IdSpace); } /** * @brief Get all the tiles declared within the IdSpace provided by the user * * @param[in] IdSpace IdSpace where to retrieve all the declared tiles * * @return std::vector A vector with all the declared tiles in the IdSpace provided by the user */ std::vector tile_declarations(int32_t IdSpace) { std::vector tiles; std::map::iterator it = _frag_types[IdSpace].begin(); while (it != _frag_types[IdSpace].end()) { // The following line should be enabled. However, we cannot at this stage // because it used to retrieve the output tile produced by each component. 
// However, this method should NOT be used to retrieve the output tile //if(it->second.tile_type == RegistryTileType::Tile) { tiles.push_back(get(it->second.tile_name, it->second.registry_idspace)); } it++; } return tiles; } /** * @brief Increase the level of stack. * */ void increment_registry_level() { _registry_level++; } /** * @brief Remove all the tiles declared at the current stack level and decrease the level of the stack. * */ void decrement_registry_level() { assert(_registry_level >= 0); // Remove all variables in the local scope std::map::iterator it = _frags[_IdSpace].begin(); while (it != _frags[_IdSpace].end()) { if (it->second.registry_level == _registry_level) { it = _frags[_IdSpace].erase(it); } else { it++; } } std::map::iterator it_type = _frag_types[_IdSpace].begin(); while (it_type != _frag_types[_IdSpace].end()) { if (it_type->second.registry_level == _registry_level) { it_type = _frag_types[_IdSpace].erase(it_type); } else { it_type++; } } _registry_level--; } /** * @brief Get the level of the stack * */ int32_t level() const { return _registry_level; } private: // This method ensures that the key is unique among different components std::string generate_tile_name(const std::string &name) { assert(_IdSpace >= 0); if (_registry_level == 0) { return "_G" + std::to_string(_IdSpace) + "_" + name; } else { return name; } } RegistryTileTable _frags{}; RegistryTileTypeTable _frag_types{}; RegistryLevel _registry_level{0}; RegistryIdSpace _IdSpace{-1}; int32_t _anonymous_frag_count{0}; // Counter used to create the anonymous tiles GpuTargetLanguage _language{GpuTargetLanguage::Unknown}; // Gpu programming language }; using TensorEntry = std::unique_ptr; /** * @brief Data structure that contains the tensors consumed by the components. * Since different components may use the same name as reference for a tensor, the registry adopts the IdSpace concept, an @p id to prevent name collisions * when declaring tensors among different components. * */ class GpuTensorArgumentRegistry { public: /** * @brief Construct a new Gpu Tensor Registry object * */ GpuTensorArgumentRegistry() { _language = GpuTargetLanguage::Unknown; } /** * @brief Construct a new Gpu Tensor Registry object * * @param[in] language Gpu programming language to use */ GpuTensorArgumentRegistry(GpuTargetLanguage language) { _language = language; } /** * @brief Default destructor. Destroy the Gpu Tensor Registry object * */ ~GpuTensorArgumentRegistry() = default; /** * @brief Set the working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors. * Therefore, the IdSpace should be set before declaring any tensors. * * @param[in] id The IdSpace id */ void set_IdSpace(int32_t id) { _IdSpace = id; } /** * @brief Get the current working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors * * @return The IdSpace id */ int32_t IdSpace() const { return _IdSpace; } /** * @brief Gets all the IdSpace declarations defined in the tensor registry. * * @return all the IdSpace declarations defined in the tensor registry as std::vector. It returns an empty vector if there are no IdSpace declarations. */ std::vector IdSpace_declarations() const { std::vector x; auto it = _refs.begin(); while (it != _refs.end()) { x.push_back(it->first); it++; } return x; } /** * @brief Declare a tensor with TensorInfo. 
The tensor will be stored in the IdSpace set with @p set_IdSpace() * * @note The reference name used for declaring the tensor should not be previously used in the IdSpace * * @param[in] name Reference name for the tensor. The reference name can be used to retrieve the tensor stored in the registry. * @param[in] x Pair of tensor info and tensor id * @param[in] return_by_value_when_possible True if we want the value stored in the tensor components */ void insert(const std::string &name, const TensorInfo &x, bool return_by_value_when_possible) { assert(_language == GpuTargetLanguage::OpenCL); const int32_t key_IdSpace = _IdSpace; const int32_t tensor_id = x.id; const std::string key_var_name = name; const std::string var_name = generate_tensor_name(name, tensor_id); // First, check whether the tensor has already a reference. If so, trigger an assert assert(!has_tensor_argument(name)); // Check whether a tensor with that tensorID exists auto result = _tensor_arguments.find(tensor_id); if (result == _tensor_arguments.end()) { // It means that we haven't added a tensor with that tensor_id yet. Create a IGpuTensorArgument before creating the reference std::unique_ptr arg = std::make_unique(var_name, x, return_by_value_when_possible); _tensor_arguments[tensor_id] = std::move(arg); } _refs[key_IdSpace][key_var_name] = tensor_id; } /** * @brief Get the tensor from the registry. This method searches the tensor in the IdSpace set with @p set_IdSpace() * * @param[in] name The name of the tensor to retrieve * * @return IGpuTensor* The tensor */ IGpuTensorArgument *operator[](const std::string &name) { const int32_t key_IdSpace = _IdSpace; const std::string key_var_name = name; IGpuTensorArgument *result = nullptr; auto search_IdSpace = _refs.find(key_IdSpace); if (search_IdSpace != _refs.end()) { auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); if (search_tensor_id != _refs[key_IdSpace].end()) { const int32_t tensor_id = search_tensor_id->second; auto search_tensor_argument = _tensor_arguments.find(tensor_id); if (search_tensor_argument != _tensor_arguments.end()) { result = search_tensor_argument->second.get(); } assert(result != nullptr); } } return result; } /** * @brief Get all the tensors declared in the IdSpace provided by the user * * @return std::vector A vector with all the declared tensors */ std::vector tensor_argument_declarations() { std::vector args; auto it = _tensor_arguments.begin(); while (it != _tensor_arguments.end()) { args.push_back(it->second.get()); it++; } return args; } /** * @brief Check whether the tensor argument in the IdSpace set with @p set_IdSpace() exists * * @param[in] name Name of the tensor argument to search for * * @return true if the tensor argument exists * @return false if the tensor argument does not exist */ bool has_tensor_argument(const std::string &name) { const int32_t key_IdSpace = _IdSpace; const std::string key_var_name = name; auto search_IdSpace = _refs.find(key_IdSpace); if (search_IdSpace != _refs.end()) { auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); return search_tensor_id != _refs[key_IdSpace].end(); } else { return false; } } /** * @brief Check whether the tensor argument is in the the IdSpace provided by the user * * @param[in] name Name of the tensor argument to search for * @param[in] IdSpace The IdSpace id where to search the tensor argument * * @return true if the tile exists * @return false if the tile does not exist */ bool has_tensor_argument(const std::string &name, int32_t IdSpace) { const int32_t 
key_IdSpace = IdSpace; const std::string key_var_name = name; auto search_IdSpace = _refs.find(key_IdSpace); if (search_IdSpace != _refs.end()) { auto search_tensor_id = _refs[key_IdSpace].find(key_var_name); return search_tensor_id != _refs[key_IdSpace].end(); } else { return false; } } private: // This method ensures that the key is unique among different components std::string generate_tensor_name(const std::string &name, int32_t tensor_id) { assert(tensor_id >= 0); return name + std::to_string(tensor_id); } std::map _tensor_arguments{}; std::map> _refs{}; int32_t _IdSpace{-1}; GpuTargetLanguage _language{GpuTargetLanguage::Unknown}; // Gpu programming language }; enum class OpType : int32_t { Elementwise = 0x0000, Relational = 0x1000, Algebra = 0x2000 }; inline std::string to_string(AssignmentOp op) { switch (op) { case AssignmentOp::Decrement: return "-="; case AssignmentOp::Increment: return "+="; default: assert(false); return ""; } } inline std::string to_string(UnaryOp op) { switch (op) { case UnaryOp::LogicalNot: return "!"; case UnaryOp::BitwiseNot: return "~"; case UnaryOp::Negate: return "-"; default: assert(false); return ""; } } inline std::string to_string(BinaryOp op) { switch (op) { case BinaryOp::Add: return "+"; case BinaryOp::Sub: return "-"; case BinaryOp::Mul: return "*"; case BinaryOp::Div: return "/"; case BinaryOp::Mod: return "%"; case BinaryOp::Equal: return "=="; case BinaryOp::Less: return "<"; case BinaryOp::LessEqual: return "<="; case BinaryOp::Greater: return ">"; case BinaryOp::GreaterEqual: return ">="; case BinaryOp::LogicalAnd: return "&&"; case BinaryOp::LogicalOr: return "||"; case BinaryOp::BitwiseXOR: return "^"; default: assert(false); return ""; } } inline std::string binary_op_string(BinaryOp op) { switch (op) { case BinaryOp::Add: return "add"; case BinaryOp::Sub: return "sub"; case BinaryOp::Mul: return "mul"; case BinaryOp::Div: return "div"; case BinaryOp::Mod: return "mod"; case BinaryOp::Equal: return "eq"; case BinaryOp::Less: return "gt"; case BinaryOp::LessEqual: return "gteq"; case BinaryOp::Greater: return "lt"; case BinaryOp::GreaterEqual: return "lte"; default: assert(false); return ""; } } enum class OperandType : int32_t { Unknown = 0x00000000, ScalarFp32 = 0x00001011, // Immediate scalar tile ScalarFp16 = 0x00001012, // Immediate scalar tile ScalarInt32 = 0x00001021, // Immediate scalar tile ScalarInt16 = 0x00001022, // Immediate scalar tile ScalarInt8 = 0x00001024, // Immediate scalar tile ScalarUInt32 = 0x00001031, // Immediate scalar tile ScalarUInt16 = 0x00001032, // Immediate scalar tile ScalarUInt8 = 0x00001034, // Immediate scalar tile ScalarBool = 0x00001041, // Immediate scalar tile ScalarTile = 0x00001050, // Scalar from a tile Tile = 0x00010000, // Tile TensorStride1 = 0x00100001, // Tensor component TensorStride2 = 0x00100002, // Tensor component TensorStride3 = 0x00100003, // Tensor component TensorStride4 = 0x00100004, // Tensor component TensorDim0 = 0x00100010, // Tensor component TensorDim1 = 0x00100020, // Tensor component TensorDim2 = 0x00100030, // Tensor component TensorDim3 = 0x00100040, // Tensor component TensorDim4 = 0x00100050, // Tensor component TensorC = 0x00100010, // Tensor component TensorW = 0x00100020, // Tensor component TensorH = 0x00100030, // Tensor component TensorD = 0x00100040, // Tensor component TensorN = 0x00100050, // Tensor component TensorDim1xDim2 = 0x00100100, // Tensor component TensorDim1xDim2xDim3 = 0x00100200, // Tensor component TensorWxH = 0x00100300, // Tensor component 
TensorWxHxD = 0x00100400, // Tensor component TensorDataOffset = 0x00100500, // Tensor component }; struct ScalarTileCoord { ScalarTileCoord() { } ScalarTileCoord(int32_t x0, int32_t y0) : x(x0), y(y0) { } int32_t x{-1}; int32_t y{-1}; }; /** * @brief Operand class. This object is used to pass the operands to the operations performed by the writer. * Operand can be of three types: * -# Scalar immediate: constant expression * -# Tile: A tile * -# Tensor component: A component (scalar) of a tensor * */ class Operand { public: Operand(const std::string &val) { _str = val; _type = OperandType::Tile; } Operand(const std::string &val, const ScalarTileCoord &coord) { _str = val; _type = OperandType::ScalarTile; _coord = coord; } Operand(const std::string &val, OperandType type) { _str = val; _type = type; } Operand(const Operand &t) { _str = t.value(); _type = t.type(); } Operand &operator=(const Operand &t) { _str = t.value(); _type = t.type(); _coord = t.scalar_tile_coordinate(); return *this; } std::string value() const { return _str; } OperandType type() const { return _type; } ScalarTileCoord scalar_tile_coordinate() const { return _coord; } private: std::string _str{}; OperandType _type{OperandType::Unknown}; ScalarTileCoord _coord{}; }; using GpuSamplerTensorStorage = GpuTensorStorage; struct GpuSampler { GpuSampler() = default; TensorSamplerFormat format{TensorSamplerFormat::Unknown}; GpuSamplerTensorStorage storage{GpuSamplerTensorStorage::Unknown}; TensorSamplerAddressModeX address_mode_x{TensorSamplerAddressModeX::Unknown}; TensorSamplerAddressModeY address_mode_y{TensorSamplerAddressModeY::Unknown}; TensorSamplerAddressModeZ address_mode_z{TensorSamplerAddressModeZ::Unknown}; }; inline GpuSampler create_simple_sampler( const TensorInfo *tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, int32_t step_z) { CKW_UNUSED(step_x, step_y, step_z); auto tensor = tensor_info_id->shape; GpuSampler dst_sampler; dst_sampler.format = sampler.format; dst_sampler.storage = GpuSamplerTensorStorage::BufferUint8Ptr; dst_sampler.address_mode_x = sampler.address_mode_x; dst_sampler.address_mode_y = sampler.address_mode_y; dst_sampler.address_mode_z = sampler.address_mode_z; int32_t dim_x = 0; int32_t dim_y = 0; int32_t dim_z = 0; switch (sampler.format) { case TensorSamplerFormat::C_W_H: dim_x = tensor[0]; dim_y = tensor[1]; dim_z = tensor[2]; break; case TensorSamplerFormat::C_WH_1: dim_x = tensor[0]; dim_y = tensor[1] * tensor[2]; dim_z = 1; break; default: std::cout << "Unsupported tensor format" << std::endl; assert(false); break; } if (dim_x == 1) { assert(step_x == 1); dst_sampler.address_mode_x = TensorSamplerAddressModeX::None; } if (dim_y == 1) { assert(step_y == 1); dst_sampler.address_mode_y = TensorSamplerAddressModeY::None; } if (dim_z == 1) { assert(step_z == 1); dst_sampler.address_mode_z = TensorSamplerAddressModeZ::None; } return dst_sampler; } class GpuOutputSampler { public: GpuOutputSampler() = default; /** * @brief Method used to initialize the GpuOutputSampler. The GpuOutputSampler can be initialized only once * by the root component. Once initialized, all simpler components will need to used this sampler * or a broadcasted version of it * * @param[in] sampler GpuSampler * @param[in] step_x Increment step in the X direction. Not necessarily it is the same of n0 of tile! * @param[in] step_y Increment step in the Y direction. Not necessarily it is the same of m0 of tile! * @param[in] step_z Increment step in the Z direction. 
Not necessarily it is the same of d0 of tile! */ void initialize(const TensorInfo *tensor_info_id, GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format, int32_t step_x, int32_t step_y, int32_t step_z) { assert(_is_initialized == false); _step_x = step_x; _step_y = step_y; _step_z = step_z; _tensor_info_id = tensor_info_id; _sampler = create_sampler(tensor_storage, tensor_format); _is_initialized = true; }; GpuSampler sampler() const { return _sampler; }; int32_t step_x() const { return _step_x; }; int32_t step_y() const { return _step_y; }; int32_t step_z() const { return _step_z; }; private: GpuSampler create_sampler(GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format) { // Output can only be in output mode assert(tensor_storage != GpuSamplerTensorStorage::Image2dReadOnly); assert(tensor_storage != GpuSamplerTensorStorage::Image3dReadOnly); auto tensor = _tensor_info_id->shape; GpuSampler sampler; sampler.format = tensor_format; sampler.storage = tensor_storage; sampler.address_mode_x = TensorSamplerAddressModeX::None; sampler.address_mode_y = TensorSamplerAddressModeY::None; sampler.address_mode_z = TensorSamplerAddressModeZ::None; // In the case of texture, we do not need any special checks at the border if (tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr) { int32_t dim_x = 0; int32_t dim_y = 0; int32_t dim_z = 0; switch (tensor_format) { case TensorSamplerFormat::C_W_H: dim_x = tensor[0]; dim_y = tensor[1]; dim_z = tensor[2]; break; case TensorSamplerFormat::C_WH_1: dim_x = tensor[0]; dim_y = tensor[1] * tensor[2]; dim_z = 1; break; default: std::cout << "Unsupported tensor format" << std::endl; assert(false); break; } if ((dim_x % _step_x) != 0 && dim_x != 1) { sampler.address_mode_x = TensorSamplerAddressModeX::OverlappingMin; } if ((dim_y % _step_y) != 0 && dim_y != 1) { sampler.address_mode_y = TensorSamplerAddressModeY::ClampToMaxEdgeOnly; } if ((dim_z % _step_z) != 0 && dim_z != 1) { sampler.address_mode_z = TensorSamplerAddressModeZ::ClampToMaxEdgeOnly; } } return sampler; } GpuSampler _sampler{}; // GpuSampler int32_t _step_x{1}; int32_t _step_y{1}; int32_t _step_z{1}; const TensorInfo *_tensor_info_id{nullptr}; bool _is_initialized{false}; }; /** * @brief Tensor operand class. This object is used to pass the operands as tensor to the operations performed by the writer. */ class TensorOperand { public: TensorOperand(const std::string &val, GpuSampler sampler) : _str(val), _sampler(sampler) { } TensorOperand &operator=(const TensorOperand &t) { _str = t.value(); _sampler = t.sampler(); return *this; } std::string value() const { return _str; } GpuSampler sampler() const { return _sampler; } private: std::string _str{}; GpuSampler _sampler{}; }; /** * @brief Data structure that contains all the necessary information to write the Gpu kernel with the Gpu kernel Writer * This data structure must be initialized before being passed to the Gpu Kernel Writer * */ class GpuKernelWriterDataHolder { public: /** * @brief Construct a new Gpu Kernel Data object. In this phase, we should also store * the GPU target and target specific capabilities (extensions). 
For now, we just initialize the * programming language * * @param[in] language Gpu programming language to use */ GpuKernelWriterDataHolder(GpuTargetLanguage language) : tiles(language), arguments(language), code(""), _language(language) { } /** * @brief Get the Gpu programming language used * * @return GpuTargetLanguage the Gpu programming language */ GpuTargetLanguage programming_language() const { return _language; } /** * @brief @ref GpuTileRegistry * */ GpuTileRegistry tiles{}; /** * @brief @ref GpuTensorArgumentRegistry * */ GpuTensorArgumentRegistry arguments{}; /** * @brief @ref GpuOutputSampler. * */ GpuOutputSampler output_sampler{}; /** * @brief Source code * */ std::string code{}; // GpuExtensionRegistry extensions{}; private: GpuTargetLanguage _language; }; struct LWS { int32_t x{1}; int32_t y{1}; int32_t z{1}; }; /** * @brief Utility class used to get the tile from the operand. If the operand is not a tile, @ref OperandUnpacker * declare an anonymous tile in the tile registry. */ class OperandUnpacker { public: OperandUnpacker(GpuTileRegistry &tiles, GpuTensorArgumentRegistry &arguments) : _tiles(tiles), _arguments(arguments) { // Increase the level of the stack to allocate possible temporary tiles _tiles.increment_registry_level(); }; ~OperandUnpacker() { // Decrease the level of the stack to deallocate any temporary tiles _tiles.decrement_registry_level(); } IVectorTile *unpack(const Operand &src) { // Get the tile if (src.type() == OperandType::Tile) { assert(_tiles.has_tile(src.value())); return _tiles[src.value()]; } // Create an anonymous tile with a constant else if (static_cast(src.type()) & 0x00001000) { if (src.type() == OperandType::ScalarTile) { ScalarTileCoord coord = src.scalar_tile_coordinate(); assert(_tiles.has_tile(src.value())); assert(coord.x >= 0); assert(coord.y >= 0); auto val = _tiles[src.value()]->scalar(coord.x, coord.y); return _tiles.insert({{{val.str}}}, val.type.dt); } else { return _tiles.insert({{{src.value()}}}, to_tile_data_type(src.type())); } } // Create an anonymous tile with the tensor component else { assert(_arguments.has_tensor_argument(src.value())); auto x = _arguments[src.value()]; const std::string val = x->component(to_tensor_component(src.type())); const DataType dt = x->component_data_type(); return _tiles.insert({{{val}}}, dt); } } private: DataType to_tile_data_type(OperandType x) { return static_cast(static_cast(x) & 0x00ff); } TensorComponentType to_tensor_component(OperandType x) { switch (x) { case OperandType::TensorDim0: return TensorComponentType::Dim0; case OperandType::TensorDim1: return TensorComponentType::Dim1; case OperandType::TensorDim2: return TensorComponentType::Dim2; case OperandType::TensorDim3: return TensorComponentType::Dim3; case OperandType::TensorDim4: return TensorComponentType::Dim4; case OperandType::TensorStride1: return TensorComponentType::Stride1; case OperandType::TensorStride2: return TensorComponentType::Stride2; case OperandType::TensorStride3: return TensorComponentType::Stride3; case OperandType::TensorStride4: return TensorComponentType::Stride4; case OperandType::TensorDim1xDim2: return TensorComponentType::Dim1xDim2; case OperandType::TensorDim1xDim2xDim3: return TensorComponentType::Dim1xDim2xDim3; case OperandType::TensorDataOffset: return TensorComponentType::OffsetFirstElement; default: assert(false); return TensorComponentType::Unknown; } } GpuTileRegistry &_tiles; GpuTensorArgumentRegistry &_arguments; }; /** * @brief Utility class used to get the tensor argument from the 
operand. If the operand is not a tile, @ref OperandUnpacker * declare an anonymous tile in the tile registry. * Tensor dimension reduction aims for reducing the tensor data dimension while keeping data's tensor structure. */ class TensorOperandUnpacker { public: TensorOperandUnpacker(GpuTensorArgumentRegistry &arguments) : _arguments(arguments){}; IGpuTensorArgument *unpack(const TensorOperand &src) { assert(_arguments.has_tensor_argument(src.value())); return _arguments[src.value()]; } private: GpuTensorArgumentRegistry &_arguments; }; /** * @brief The GpuKernel will be used in three occasions (stages): * #- Compilation stage * #- Tuning stage * #- Dispatch stage */ struct GpuKernel { // Compilation stage std::string code{}; // Source code, required for the compilation stage std::vector list_extensions{}; // Extensions, required for the compilation stage // Tuning stage std::string config_id{}; // Unique id, required for the tuning stage std::vector list_lws{}; // LWS to test, required for the tuning stage // Dispatch stage GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage std::vector> list_tensor_storages; // List of tensor storages, required for the dispatch stage std::vector> list_tensor_components; // List of tensor components (width, stride,..), required for the dispatch stage) }; // Generate all extension pragmas (hardcoded for now) inline std::string generate_extensions() { std::string ext = R"( #if defined(cl_khr_fp16) #pragma OPENCL EXTENSION cl_khr_fp16 : enable #endif // defined(cl_khr_fp16) #if defined(cl_arm_integer_dot_product_int8) #pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable #endif // defined(cl_arm_integer_dot_product_int8) #if defined(cl_arm_integer_dot_product_accumulate_int8) #pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable #endif // defined(cl_arm_integer_dot_product_accumulate_int8) #if defined(cl_arm_printf) #pragma OPENCL EXTENSION cl_arm_printf : enable #endif // defined(cl_arm_printf); )"; return ext; } // This function should produce an object with the source inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::string &name) { std::string code; code += generate_extensions(); code += "__kernel void "; code += name; code += "(\n"; auto IdSpaces = in.arguments.IdSpace_declarations(); std::vector arg_str; auto tensor_args = in.arguments.tensor_argument_declarations(); for (auto &i : tensor_args) { // For each tensor used, get the storage and tensor components auto storages = i->storage_declarations(); auto components = i->component_declarations(); for (auto &y : storages) { std::string str; str += i->storage_type_declaration(y); str += " "; str += i->storage(y); arg_str.push_back(str); } for (auto &y : components) { std::string str; str += i->component_type_declaration(); str += " "; str += i->component(y); arg_str.push_back(str); } } for (size_t i = 0; i < arg_str.size(); ++i) { code += arg_str[i]; if (i + 1 < arg_str.size()) { code += ",\n"; } } code += ")\n"; code += "{\n"; code += in.code; code += "}\n"; return code; } /** * @brief This class is responsible to map a N-Tensor to a 3d tensor. 
The mapper needs the GpuSampler to know * how to reduce the dimensionality of a tensor * */ class GpuTensor3dMapper { public: GpuTensor3dMapper(IGpuTensorArgument *tensor, GpuSampler sampler) : _sampler(sampler), _tensor(tensor){}; std::string tensor_component_x() const { const auto format = _sampler.format; switch (format) { case TensorSamplerFormat::C_WH_1: case TensorSamplerFormat::C_W_H: return _tensor->component(TensorComponentType::Dim0); default: std::cout << "Unsupported tensor format" << std::endl; assert(false); return ""; } } std::string tensor_component_y() const { const auto format = _sampler.format; switch (format) { case TensorSamplerFormat::C_WH_1: return _tensor->component(TensorComponentType::Dim1xDim2); case TensorSamplerFormat::C_W_H: return _tensor->component(TensorComponentType::Dim1); default: std::cout << "Unsupported tensor format" << std::endl; assert(false); return ""; } } std::string tensor_component_z() const { const auto format = _sampler.format; switch (format) { case TensorSamplerFormat::C_WH_1: return "1"; case TensorSamplerFormat::C_W_H: return _tensor->component(TensorComponentType::Dim2); default: std::cout << "Unsupported tensor format" << std::endl; assert(false); return ""; } } std::string tensor_component_stride_y() const { const auto format = _sampler.format; switch (format) { case TensorSamplerFormat::C_WH_1: case TensorSamplerFormat::C_W_H: return _tensor->component(TensorComponentType::Stride1); default: std::cout << "Unsupported tensor format" << std::endl; assert(false); return ""; } } std::string tensor_component_stride_z() const { const auto format = _sampler.format; switch (format) { case TensorSamplerFormat::C_WH_1: return "0"; case TensorSamplerFormat::C_W_H: return _tensor->component(TensorComponentType::Stride2); default: std::cout << "Unsupported tensor format" << std::endl; assert(false); return ""; } } std::string tensor_component_stride_batch() const { const auto format = _sampler.format; switch (format) { case TensorSamplerFormat::C_WH_1: case TensorSamplerFormat::C_W_H: return _tensor->component(TensorComponentType::Stride3); default: std::cout << "Unsupported tensor format" << std::endl; assert(false); return ""; } } bool is_one_component_x() const { auto t = _tensor->format(); const auto format = _sampler.format; switch (format) { case TensorSamplerFormat::C_WH_1: case TensorSamplerFormat::C_W_H: return t.shape[0] == 1; default: std::cout << "Unsupported tensor format" << std::endl; assert(false); return ""; } } bool is_one_component_y() const { auto t = _tensor->format(); const auto format = _sampler.format; switch (format) { case TensorSamplerFormat::C_WH_1: return (t.shape[1] * t.shape[2]) == 1; case TensorSamplerFormat::C_W_H: return t.shape[1] == 1; default: std::cout << "Unsupported tensor format" << std::endl; assert(false); return ""; } } bool is_one_component_z() const { auto t = _tensor->format(); const auto format = _sampler.format; switch (format) { case TensorSamplerFormat::C_WH_1: return true; case TensorSamplerFormat::C_W_H: return t.shape[2] == 1; default: std::cout << "Unsupported tensor format" << std::endl; assert(false); return ""; } } bool is_one_component_batch() const { auto t = _tensor->format(); const auto format = _sampler.format; switch (format) { case TensorSamplerFormat::C_WH_1: case TensorSamplerFormat::C_W_H: return t.shape[3] == 1; default: std::cout << "Unsupported tensor format" << std::endl; assert(false); return ""; } } GpuSampler gpu_sampler() const { return _sampler; } IGpuTensorArgument 
*tensor_argument() const { return _tensor; } private: GpuSampler _sampler; IGpuTensorArgument *_tensor; }; struct GpuKernelWriterAttribute { bool return_tensor_component_by_value{false}; }; enum class RoundingMode { None, Rte, Rtz, Rtp, Rtn }; // https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.html class IGpuKernelWriter { public: virtual ~IGpuKernelWriter() = default; virtual void set_IdSpace(int32_t id) = 0; virtual void import_tile(const std::string &dst, const IVectorTile *src) = 0; virtual void declare_argument(const std::string &name, const TensorInfo &tensor) = 0; virtual void declare_tile(const std::string &name, const TileInfo &info) = 0; virtual void declare_const_tile(const std::string &name, const std::vector> &in, DataType dt) = 0; virtual void write_text(const std::string &x) = 0; virtual void compound_statement_begin() = 0; virtual void compound_statement_end() = 0; // Operations virtual void op_get_global_id(const Operand &dst_var, int32_t dim) = 0; virtual void op_get_global_coord(const Operand &dst, const Operand &step, const TensorOperand &tensor, int32_t dim) = 0; virtual void op_get_global_batch(const Operand &dst, const TensorOperand &tensor) = 0; virtual void op_get_global_size(const Operand &dst_var, int32_t dim) = 0; virtual void op_unary_expression(const Operand &dst, UnaryOp op, const Operand &src) = 0; virtual void op_binary_expression(const Operand &dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0; virtual void op_assign(const Operand &dst_name, const Operand &src_name) = 0; virtual void op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) = 0; virtual void op_binary_elementwise_function(const Operand &dst_name, BinaryFunction func, const Operand &first_name, const Operand &second_name) = 0; virtual void op_ternary_elementwise_function(const Operand &dst_name, TernaryFunction func, const Operand &first_name, const Operand &second_name, const Operand &third_name) = 0; virtual void op_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0; virtual void op_else_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0; virtual void op_else_header() = 0; virtual void op_for_loop_header(const Operand &var_name, BinaryOp cond_op, const Operand &cond_value, const Operand &update_var, AssignmentOp update_op, const Operand &update_value) = 0; virtual void op_load_indirect(const TensorOperand &tensor, const Operand &dst, const Operand &x, const Operand &y_indirect, const Operand &z, const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0; virtual void op_load_immediate(const TensorOperand &tensor, const Operand &dst, const Operand &x, const Operand &y, const Operand &z, const Operand &b = Operand("0", OperandType::ScalarInt32), const Operand &dilation_y = Operand("1", OperandType::ScalarInt32)) = 0; virtual void op_store_immediate(const TensorOperand &tensor, const Operand &src, const Operand &x, const Operand &y, const Operand &z, const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0; virtual void op_cast_expression(const Operand &dst, const Operand &src, ConvertPolicy policy) = 0; virtual void op_return() = 0; // Utils // It is the process of converting virtual void util_get_indirect_buffer(const Operand &dst, const TensorOperand &tensor, const Operand &x, const Operand &y, const Operand &x_off, const Operand &y_off) = 0; }; enum class GpuLoadStoreType { Load = 1, Store = 2 }; class IGpuLoadStoreHelperWriter { public: 
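    // Usage contract (descriptive summary of the interface below): initialize() binds the destination tile
    // and the x/z/b coordinate tiles, write() emits the load/store for one row given its (row index,
    // y-coordinate) pair, and finalize() emits any out-of-bound / leftover handling collected during the writes.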
IGpuLoadStoreHelperWriter(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type) : _writer(x), _mapper(mapper), _type(type) { } IGpuLoadStoreHelperWriter(const IGpuLoadStoreHelperWriter &) = default; IGpuLoadStoreHelperWriter &operator=(const IGpuLoadStoreHelperWriter &) = default; virtual ~IGpuLoadStoreHelperWriter() = default; virtual void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) = 0; virtual void write(const std::pair &y) = 0; virtual void finalize() = 0; protected: IGpuKernelWriter *_writer; GpuTensor3dMapper _mapper; GpuLoadStoreType _type; }; class ClLoadStoreBufferHelperWriter : public IGpuLoadStoreHelperWriter { public: ClLoadStoreBufferHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type) { } ClLoadStoreBufferHelperWriter(const ClLoadStoreBufferHelperWriter &) = default; ClLoadStoreBufferHelperWriter &operator=(const ClLoadStoreBufferHelperWriter &) = default; static bool validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst) { CKW_UNUSED(x, type, dst); if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr) { return false; } return true; } void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override { assert(validate(_writer, _mapper, _type, dst)); _dst = dst; _ls_width_full = dst->format().w; _coord_x = x->scalar(0, 0).str; _coord_z = z->scalar(0, 0).str; _coord_b = b->scalar(0, 0).str; _coord_orig_z = _coord_z; out_of_bound_initialize_x(_coord_x); out_of_bound_initialize_z(_coord_z); /* meaning of else: - x: partial load/store - y: no load/store operation - z: no load/store operation if(x) { if(z) { if(y) { // full load/store width } else { // no load/store } } else { // no load/store } } else { if(z) { if(y) { // partial load/store width } else { // no load/store } } else { // no load/store } } */ } void write(const std::pair &y) override { int32_t idx_y = y.first; std::string coord_y = y.second; // The only check required is on Y. 
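        // For reference, a full-width load on the buffer path expands to OpenCL of the form below
        // (sketch only; the actual variable and component names depend on the tile and on the
        // tensor argument being addressed):
        //
        //   dst_0 = vload4(0, (__global float*)(tensor_ptr + (x) * sizeof(float) + (y) * tensor_stride1));
        //
        // and a store swaps the direction:
        //
        //   vstore4(dst_0, 0, (__global float*)(tensor_ptr + (x) * sizeof(float) + (y) * tensor_stride1));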
out_of_bound_initialize_y(coord_y); const std::string dst = _dst->vector(idx_y).str; const std::string address = to_ls_buffer_address(_coord_x, coord_y, _coord_z, _coord_b); const std::string ls_buf = to_ls_buffer(_type, _ls_width_full, dst, address); _writer->write_text(ls_buf); _writer->write_text(";\n"); out_of_bound_finalize_y(dst); // The left over load/store will be written in the finalize stage if (_ls_width_part.size() != 0) { int32_t w = 0; for (auto &p : _ls_width_part) { const std::string dst0 = _dst->vector(w, p, idx_y).str; const std::string coord_x = _coord_x + " + " + std::to_string(w); const std::string address = to_ls_buffer_address(coord_x, coord_y, _coord_z, _coord_b); const std::string ls_buf0 = to_ls_buffer(_type, p, dst0, address); _leftovers_x.push_back(std::make_pair(std::make_pair(dst0, coord_y), ls_buf0)); w += p; } } } void finalize() override { out_of_bound_finalize_z(); out_of_bound_finalize_x(); } private: IVectorTile *_dst{nullptr}; int32_t _ls_width_full{0}; std::vector _ls_width_part{}; std::vector, std::string>> _leftovers_x{}; std::string _coord_x{}; std::string _coord_z{}; std::string _coord_orig_z{}; std::string _coord_b{}; void out_of_bound_initialize_x(std::string &coord) { if (_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) { auto tensor_format = _mapper.tensor_argument()->format(); auto shape = tensor_format.shape; _ls_width_part = decompose_leftover_ls_vector_width(shape[0] % _ls_width_full); if (_ls_width_part.size() != 0) { _writer->write_text("if(" + coord + " > 0)\n"); _writer->compound_statement_begin(); } } }; void out_of_bound_finalize_x() { if (_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) { if (_ls_width_part.size() != 0) { _writer->compound_statement_end(); _writer->write_text("else\n"); _writer->compound_statement_begin(); out_of_bound_initialize_z(_coord_orig_z); for (auto &i : _leftovers_x) { out_of_bound_initialize_y(i.first.second); _writer->write_text(i.second); _writer->write_text(";\n"); out_of_bound_finalize_y(i.first.first); } out_of_bound_finalize_z(); _writer->compound_statement_end(); } } }; void out_of_bound_initialize_y(std::string &coord) { std::string max = ""; const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; switch (address_mode_y) { case TensorSamplerAddressModeY::Skip: case TensorSamplerAddressModeY::ClampToBorder: // NOTE: This line should not be moved outside of the switch statement. // The reason for that is because when we query the component, the component is marked as used // and added to the list of arguments of the kernel. 
Since, not in all cases this component is required, // we should request the component only when used max = _mapper.tensor_component_y(); _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n"); _writer->compound_statement_begin(); break; case TensorSamplerAddressModeY::SkipMinEdgeOnly: case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: _writer->write_text("if(" + coord + " >= 0)\n"); _writer->compound_statement_begin(); break; case TensorSamplerAddressModeY::SkipMaxEdgeOnly: case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: max = _mapper.tensor_component_y(); _writer->write_text("if(" + coord + " < " + max + ")\n"); _writer->compound_statement_begin(); break; case TensorSamplerAddressModeY::ClampToNearest: max = _mapper.tensor_component_y(); coord = "clamp(" + coord + ", 0, " + max + " - 1)"; break; case TensorSamplerAddressModeY::ClampToMaxEdgeOnly: max = _mapper.tensor_component_y(); coord = "min(" + coord + ", " + max + " - 1)"; break; case TensorSamplerAddressModeY::ClampToMinEdgeOnly: coord = "max(" + coord + ", 0)"; break; case TensorSamplerAddressModeY::None: break; default: std::cout << "Unsupported address mode for write_out_of_bound_check_yz" << std::endl; assert(false); } }; void out_of_bound_finalize_y(const std::string &dst) { const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; switch (address_mode_y) { case TensorSamplerAddressModeY::ClampToBorder: case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: case TensorSamplerAddressModeY::Skip: case TensorSamplerAddressModeY::SkipMaxEdgeOnly: case TensorSamplerAddressModeY::SkipMinEdgeOnly: _writer->compound_statement_end(); break; case TensorSamplerAddressModeY::None: break; default: assert(false); } switch (address_mode_y) { case TensorSamplerAddressModeY::ClampToBorder: case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: _writer->write_text("else\n"); _writer->compound_statement_begin(); _writer->write_text(dst); _writer->write_text(" = 0.0f;\n"); _writer->compound_statement_end(); break; case TensorSamplerAddressModeY::None: break; default: assert(false); } }; void out_of_bound_initialize_z(std::string &coord) { std::string max = ""; const auto address_mode_z = _mapper.gpu_sampler().address_mode_z; switch (address_mode_z) { case TensorSamplerAddressModeZ::Skip: max = _mapper.tensor_component_z(); _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n"); _writer->compound_statement_begin(); break; case TensorSamplerAddressModeZ::SkipMinEdgeOnly: _writer->write_text("if(" + coord + " >= 0)\n"); _writer->compound_statement_begin(); break; case TensorSamplerAddressModeZ::SkipMaxEdgeOnly: max = _mapper.tensor_component_z(); _writer->write_text("if(" + coord + " < " + max + ")\n"); _writer->compound_statement_begin(); break; case TensorSamplerAddressModeZ::ClampToNearest: max = _mapper.tensor_component_z(); coord = "clamp(" + coord + ", 0, " + max + " - 1)"; break; case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly: max = _mapper.tensor_component_z(); coord = "min(" + coord + ", " + max + " - 1)"; break; case TensorSamplerAddressModeZ::ClampToMinEdgeOnly: coord = "max(" + coord + ", 0)"; break; case TensorSamplerAddressModeZ::None: break; default: std::cout << "Unsupported address mode for write_out_of_bound_check_yz" << std::endl; assert(false); } }; void out_of_bound_finalize_z() { const auto address_mode_z = 
_mapper.gpu_sampler().address_mode_z; switch (address_mode_z) { case TensorSamplerAddressModeZ::Skip: case TensorSamplerAddressModeZ::SkipMinEdgeOnly: case TensorSamplerAddressModeZ::SkipMaxEdgeOnly: _writer->compound_statement_end(); break; case TensorSamplerAddressModeZ::None: break; default: assert(false); } }; std::vector decompose_leftover_ls_vector_width(int32_t ls_leftover_vector_width) const { std::vector x; switch (ls_leftover_vector_width) { case 0: break; case 1: case 2: case 3: case 4: case 8: case 16: x.push_back(ls_leftover_vector_width); break; case 5: x.push_back(4); x.push_back(1); break; case 6: x.push_back(4); x.push_back(2); break; case 7: x.push_back(4); x.push_back(3); break; case 9: x.push_back(8); x.push_back(1); break; case 10: x.push_back(8); x.push_back(2); break; case 11: x.push_back(8); x.push_back(3); break; case 12: x.push_back(8); x.push_back(4); break; case 13: x.push_back(8); x.push_back(4); x.push_back(1); break; case 14: x.push_back(8); x.push_back(4); x.push_back(2); break; case 15: x.push_back(8); x.push_back(4); x.push_back(3); break; default: assert(false); } return x; } std::string to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string &data, const std::string &address) { switch (type) { case GpuLoadStoreType::Load: if (vector_width != 1) { return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")"; } else { return data + " = *(" + address + ")"; } break; case GpuLoadStoreType::Store: if (vector_width != 1) { return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")"; } else { return "*(" + address + ") = " + data; } break; default: std::cout << "Unsupported GpuLoadStoreType" << std::endl; assert(false); return ""; } } std::string to_ls_buffer_address(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const { auto tensor_storage = static_cast(_mapper.gpu_sampler().storage); assert(tensor_storage == GpuTensorStorage::BufferUint8Ptr); const std::string ptr_buf = _mapper.tensor_argument()->storage(tensor_storage); const std::string dst_type = get_cl_data_type(_dst->format().dt, 1); std::string address; address += "(__global "; address += dst_type; address += "*)("; address += ptr_buf; if (x != "0" && (_mapper.is_one_component_x() != true)) { address += " + ("; address += x + ") * sizeof(" + dst_type + ")"; } if (y != "0") { const std::string stride_y = _mapper.tensor_component_stride_y(); address += " + ("; address += y + ")"; address += " * "; address += stride_y; } if (z != "0" && (_mapper.is_one_component_z() != true)) { const std::string stride_z = _mapper.tensor_component_stride_z(); address += " + ("; address += z + ")"; address += " * "; address += stride_z; } if (b != "0" && (_mapper.is_one_component_batch() != true)) { const std::string stride_b = _mapper.tensor_component_stride_batch(); address += " + ("; address += b + ")"; address += " * "; address += stride_b; } address += ")"; return address; } }; class ClLoadStoreImage2dHelperWriter : public IGpuLoadStoreHelperWriter { public: static bool validate(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type, IVectorTile *dst) { CKW_UNUSED(x); if (dst->format().w != 4) { return false; } if (mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None) { return false; } if (mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None) { return false; } if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == 
GpuLoadStoreType::Load) { return false; } if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly && type == GpuLoadStoreType::Store) { return false; } if ((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16)) { return false; } return true; /* - x: Only GpuSamplerAddressModeX::None is supported and vector length = 4 - z: Only GpuSamplerAddressModeZ::None is supported */ } ClLoadStoreImage2dHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type) : IGpuLoadStoreHelperWriter(x, mapper, type) { } ClLoadStoreImage2dHelperWriter(const ClLoadStoreImage2dHelperWriter &) = default; ClLoadStoreImage2dHelperWriter &operator=(const ClLoadStoreImage2dHelperWriter &) = default; void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override { assert(validate(_writer, _mapper, _type, dst)); _dst = dst; _ls_width_full = dst->format().w; _coord_x = x->scalar(0, 0).str; _coord_z = z->scalar(0, 0).str; _coord_b = b->scalar(0, 0).str; /* if(y) { // full load/store width } else { // no load/store } */ } void write(const std::pair &y) override { int32_t idx_y = y.first; std::string coord_y = y.second; // The only check required is on Y. out_of_bound_initialize_y(coord_y); const std::string dst = _dst->vector(idx_y).str; const std::string sampler = to_ls_image2d_sampler(); const std::string coord = to_ls_image2d_coord(_coord_x, coord_y, _coord_z, _coord_b); const std::string ls_buf = to_ls_image2d(_type, _ls_width_full, dst, sampler, coord); _writer->write_text(ls_buf); _writer->write_text(";\n"); out_of_bound_finalize_y(dst); } void finalize() override { } private: IVectorTile *_dst{nullptr}; int32_t _ls_width_full{0}; std::string _coord_x{}; std::string _coord_z{}; std::string _coord_b{}; void out_of_bound_initialize_y(std::string &coord) { std::string max = ""; const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; switch (address_mode_y) { case TensorSamplerAddressModeY::Skip: max = _mapper.tensor_component_y(); _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n"); _writer->compound_statement_begin(); break; case TensorSamplerAddressModeY::SkipMinEdgeOnly: _writer->write_text("if(" + coord + " >= 0)\n"); _writer->compound_statement_begin(); break; case TensorSamplerAddressModeY::SkipMaxEdgeOnly: max = _mapper.tensor_component_y(); _writer->write_text("if(" + coord + " < " + max + ")\n"); _writer->compound_statement_begin(); break; case TensorSamplerAddressModeY::ClampToBorder: case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly: case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly: case TensorSamplerAddressModeY::ClampToNearest: case TensorSamplerAddressModeY::ClampToMaxEdgeOnly: case TensorSamplerAddressModeY::ClampToMinEdgeOnly: case TensorSamplerAddressModeY::None: break; default: std::cout << "Unsupported address mode for write_out_of_bound_check_y" << std::endl; assert(false); } }; void out_of_bound_finalize_y(const std::string &dst) { CKW_UNUSED(dst); const auto address_mode_y = _mapper.gpu_sampler().address_mode_y; switch (address_mode_y) { case TensorSamplerAddressModeY::Skip: case TensorSamplerAddressModeY::SkipMinEdgeOnly: case TensorSamplerAddressModeY::SkipMaxEdgeOnly: _writer->compound_statement_end(); break; default: assert(false); } }; std::string to_ls_image2d(GpuLoadStoreType type, int32_t vector_width, const std::string &data, const std::string &sampler, const std::string &coord) { CKW_UNUSED(vector_width); auto tensor_storage = 
static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
        const std::string image2d_obj = _mapper.tensor_argument()->storage(tensor_storage);
        const std::string post_fix    = _dst->format().dt == DataType::Fp32 ? "f" : "h";

        switch(type)
        {
            case GpuLoadStoreType::Load:
                return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + coord + ")";
            case GpuLoadStoreType::Store:
                return "write_image" + post_fix + "(" + image2d_obj + ", " + coord + ", " + data + ")";
            default:
                std::cout << "Unsupported GpuLoadStoreType" << std::endl;
                assert(false);
                return "";
        }
    }

    std::string to_ls_image2d_sampler() const
    {
        const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;

        switch(address_mode_y)
        {
            case TensorSamplerAddressModeY::None:
                return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST";
            case TensorSamplerAddressModeY::Skip:
            case TensorSamplerAddressModeY::SkipMinEdgeOnly:
            case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
            case TensorSamplerAddressModeY::ClampToBorder:
            case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
            case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
                return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST";
            case TensorSamplerAddressModeY::ClampToNearest:
            case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
            case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
                return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST";
            default:
                std::cout << "Unsupported address_mode_coord" << std::endl;
                assert(false);
                return "";
        }
    }

    std::string to_ls_image2d_coord(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const
    {
        std::string coord_x = "(" + x + ") >> 2";
        std::string coord_y = "(";

        if(y != "0")
        {
            coord_y += y;
        }
        if(z != "0" && (_mapper.is_one_component_z() != true))
        {
            const std::string dim = _mapper.tensor_component_y();
            coord_y += " + (";
            coord_y += z + ")";
            coord_y += " * ";
            coord_y += dim;
        }
        if(b != "0" && (_mapper.is_one_component_batch() != true))
        {
            const std::string dim0 = _mapper.tensor_component_y();
            const std::string dim1 = _mapper.tensor_component_z();
            coord_y += " + (";
            coord_y += b + ")";
            coord_y += " * ";
            coord_y += dim0;
            coord_y += " * ";
            coord_y += dim1;
        }
        coord_y += ")";

        return "(int2)(" + coord_x + ", " + coord_y + ")";
    }
};

/** IGpuLoadStoreHelperWriter factory class */
class ClLoadStoreHelperWriterFactory final
{
public:
    /** Static method to create the IGpuLoadStoreHelperWriter that matches the tensor storage set in the mapper
     *
     * @return IGpuLoadStoreHelperWriter
     */
    static std::unique_ptr<IGpuLoadStoreHelperWriter> create(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type)
    {
        const auto tensor_storage = mapper.gpu_sampler().storage;
        switch(tensor_storage)
        {
            case GpuSamplerTensorStorage::BufferUint8Ptr:
                return std::make_unique<ClLoadStoreBufferHelperWriter>(x, mapper, type);
            case GpuSamplerTensorStorage::Image2dReadOnly:
            case GpuSamplerTensorStorage::Image2dWriteOnly:
                return std::make_unique<ClLoadStoreImage2dHelperWriter>(x, mapper, type);
            default:
                std::cout << "Unsupported Gpu tensor storage" << std::endl;
                assert(false);
                return nullptr;
        }
    }
};

// This utility method needs to go in utils.h
inline bool is_tile_scalar(const IVectorTile *x)
{
    return x->format().w == 1 && x->format().h == 1;
}

class ClKernelWriter : public IGpuKernelWriter
{
public:
    ClKernelWriter(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
    {
        _data = x;
        _attr = attr;
    }
    ClKernelWriter(const ClKernelWriter &) = default;
    ClKernelWriter &operator=(const ClKernelWriter &) = default;

    // An IdSpace ID is a
term used to describe a fragment that is registered in ICode to ensure // there are no conflicts or ambiguity in the code void set_IdSpace(int32_t id) override { _data->tiles.set_IdSpace(id); _data->arguments.set_IdSpace(id); } void import_tile(const std::string &dst_name, const IVectorTile *src) override { _data->tiles.insert(dst_name, src); } void declare_argument(const std::string &name, const TensorInfo &tensor) override { assert(_data->arguments[name] == nullptr); _data->arguments.insert(name, tensor, _attr->return_tensor_component_by_value); } void declare_tile(const std::string &name, const TileInfo &format) override { assert(_data->tiles[name] == nullptr); _data->tiles.insert(name, format); IVectorTile *x = _data->tiles[name]; for (auto &t : x->underlying_source_variables()) { _data->code += t.type.str + " " + t.str + ";\n"; } } void declare_const_tile(const std::string &name, const std::vector> &in, DataType dt) override { assert(_data->tiles[name] == nullptr); _data->tiles.insert(name, in, dt); // Note: A constant does not need to be declared in the code } void write_text(const std::string &x) override { _data->code += x; } void compound_statement_begin() override { _data->tiles.increment_registry_level(); _data->code += "{\n"; } void compound_statement_end() override { _data->tiles.decrement_registry_level(); _data->code += "}\n"; } void op_get_global_id(const Operand &dst_var, int32_t dim) override { assert(dst_var.type() == OperandType::Tile); assert(_data->tiles.has_tile(dst_var.value())); assert(_data->tiles[dst_var.value()]->format().w == 1 && _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable auto var = _data->tiles[dst_var.value()]; _data->code += var->scalar(0, 0).str; _data->code += " = get_global_id("; _data->code += std::to_string(dim); _data->code += ");\n"; }; void op_get_global_coord(const Operand &o_dst, const Operand &o_step, const TensorOperand &o_tensor, int32_t dim) override { OperandUnpacker operands(_data->tiles, _data->arguments); auto dst = operands.unpack(o_dst); auto step = operands.unpack(o_step); // Validation: Check that x, y and z are scalar TensorOperandUnpacker tensor_operands(_data->arguments); auto tensor = tensor_operands.unpack(o_tensor); auto gpu_sampler = o_tensor.sampler(); GpuTensor3dMapper mapper(tensor, gpu_sampler); switch (dim) { case 0: if (mapper.is_one_component_x()) { _data->code += dst->scalar(0, 0).str; _data->code += " = 0;\n"; } else { if (mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin) { // Validation: Check: fixed tensor shape // TO BE CHANGED _data->code += dst->scalar(0, 0).str; _data->code += " = get_global_id(0) * "; _data->code += step->scalar(0, 0).str; _data->code += ";\n"; } else { _data->code += dst->scalar(0, 0).str; _data->code += " = get_global_id(0) * "; _data->code += step->scalar(0, 0).str; _data->code += ";\n"; } } break; case 1: if (mapper.is_one_component_y()) { _data->code += dst->scalar(0, 0).str; _data->code += " = 0;\n"; } else { if (mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin) { } else { _data->code += dst->scalar(0, 0).str; _data->code += " = get_global_id(1) * "; _data->code += step->scalar(0, 0).str; _data->code += ";\n"; } } break; case 2: if (mapper.is_one_component_z()) { _data->code += dst->scalar(0, 0).str; _data->code += " = 0;\n"; } else { _data->code += dst->scalar(0, 0).str; _data->code += " = get_global_id(2) * "; _data->code += step->scalar(0, 0).str; _data->code += ";\n"; } break; default: 
break; } }; void op_get_global_batch(const Operand &o_dst, const TensorOperand &o_tensor) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *dst = operands.unpack(o_dst); TensorOperandUnpacker tensor_operands(_data->arguments); IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor); auto gpu_sampler = o_tensor.sampler(); GpuTensor3dMapper mapper(tensor, gpu_sampler); if (mapper.is_one_component_batch()) { _data->code += dst->scalar(0, 0).str; _data->code += " = 0;\n"; } else { std::cout << "Unsupported batched computation" << std::endl; assert(false); } }; void op_get_global_size(const Operand &dst_var, int32_t dim) override { assert(dst_var.type() == OperandType::Tile); assert(_data->tiles.has_tile(dst_var.value())); assert(_data->tiles[dst_var.value()]->format().w == 1 && _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable auto var = _data->tiles[dst_var.value()]; _data->code += var->scalar(0, 0).str; _data->code += " = get_global_size("; _data->code += std::to_string(dim); _data->code += ");\n"; } void op_unary_expression(const Operand &dst_name, UnaryOp op, const Operand &src_name) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *src = operands.unpack(src_name); const IVectorTile *dst = operands.unpack(dst_name); const int32_t dst_w = dst->format().w; const int32_t dst_h = dst->format().h; const int32_t src_w = src->format().w; const std::string dt = dst->underlying_source_variables()[0].type.str; const bool broadcast_src_x = dst_w != 1 && src_w == 1; const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : ""; // Broadcasting on Y is automatic for (int32_t y = 0; y < dst_h; ++y) { _data->code += dst->vector(y).str; _data->code += " = "; _data->code += to_string(op); _data->code += src_prefix + src->vector(y).str; _data->code += ";\n"; } } void op_binary_expression(const Operand &dst_name, const Operand &lhs_name, BinaryOp op, const Operand &rhs_name) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *lhs = operands.unpack(lhs_name); const IVectorTile *rhs = operands.unpack(rhs_name); const IVectorTile *dst = operands.unpack(dst_name); const int32_t dst_w = dst->format().w; const int32_t dst_h = dst->format().h; assert(lhs != nullptr); const int32_t lhs_w = lhs->format().w; const int32_t rhs_w = rhs->format().w; if (op == BinaryOp::MatMul_Nt_T) { assert((dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16)); for (int32_t y = 0; y < dst_h; ++y) { for (int32_t x = 0; x < dst_w; ++x) { for (int32_t k = 0; k < lhs_w; ++k) { _data->code += dst->scalar(x, y).str; _data->code += " = fma("; _data->code += lhs->scalar(k, y).str; _data->code += ", "; _data->code += rhs->scalar(k, x).str; _data->code += ", "; _data->code += dst->scalar(x, y).str; _data->code += ");\n"; } } } return; } const bool broadcast_lhs_x = dst_w != 1 && lhs_w == 1; const bool broadcast_rhs_x = dst_w != 1 && rhs_w == 1; const std::string lhs_prefix = broadcast_lhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : ""; const std::string rhs_prefix = broadcast_rhs_x ? 
"(" + dst->underlying_source_variables()[0].type.str + ")" : ""; const std::string op_str = to_string(op); // Broadcasting on Y is automatic for (int32_t y = 0; y < dst_h; ++y) { _data->code += dst->vector(y).str; _data->code += " = "; _data->code += lhs_prefix + lhs->vector(y).str; _data->code += " "; _data->code += op_str; _data->code += " "; _data->code += rhs_prefix + rhs->vector(y).str; _data->code += ";\n"; } }; void op_cast_expression(const Operand &o_dst, const Operand &o_src, ConvertPolicy policy) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *src = operands.unpack(o_src); const IVectorTile *dst = operands.unpack(o_dst); // const int32_t dst_w = dst->format().w; const int32_t dst_h = dst->format().h; const std::string dt = dst->underlying_source_variables()[0].type.str; const bool is_float = (dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16); const std::string sat = ((policy == ConvertPolicy::Saturate && !is_float) ? "_sat" : ""); // Broadcasting on Y is automatic for (int32_t y = 0; y < dst_h; ++y) { _data->code += dst->vector(y).str; _data->code += " = convert_" + dt + sat + "("; _data->code += src->vector(y).str; _data->code += ");\n"; } }; void op_assign(const Operand &dst_name, const Operand &src_name) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *src = operands.unpack(src_name); const IVectorTile *dst = operands.unpack(dst_name); const int32_t dst_w = dst->format().w; const int32_t dst_h = dst->format().h; const int32_t src_w = src->format().w; const std::string dt = dst->underlying_source_variables()[0].type.str; const bool broadcast_src_x = dst_w != 1 && src_w == 1; const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : ""; // Broadcasting on Y is automatic for (int32_t y = 0; y < dst_h; ++y) { _data->code += dst->vector(y).str; _data->code += " = "; _data->code += src_prefix + src->vector(y).str; _data->code += ";\n"; } } void op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *src = operands.unpack(src_name); const IVectorTile *dst = operands.unpack(dst_name); const int32_t dst_h = dst->format().h; const std::string dt = dst->underlying_source_variables()[0].type.str; // Always perform an explicit cast. This automatically covers at least the 2 scenarios: // 1. Widen a scalar into a vector type. This enables scalar-vector broadcasting // 2. Ensure non-ambiguity over function overloads. // E.g. a constant tile may be accidentally initialized with a double literal. 
By casting it to single float, // it avoids ambiguous function calls const std::string src_prefix = "(" + dt + ")"; // Broadcasting on Y is automatic for (int32_t y = 0; y < dst_h; ++y) { _data->code += dst->vector(y).str; _data->code += " = "; switch (func) { case UnaryFunction::Exp: _data->code += "exp("; break; case UnaryFunction::Tanh: _data->code += "tanh("; break; case UnaryFunction::Sqrt: _data->code += "sqrt("; break; case UnaryFunction::Erf: _data->code += "erf("; break; case UnaryFunction::Fabs: _data->code += "fabs("; break; case UnaryFunction::Log: _data->code += "log("; break; case UnaryFunction::SizeOf: _data->code += "sizeof("; break; case UnaryFunction::Round: _data->code += "round("; break; case UnaryFunction::Floor: _data->code += "floor("; break; default: CKW_ASSERT_MSG(false, "Unexpected UnaryFunction used."); } _data->code += src_prefix + src->vector(y).str; _data->code += ");\n"; } } void op_binary_elementwise_function(const Operand &dst_name, BinaryFunction func, const Operand &first_name, const Operand &second_name) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *first = operands.unpack(first_name); const IVectorTile *second = operands.unpack(second_name); const IVectorTile *dst = operands.unpack(dst_name); const int32_t dst_h = dst->format().h; const auto datatype = dst->underlying_source_variables()[0].type; const std::string datatype_str = datatype.str; // Always perform an explicit cast. See similar comments in op_unary_elementwise_function const std::string first_prefix = "(" + datatype_str + ")"; const std::string second_prefix = "(" + datatype_str + ")"; const bool is_float = (datatype.dt == DataType::Fp32 || datatype.dt == DataType::Fp16); // Broadcasting on Y is automatic for (int32_t y = 0; y < dst_h; ++y) { _data->code += dst->vector(y).str; _data->code += " = "; switch (func) { case BinaryFunction::Min: _data->code += is_float ? "fmin(" : "min("; break; case BinaryFunction::Max: _data->code += is_float ? "fmax(" : "max("; break; default: CKW_ASSERT_MSG(false, "Unexpected BinaryFunction used."); } _data->code += first_prefix + first->vector(y).str; _data->code += ", "; _data->code += second_prefix + second->vector(y).str; _data->code += ");\n"; } } void op_ternary_elementwise_function(const Operand &dst_name, TernaryFunction func, const Operand &first_name, const Operand &second_name, const Operand &third_name) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *first = operands.unpack(first_name); const IVectorTile *second = operands.unpack(second_name); const IVectorTile *third = operands.unpack(third_name); const IVectorTile *dst = operands.unpack(dst_name); const int32_t dst_h = dst->format().h; const std::string dt = dst->underlying_source_variables()[0].type.str; // Always perform an explicit cast. 
See similar comments in op_unary_elementwise_function const std::string first_prefix = "(" + dt + ")"; const std::string second_prefix = "(" + dt + ")"; const std::string third_prefix = "(" + dt + ")"; // Broadcasting on Y is automatic for (int32_t y = 0; y < dst_h; ++y) { _data->code += dst->vector(y).str; _data->code += " = "; switch (func) { case TernaryFunction::Select: _data->code += "select("; break; case TernaryFunction::Clamp: _data->code += "clamp("; break; default: CKW_ASSERT_MSG(false, "Unexpected TernaryFunction used."); } _data->code += first_prefix + first->vector(y).str; _data->code += ", "; _data->code += second_prefix + second->vector(y).str; _data->code += ", "; _data->code += third_prefix + third->vector(y).str; _data->code += ");\n"; } } void op_if_header(const Operand &o_lhs, BinaryOp op, const Operand &o_rhs) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *lhs = operands.unpack(o_lhs); const IVectorTile *rhs = operands.unpack(o_rhs); assert(is_tile_scalar(lhs)); assert(is_tile_scalar(rhs)); _data->code += "if("; _data->code += lhs->scalar(0, 0).str; _data->code += " "; _data->code += to_string(op); _data->code += " "; _data->code += rhs->scalar(0, 0).str; _data->code += ")\n"; } void op_else_if_header(const Operand &o_lhs, BinaryOp op, const Operand &o_rhs) override { _data->code += "else "; op_if_header(o_lhs, op, o_rhs); } void op_else_header() override { _data->code += "else\n"; } void op_for_loop_header(const Operand &var_name, BinaryOp cond_op, const Operand &cond_value_name, const Operand &update_var_name, AssignmentOp update_op, const Operand &update_value_name) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *var = operands.unpack(var_name); const IVectorTile *cond_value = operands.unpack(cond_value_name); const IVectorTile *update_var = operands.unpack(update_var_name); const IVectorTile *update_value = operands.unpack(update_value_name); const int32_t dst_w = var->format().w; const int32_t dst_h = var->format().h; // It must be a scalar variable CKW_UNUSED(dst_w, dst_h); assert(dst_w == 1); assert(dst_h == 1); _data->code += "for(; "; _data->code += var->scalar(0, 0).str; _data->code += " "; _data->code += to_string(cond_op); _data->code += " " + cond_value->scalar(0, 0).str + "; "; _data->code += update_var->scalar(0, 0).str; _data->code += " "; _data->code += to_string(update_op); _data->code += " " + update_value->scalar(0, 0).str + ")"; _data->code += "\n"; } void op_load_immediate(const TensorOperand &o_tensor, const Operand &o_dst, const Operand &o_x, const Operand &o_y, const Operand &o_z, const Operand &o_batch_idx, const Operand &dilation_y) override { OperandUnpacker operands(_data->tiles, _data->arguments); // Not const as it requires changes to 'load_writer'. 
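        // Each destination row 'i' is read from the Y coordinate "y + i" (or "y + i * dilation_y"
        // when the dilation operand is not the literal 1); the row index and its coordinate string
        // are then forwarded to the load helper writer created below, e.g. write(std::make_pair(2, "y + 2")).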
IVectorTile *dst = operands.unpack(o_dst); IVectorTile *x = operands.unpack(o_x); IVectorTile *y = operands.unpack(o_y); IVectorTile *z = operands.unpack(o_z); IVectorTile *dil_y = operands.unpack(dilation_y); IVectorTile *b = operands.unpack(o_batch_idx); TensorOperandUnpacker tensor_operands(_data->arguments); IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor); auto gpu_sampler = o_tensor.sampler(); GpuTensor3dMapper mapper(tensor, gpu_sampler); auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load); // Initialize the constant part load_writer->initialize(dst, x, z, b); for (int i = 0; i < dst->format().h; ++i) { std::string coord_y = y->scalar(0, 0).str + " + " + std::to_string(i); if (dil_y->scalar(0, 0).str != "1") { coord_y += " * " + dil_y->scalar(0, 0).str; } load_writer->write(std::make_pair(i, coord_y)); } load_writer->finalize(); } void op_load_indirect(const TensorOperand &o_tensor, const Operand &o_dst, const Operand &o_x, const Operand &o_indirect_h, const Operand &o_z, const Operand &o_batch_idx) override { OperandUnpacker operands(_data->tiles, _data->arguments); // Not const as it requires changes to 'load_writer'. IVectorTile *dst = operands.unpack(o_dst); IVectorTile *x = operands.unpack(o_x); IVectorTile *y_ind = operands.unpack(o_indirect_h); IVectorTile *z = operands.unpack(o_z); IVectorTile *b = operands.unpack(o_batch_idx); TensorOperandUnpacker tensor_operands(_data->arguments); IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor); auto gpu_sampler = o_tensor.sampler(); GpuTensor3dMapper mapper(tensor, gpu_sampler); auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load); // Initialize the constant part load_writer->initialize(dst, x, z, b); for (int i = 0; i < dst->format().h; ++i) { load_writer->write(std::make_pair(i, y_ind->scalar(0, i).str)); } load_writer->finalize(); } void op_store_immediate(const TensorOperand &tensor_name, const Operand &src_name, const Operand &x_name, const Operand &y_name, const Operand &z_name, const Operand &batch_index_name) override { OperandUnpacker operands(_data->tiles, _data->arguments); // Not const as it requires changes to 'load_writer'. 
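        // Stores walk the tile rows in reverse (from tile_h - 1 down to 0); each row is written to
        // the Y coordinate "y + m0", so the emitted OpenCL stores appear from the last row to the first.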
IVectorTile *src = operands.unpack(src_name); IVectorTile *x = operands.unpack(x_name); IVectorTile *y = operands.unpack(y_name); IVectorTile *z = operands.unpack(z_name); IVectorTile *b = operands.unpack(batch_index_name); TensorOperandUnpacker tensor_operands(_data->arguments); IGpuTensorArgument *tensor = tensor_operands.unpack(tensor_name); auto gpu_sampler = tensor_name.sampler(); GpuTensor3dMapper mapper(tensor, gpu_sampler); auto store_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Store); // Initialize the constant part store_writer->initialize(src, x, z, b); int32_t tile_h = src->format().h; for (int m0 = tile_h - 1; m0 >= 0; m0--) { store_writer->write(std::make_pair(m0, y->scalar(0, 0).str + " + " + std::to_string(m0))); } store_writer->finalize(); } void op_return() override { _data->code += "return;\n"; } void util_get_indirect_buffer(const Operand &o_dst, const TensorOperand &o_tensor, const Operand &o_x, const Operand &o_y, const Operand &o_x_off, const Operand &o_y_off) override { OperandUnpacker operands(_data->tiles, _data->arguments); const IVectorTile *dst = operands.unpack(o_dst); const IVectorTile *x = operands.unpack(o_x); const IVectorTile *y = operands.unpack(o_y); const IVectorTile *x_off = operands.unpack(o_x_off); const IVectorTile *y_off = operands.unpack(o_y_off); TensorOperandUnpacker tensor_operands(_data->arguments); IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor); assert(dst->format().w == 1); assert(x->format().w == 1); assert(y->format().w == 1); assert(x_off->format().w == 1); assert(y_off->format().w == 1); assert(dst->format().dt == DataType::Int32); assert(x->format().dt == DataType::Int32); assert(y->format().dt == DataType::Int32); assert(x_off->format().dt == DataType::Int32); assert(y_off->format().dt == DataType::Int32); const std::string width = tensor->component(TensorComponentType::Dim1); const std::string height = tensor->component(TensorComponentType::Dim2); const std::string wxh = tensor->component(TensorComponentType::Dim1xDim2); /* int x_s; int y_s; x_s = (xi_0 + x_k); y_s = (yi_0 + y_k); mi_0 = x_s + y_s * width + b * widthxheight; mi_0 = select(-1, mi_0, x_s >= 0); mi_0 = select(-1, mi_0, y_s >= 0); mi_0 = select(-1, mi_0, x_s < 128); mi_0 = select(-1, mi_0, y_s < 128); */ compound_statement_begin(); declare_tile("_x_s", TileInfo(DataType::Int32)); declare_tile("_y_s", TileInfo(DataType::Int32)); auto x_s = operands.unpack(Operand("_x_s")); auto y_s = operands.unpack(Operand("_y_s")); for (int i = 0; i < dst->format().h; ++i) { // x_s = (xi_0 + x_k); // y_s = (yi_0 + y_k); _data->code += x_s->scalar(0, i).str; _data->code += " = ("; _data->code += x->scalar(0, i).str; _data->code += " + "; _data->code += x_off->scalar(0, i).str; _data->code += ");\n"; _data->code += y_s->scalar(0, i).str; _data->code += " = ("; _data->code += y->scalar(0, i).str; _data->code += " + "; _data->code += y_off->scalar(0, i).str; _data->code += ");\n"; // mi_0 = x_s + y_s * width; _data->code += dst->scalar(0, i).str; _data->code += " = "; _data->code += x_s->scalar(0, i).str; _data->code += " + "; _data->code += y_s->scalar(0, i).str; _data->code += " * " + width + ";\n"; // mi_0 = select(wxh, mi_0, x_s >= 0); _data->code += dst->scalar(0, i).str; _data->code += " = select(-1, "; _data->code += dst->scalar(0, i).str; _data->code += ", "; _data->code += x_s->scalar(0, i).str; _data->code += " >= 0);\n"; // mi_0 = select(wxh, mi_0, x_s < width); _data->code += dst->scalar(0, i).str; _data->code += " = select(-1, 
"; _data->code += dst->scalar(0, i).str; _data->code += ", "; _data->code += x_s->scalar(0, i).str; _data->code += " < "; _data->code += width + ");\n"; // mi_0 = select(wxh, mi_0, y_s >= 0); _data->code += dst->scalar(0, i).str; _data->code += " = select(-1, "; _data->code += dst->scalar(0, i).str; _data->code += ", "; _data->code += y_s->scalar(0, i).str; _data->code += " >= 0);\n"; // mi_0 = select(wxh, mi_0, y_s < height); _data->code += dst->scalar(0, i).str; _data->code += " = select(-1, "; _data->code += dst->scalar(0, i).str; _data->code += ", "; _data->code += y_s->scalar(0, i).str; _data->code += " < "; _data->code += height + ");\n"; } compound_statement_end(); } private: GpuKernelWriterDataHolder *_data{nullptr}; GpuKernelWriterAttribute *_attr{nullptr}; }; /** IGpuKernelWriter factory class */ class GpuKernelWriterFactory final { public: /** Static method to call the IGpuKernelWriter class accordingly with the Gpu programming language * * @param[in] gpu GPU target * * @return IGpuKernelWriter */ static std::unique_ptr create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x) { switch (x->programming_language()) { case GpuTargetLanguage::OpenCL: return std::make_unique(attr, x); default: std::cout << "Unsupported Gpu programming language" << std::endl; assert(false); return nullptr; } } }; inline int32_t adjust_step(TensorSamplerFormat tensor_format, int32_t step, const TensorInfo *tensor_info_id, int32_t idx) { auto tensor = tensor_info_id->shape; int32_t dim[3] = {0}; switch (tensor_format) { case TensorSamplerFormat::C_W_H: dim[0] = tensor[0]; dim[1] = tensor[1]; dim[2] = tensor[2]; break; case TensorSamplerFormat::C_WH_1: dim[0] = tensor[0]; dim[1] = tensor[1] * tensor[2]; dim[2] = 1; break; default: std::cout << "Unsupported tensor format" << std::endl; assert(false); break; } return std::min(step, dim[idx]); } } // namespace prototype } // namespace ckw #endif // CKW_PROTOTYPE_SRC_PROTOTYPE_H