aboutsummaryrefslogtreecommitdiff
path: root/compute_kernel_writer/prototype/src/Prototype.h
diff options
context:
space:
mode:
authorGunes Bayir <gunes.bayir@arm.com>2024-02-07 15:34:45 +0000
committerGunes Bayir <gunes.bayir@arm.com>2024-02-09 15:59:45 +0000
commit0ee13afc4429411de9a05ba4c2ff8a580784b568 (patch)
treec9ee1acf684d52b92ffb7500b0b65eee8377ce45 /compute_kernel_writer/prototype/src/Prototype.h
parenta3e1b50588b89a2c0c67da2679728a422fc16402 (diff)
downloadComputeLibrary-0ee13afc4429411de9a05ba4c2ff8a580784b568.tar.gz
Remove CKW prototype and Template Writer
Gpu code in dynamic fusion is now written by stable CKW. We do not need CKW protoype and the older writer implementation, i.e. TemplateWriter. It also removes the need for the flag -DACL_INTERNAL_TEST_CKW_IN_DF to compile and test dynamic fusion operator. Resolves: COMPMID-6715 Signed-off-by: Gunes Bayir <gunes.bayir@arm.com> Change-Id: I9f9453311e79d9be612bd4754240d832f98503e8 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11116 Benchmark: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Jakub Sujak <jakub.sujak@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'compute_kernel_writer/prototype/src/Prototype.h')
-rw-r--r--compute_kernel_writer/prototype/src/Prototype.h4189
1 files changed, 0 insertions, 4189 deletions
diff --git a/compute_kernel_writer/prototype/src/Prototype.h b/compute_kernel_writer/prototype/src/Prototype.h
deleted file mode 100644
index b392fe2651..0000000000
--- a/compute_kernel_writer/prototype/src/Prototype.h
+++ /dev/null
@@ -1,4189 +0,0 @@
-/*
- * Copyright (c) 2023 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef CKW_PROTOTYPE_SRC_PROTOTYPE_H
-#define CKW_PROTOTYPE_SRC_PROTOTYPE_H
-
-#include "ckw/Error.h"
-#include "ckw/TensorInfo.h"
-#include "ckw/types/ConvertPolicy.h"
-#include "ckw/types/DataType.h"
-#include "ckw/types/Functions.h"
-#include "ckw/types/GpuTargetLanguage.h"
-#include "ckw/types/Operators.h"
-#include "ckw/types/TensorSamplerTypes.h"
-
-#include <algorithm>
-#include <array>
-#include <cassert> // assert (to be removed)
-#include <chrono>
-#include <cmath>
-#include <cstdint> // int32_t
-#include <functional>
-#include <iostream> // cout (to be removed)
-#include <map>
-#include <memory>
-#include <stdexcept>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-namespace ckw
-{
-namespace prototype
-{
-
-// Dummy data structure for Size2D
-using Size2D = std::vector<int32_t>;
-
-// Dummy Status
-using Status = void;
-
-enum class ComponentType : int32_t
-{
- Complex = 0,
- Simple = 1,
- Unfusable = 2
-};
-
-enum class GpuCompilationSpeed
-{
- Fast = 0x00, // fast compilation may increase the latency of the network
- Slow = 0x01 // slow compilation may decrease the latency of the network
-};
-
-enum class GpuExtensions
-{
- Fp16,
- Dot8,
- Mmul,
- FastMath
-};
-
-struct TensorInfo
-{
- TensorShape shape{{0}};
- DataType data_type{DataType::Unknown};
- TensorDataLayout data_layout{TensorDataLayout::Nhwc};
- int32_t id{-1};
-};
-
-struct ComponentAttribute
-{
- GpuCompilationSpeed compilation_speed{GpuCompilationSpeed::Fast};
- bool overwrite_tile{true};
-};
-
-inline std::string data_type_to_cl_type(DataType dt)
-{
- switch (dt)
- {
- case DataType::Fp32:
- return "float";
- case DataType::Fp16:
- return "half";
- case DataType::Int8:
- return "char";
- case DataType::Uint8:
- return "uchar";
- case DataType::Uint16:
- return "ushort";
- case DataType::Int16:
- return "short";
- case DataType::Uint32:
- return "uint";
- case DataType::Int32:
- return "int";
- case DataType::Bool:
- return "bool";
- default:
- assert(false);
- return "";
- }
-}
-
-inline int32_t width_to_cl_vector_size(int32_t width)
-{
- switch (width)
- {
- case 1:
- return 1;
- case 2:
- return 2;
- case 3:
- return 3;
- case 4:
- return 4;
- case 5:
- case 6:
- case 7:
- case 8:
- return 8;
- case 9:
- case 10:
- case 11:
- case 12:
- case 13:
- case 14:
- case 15:
- case 16:
- return 16;
- default:
- assert(false);
- return 0;
- }
-}
-
-inline std::string get_cl_data_type(DataType dt, int32_t width)
-{
- std::string data_type;
- int32_t w = width_to_cl_vector_size(width);
- data_type += data_type_to_cl_type(dt);
- if (w != 1)
- {
- data_type += std::to_string(w);
- }
- return data_type;
-}
-
-inline std::string to_opencl_store(int32_t vector_length)
-{
- if (vector_length != 1)
- {
- return "vstore" + std::to_string(vector_length) + "(";
- }
- else
- {
- return "*(";
- }
-}
-
-struct TileInfo
-{
- TileInfo()
- {
- }
-
- TileInfo(DataType dt) : dt(dt), w(1), h(1)
- {
- }
-
- TileInfo(DataType dt, int32_t width) : dt(dt), w(width), h(1)
- {
- }
-
- TileInfo(DataType dt, int32_t width, int32_t height) : dt(dt), w(width), h(height)
- {
- }
-
- DataType dt{DataType::Unknown}; // Data type of the tile
- int32_t w{0}; // Width (i.e. c0 - portion of the channels)
- int32_t h{0}; // Height (i.e. s0 - portion of the spatial dimensions)
-};
-
-inline std::ostream &operator<<(std::ostream &o, const TileInfo &a)
-{
- o << a.w << " x " << a.h;
- return o;
-}
-
-struct DataTypeAsString
-{
- std::string str{""};
- DataType dt{DataType::Unknown};
- int32_t size{1};
-};
-
-struct ValueAsString
-{
- std::string str{""};
- DataTypeAsString type{};
-};
-
-// https://stackoverflow.com/questions/51515378/storing-and-accessing-tile-properties-in-c
-// A Tile is a collection of variables used to express a 2D data.
-class IScalarTile
-{
-public:
- virtual ~IScalarTile() = default;
-
- /** Method to get the scalar variable from a tile
- * @param[in] x X coordinate on the width of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
- * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
- *
- * @return the scalar variable as a string
- */
- virtual ValueAsString scalar(int32_t x, int32_t y) const = 0;
-
- /** Method to get the list of underlying variable names used by the tile
- *
- * @return the list of variable names
- */
- virtual std::vector<ValueAsString> underlying_source_variables() const = 0;
-
- /** Method to get the name of the tile.
- *
- * @return the name of the tile
- */
- std::string name() const
- {
- return _basename;
- }
-
- /** Method to get the tile format
- *
- * @return the format
- */
- TileInfo format() const
- {
- return _format;
- }
-
- /** Method to know whether the tile is assignable or not (constant)
- *
- * @return true if the tile is assignable
- */
- virtual bool is_assignable() const = 0;
-
- /** Method to know whether the tile needs to be declared
- *
- * @return true if the tile needs to be declared in the code before being used
- */
- virtual bool need_declaration() const = 0;
-
-protected:
- TileInfo _format{}; // Tile format
- std::string _basename{""}; // Tile name
-};
-
-// A tile is a collection of variables used to express a 2D data. The variables are vectors in the GPU context.
-// The vector size is given by the width of the tile. The number of vectors height by depth defines the number of vectors
-class IVectorTile : public IScalarTile
-{
-public:
- virtual ~IVectorTile() = default;
-
- /** Method to get the vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
- * The user can query the list of supported width for the vectors through preferred_vector_sizes().
- *
- * @param[in] y Y coordinate on the height of the tile. If out-of-bound, the coordinate is clamped to the nearest valid edge
- *
- * @return the vector variable as a string
- */
- virtual ValueAsString vector(int32_t y) const = 0;
-
- /** Method to get a vector variable from a tile. A vector is an ordered homogeneous collection of two or more scalars.
- *
- * @return the vector variable as a string
- */
- virtual ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const = 0;
- /** Method to get the preferred vector sizes.
- *
- * @return a vector with the preferred vector sizes
- */
- //virtual std::vector<int32_t> preferred_vector_sizes() const = 0;
-};
-
-class ClTile : public IVectorTile
-{
-public:
- ClTile(const std::string &name, TileInfo format)
- {
- _format = format;
- _basename = name;
- }
-
- ValueAsString scalar(int32_t x, int32_t y) const override
- {
- x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
- y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
- ValueAsString t;
- t.str = build_variable_name(y);
- t.type.str = get_cl_data_type(_format.dt, 1);
- t.type.dt = _format.dt;
- t.type.size = 1;
-
- // Check required because if the width has only one element, we cannot use .s0
- if (_format.w != 1)
- {
- // Automatic broadcasting
- t.str += ".s" + std::to_string(x);
- }
-
- return t;
- }
-
- ValueAsString vector(int32_t y) const override
- {
- y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
- ValueAsString t;
- t.str = build_variable_name(y);
- t.type.str = get_cl_data_type(_format.dt, _format.w);
- t.type.dt = _format.dt;
- t.type.size = _format.w;
- return t;
- }
-
- ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
- {
- y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
- ValueAsString t;
- t.str = build_variable_name(y);
- t.type.str = get_cl_data_type(_format.dt, width);
- t.type.dt = _format.dt;
- t.type.size = width;
-
- if (_format.w != 1)
- {
- t.str += ".s";
- for (int i = 0; i < width; ++i)
- {
- t.str += to_scalar_hex(x_start + i);
- }
- }
- return t;
- }
-
- std::vector<ValueAsString> underlying_source_variables() const override
- {
- std::vector<ValueAsString> vars;
- for (int32_t y = 0; y < _format.h; ++y)
- {
- ValueAsString t;
- t.str = build_variable_name(y);
- t.type.str = get_cl_data_type(_format.dt, _format.w);
- t.type.dt = _format.dt;
- t.type.size = _format.w;
- vars.push_back(t);
- }
- return vars;
- }
-
- bool is_assignable() const override
- {
- return true;
- }
-
- bool need_declaration() const override
- {
- return true;
- }
-
-private:
- std::string build_variable_name(int32_t y) const
- {
- std::string var_name = _basename;
-
- if (_format.h == 1)
- {
- return var_name;
- }
- else
- {
- var_name += "_";
- var_name += std::to_string(y);
- }
-
- return var_name;
- }
-
- std::string to_scalar_hex(int32_t x) const
- {
- switch (x)
- {
- case 0:
- case 1:
- case 2:
- case 3:
- case 4:
- case 5:
- case 6:
- case 7:
- case 8:
- case 9:
- return std::to_string(x);
- case 10:
- return "A";
- case 11:
- return "B";
- case 12:
- return "C";
- case 13:
- return "D";
- case 14:
- return "E";
- case 15:
- return "F";
- default:
- std::cout << "Unsupported hexadecimal value" << std::endl;
- assert(false);
- return "";
- }
- }
-};
-
-// Unique features: It contains values in the form of string. The name used for this object is misleading since the variables can change the value over time.
-class ClConstantTile : public IVectorTile
-{
-public:
- ClConstantTile(const std::vector<std::vector<std::string>> &in, DataType dt)
- {
- _format.w = in[0].size();
- _format.h = in.size();
- _format.dt = dt;
-
- _data = std::vector<std::vector<std::string>>(_format.h, std::vector<std::string>(_format.w));
-
- for (int32_t y = 0; y < _format.h; ++y)
- {
- for (int32_t x = 0; x < _format.w; ++x)
- {
- _data[y][x] = in[y][x];
- }
- }
- }
-
- ValueAsString scalar(int32_t x, int32_t y) const override
- {
- x = std::max(std::min(x, _format.w - 1), static_cast<int32_t>(0));
- y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
- ValueAsString t;
- t.str = _data[y][x];
- t.type.str = get_cl_data_type(_format.dt, 1);
- t.type.dt = _format.dt;
- t.type.size = 1;
-
- return t;
- }
-
- ValueAsString vector(int32_t y) const override
- {
- y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
- return vector(0, _format.w, y);
- }
-
- ValueAsString vector(int32_t x_start, int32_t width, int32_t y) const override
- {
- y = std::max(std::min(y, _format.h - 1), static_cast<int32_t>(0));
-
- ValueAsString t;
- t.str = "";
- t.type.str = get_cl_data_type(_format.dt, width);
- t.type.dt = _format.dt;
- t.type.size = width;
-
- if (width > 1)
- {
- t.str += "((" + get_cl_data_type(_format.dt, width) + ")(";
- }
-
- int32_t x = x_start;
- for (; x < width - 1; ++x)
- {
- t.str += scalar(x, y).str;
- t.str += ", ";
- }
- t.str += scalar(x, y).str;
-
- if (width > 1)
- {
- t.str += "))";
- }
-
- return t;
- }
-
- std::vector<ValueAsString> underlying_source_variables() const override
- {
- std::vector<ValueAsString> vars;
-
- for (int32_t y = 0; y < _format.h; ++y)
- {
- for (int32_t x = 0; x < _format.w; ++x)
- {
- ValueAsString t;
- t.str = _data[y][x];
- t.type.str = get_cl_data_type(_format.dt, 1);
- t.type.dt = _format.dt;
- t.type.size = 1;
- vars.push_back(t);
- }
- }
-
- return vars;
- }
-
- bool is_assignable() const override
- {
- return false;
- }
-
- bool need_declaration() const override
- {
- return false;
- }
-
-private:
- std::vector<std::vector<std::string>> _data{};
-};
-
-enum class TensorComponentIndex : int32_t
-{
- IndexMask = 0x0000000f,
-};
-
-enum class TensorComponentGroup : int32_t
-{
- OffsetFirstElement = 0x00000100,
- Stride = 0x00001000,
- Dimension = 0x00010000,
- FoldedDimension = 0x00100000,
- Constant = 0x01000000
-};
-
-inline std::string to_string(TensorComponentType x)
-{
- switch (x)
- {
- case TensorComponentType::Unknown:
- return "Unknown";
- case TensorComponentType::OffsetFirstElement:
- return "OffsetFirstElement";
- case TensorComponentType::Stride1:
- return "Stride1";
- case TensorComponentType::Stride2:
- return "Stride2";
- case TensorComponentType::Stride3:
- return "Stride3";
- case TensorComponentType::Stride4:
- return "Stride4";
- case TensorComponentType::Dim0:
- return "Dim0";
- case TensorComponentType::Dim1:
- return "Dim1";
- case TensorComponentType::Dim2:
- return "Dim2";
- case TensorComponentType::Dim3:
- return "Dim3";
- case TensorComponentType::Dim4:
- return "Dim4";
- case TensorComponentType::Dim1xDim2:
- return "Dim1xDim2";
- case TensorComponentType::Dim1xDim2xDim3:
- return "Dim1xDim2xDim3";
- default:
- assert(false);
- return "";
- }
-}
-
-class ITensorArgument
-{
-public:
- virtual ~ITensorArgument() = default;
-
- /** Method to get the tensor component as a string
- *
- * @param[in] x tensor component to query
- *
- * @return the tensor component as a string
- */
- virtual std::string component(TensorComponentType x) = 0;
-
- /** Method to get the tensor component type declaration as a string
- *
- * @return the tensor component type declaration as a string
- */
- virtual std::string component_type_declaration() const = 0;
-
- /** Method to get the tensor component data type
- *
- * @return the tensor component data type
- */
- virtual DataType component_data_type() const = 0;
-
- /** Method to get the tensor component declarations
- *
- * @return a vector containing the tensor component declarations
- */
- virtual std::vector<TensorComponentType> component_declarations() const = 0;
-
- /** Method to get the name of the tensor argument.
- *
- * @return the name of the tensor argument
- */
- std::string name() const
- {
- return _basename;
- }
-
- /** Method to get the tensor format
- *
- * @return the format
- */
- TensorInfo format() const
- {
- return _format;
- }
-
-protected:
- TensorInfo _format{};
- std::string _basename{};
-};
-
-enum class GpuTensorStorage : int32_t
-{
- Unknown = 0x0000,
- BufferUint8Ptr = 0x0012,
- Image2dReadOnly = 0x0020,
- Image2dWriteOnly = 0x0021,
- Image3dReadOnly = 0x0030,
- Image3dWriteOnly = 0x0031
-};
-
-inline GpuTensorStorage to_gpu_tensor_storage(TensorStorageType s)
-{
- switch (s)
- {
- case TensorStorageType::Unknown:
- return GpuTensorStorage::Unknown;
-
- case TensorStorageType::BufferUint8Ptr:
- return GpuTensorStorage::BufferUint8Ptr;
-
- case TensorStorageType::Texture2dReadOnly:
- return GpuTensorStorage::Image2dReadOnly;
-
- case TensorStorageType::Texture2dWriteOnly:
- return GpuTensorStorage::Image2dWriteOnly;
-
- default:
- assert(false);
- return GpuTensorStorage::Unknown;
- }
-}
-
-inline TensorStorageType to_tensor_storage(GpuTensorStorage s)
-{
- switch (s)
- {
- case GpuTensorStorage::Unknown:
- return TensorStorageType::Unknown;
-
- case GpuTensorStorage::BufferUint8Ptr:
- return TensorStorageType::BufferUint8Ptr;
-
- case GpuTensorStorage::Image2dReadOnly:
- return TensorStorageType::Texture2dReadOnly;
-
- case GpuTensorStorage::Image2dWriteOnly:
- return TensorStorageType::Texture2dWriteOnly;
-
- default:
- assert(false);
- return TensorStorageType::Unknown;
- }
-}
-
-class IGpuTensorArgument : public ITensorArgument
-{
-public:
- virtual ~IGpuTensorArgument() = default;
-
- /** Method to get the tensor storage, which is the underlying storage used to keep the data memory
- *
- * @param[in] x tensor storage to query
- *
- * @return the tensor storage as a string
- */
- virtual std::string storage(GpuTensorStorage x) = 0;
-
- /** Method to get the tensor storage type declaration as a string
- *
- * @param[in] x tensor component to query
- *
- * @return the tensor storage type declaration as a string
- */
- virtual std::string storage_type_declaration(GpuTensorStorage x) const = 0;
-
- /** Method to get the tensor storage declarations
- *
- * @return a vector containing the tensor storage declarations
- */
- virtual std::vector<GpuTensorStorage> storage_declarations() const = 0;
-};
-
-class ClTensorArgument : public IGpuTensorArgument
-{
-public:
- ClTensorArgument(const std::string &name, const TensorInfo &x, bool return_by_value_when_possible)
- {
- _basename = name;
- _format = x;
- _return_by_value_when_possible = return_by_value_when_possible;
- }
-
- // Methods to override
- std::string component(TensorComponentType x) override
- {
- if ((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::Constant)))
- {
- int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
- return std::to_string(idx - 1);
- }
-
- if (_return_by_value_when_possible)
- {
- if ((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::Dimension)))
- {
- int32_t idx = static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentIndex::IndexMask);
- return std::to_string(_format.shape[idx]);
- }
-
- if ((static_cast<int32_t>(x) & static_cast<int32_t>(TensorComponentGroup::FoldedDimension)))
- {
- switch (x)
- {
- case TensorComponentType::Dim1xDim2:
- return std::to_string(_format.shape[1] * _format.shape[2]);
- case TensorComponentType::Dim1xDim2xDim3:
- return std::to_string(_format.shape[1] * _format.shape[2] * _format.shape[2]);
- default:
- std::cout << "Unsupported folded dimension" << std::endl;
- assert(false);
- }
- }
- }
-
- if (std::find(_components_required.begin(), _components_required.end(), x) == _components_required.end())
- {
- _components_required.push_back(x);
- }
-
- return build_component_name(x);
- }
-
- std::string component_type_declaration() const override
- {
- return "int";
- };
-
- DataType component_data_type() const override
- {
- return DataType::Int32;
- }
-
- std::string storage(GpuTensorStorage x) override
- {
- if (std::find(_storage_required.begin(), _storage_required.end(), x) == _storage_required.end())
- {
- _storage_required.push_back(x);
- }
-
- return build_storage_name(x);
- }
-
- std::string storage_type_declaration(GpuTensorStorage x) const override
- {
- switch (x)
- {
- case GpuTensorStorage::BufferUint8Ptr:
- return "__global uchar*";
- case GpuTensorStorage::Image2dReadOnly:
- return "__read_only image2d_t";
- case GpuTensorStorage::Image2dWriteOnly:
- return "__write_only image2d_t";
- case GpuTensorStorage::Image3dReadOnly:
- return "__read_only image3d_t ";
- case GpuTensorStorage::Image3dWriteOnly:
- return "__write_only image3d_t ";
- default:
- std::cout << "Unsupported storage" << std::endl;
- assert(false);
- return "";
- }
- };
-
- std::vector<GpuTensorStorage> storage_declarations() const override
- {
- return _storage_required;
- }
-
- std::vector<TensorComponentType> component_declarations() const override
- {
- return _components_required;
- }
-
-private:
- std::string build_storage_name(GpuTensorStorage x) const
- {
- std::string var_name = _basename;
-
- switch (x)
- {
- case GpuTensorStorage::BufferUint8Ptr:
- return var_name + "_ptr";
- case GpuTensorStorage::Image2dReadOnly:
- case GpuTensorStorage::Image2dWriteOnly:
- return var_name + "_img2d";
- case GpuTensorStorage::Image3dReadOnly:
- case GpuTensorStorage::Image3dWriteOnly:
- return var_name + "_img3d";
- default:
- std::cout << "Unsupported storage" << std::endl;
- assert(false);
- }
-
- return var_name;
- }
-
- std::string build_component_name(TensorComponentType x) const
- {
- std::string var_name = _basename;
-
- switch (x)
- {
- case TensorComponentType::OffsetFirstElement:
- return var_name + "_offset_first_element";
- case TensorComponentType::Stride1:
- return var_name + "_stride1";
- case TensorComponentType::Stride2:
- return var_name + "_stride2";
- case TensorComponentType::Stride3:
- return var_name + "_stride3";
- case TensorComponentType::Dim0:
- return var_name + "_dim0";
- case TensorComponentType::Dim1:
- return var_name + "_dim1";
- case TensorComponentType::Dim2:
- return var_name + "_dim2";
- case TensorComponentType::Dim3:
- return var_name + "_dim3";
- case TensorComponentType::Dim1xDim2:
- return var_name + "_dim1xdim2";
- case TensorComponentType::Dim1xDim2xDim3:
- return var_name + "_dim1xdim2xdim3";
- default:
- std::cout << "Unsupported component" << std::endl;
- assert(false);
- }
-
- return var_name;
- }
-
- bool _return_by_value_when_possible{false};
- std::vector<GpuTensorStorage> _storage_required{};
- std::vector<TensorComponentType> _components_required{};
-};
-
-/**
- * @brief Data structure that contains the declared tiles by the components.
- * The registry is a linear data structure that follows the similar principle of the stack. The user can use the @p increment_registry_level() method to
- * increase the level of the stack (0 when it starts). When the user uses the @p decrement_registry_level() method, the registry decreases the level of the stack
- * and remove (pop) all the tiles from the level above.
- * When a tile is declared on the level 0, it is a global tile. A global tile is visible in all parts of the code.
- * Since different components may use the same name to define a tile, the registry adopts the IdSpace concept, an @p id to prevent name collisions
- * when declaring tiles among different components.
- *
- */
-class GpuTileRegistry
-{
-public:
- enum class RegistryTileType
- {
- Tile,
- Link
- };
-
- using RegistryIdSpace = int32_t;
- using RegistryLevel = int32_t;
- using RegistryTileName = std::string;
-
- struct RegistryTileTableEntry
- {
- RegistryLevel registry_level{0};
- std::unique_ptr<IVectorTile> tile_object{nullptr};
- };
-
- struct RegistryTileTypeTableEntry
- {
- RegistryTileType tile_type{RegistryTileType::Tile};
- RegistryTileName tile_name{};
- RegistryIdSpace registry_idspace{0};
- RegistryLevel registry_level{0};
- };
-
- using RegistryTileTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTableEntry>>;
- using RegistryTileTypeTable = std::map<RegistryIdSpace, std::map<RegistryTileName, RegistryTileTypeTableEntry>>;
-
- /**
- * @brief Construct a new Gpu Tile Registry object
- *
- */
- GpuTileRegistry()
- {
- _language = GpuTargetLanguage::Unknown;
- }
-
- /**
- * @brief Construct a new Gpu Tile Registry object providing the Gpu programming language
- *
- * @param[in] language Gpu programming language to use
- */
- GpuTileRegistry(GpuTargetLanguage language)
- {
- _language = language;
- }
-
- /**
- * @brief Default destructor. Destroy the Gpu Tile Registry object
- *
- */
- ~GpuTileRegistry() = default;
-
- /**
- * @brief Set the working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles.
- * Therefore, the IdSpace should be set before declaring any tiles.
- *
- * @param[in] id The IdSpace id
- */
- void set_IdSpace(int32_t id)
- {
- _IdSpace = id;
- }
-
- /**
- * @brief Get the current working IdSpace for the tile registry. IdSpace is used to prevent name collisions when declaring tiles
- *
- * @return The IdSpace id
- */
- int32_t IdSpace() const
- {
- return _IdSpace;
- }
-
- /**
- * @brief Gets all the IdSpace declarations defined in the tile registry.
- *
- * @return all the IdSpace declarations defined in the tile registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
- */
- std::vector<int32_t> IdSpace_declarations() const
- {
- std::vector<int32_t> x;
-
- auto it = _frags.begin();
-
- while (it != _frags.end())
- {
- x.push_back(it->first);
-
- it++;
- }
-
- return x;
- }
-
- /**
- * @brief Declare a tile from a previously created tile
- */
- void insert(const std::string &name, const IVectorTile *frag)
- {
- assert(_language == GpuTargetLanguage::OpenCL);
- const int32_t key_IdSpace = _IdSpace;
- const std::string key_var_name = name;
- const std::string var_name = frag->name();
- TileInfo format = frag->format();
-
- // First check whether a tile with the same name exists
- IVectorTile *result = (*this)[key_var_name];
- assert(result == nullptr);
- if (result == nullptr)
- {
- std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
-
- _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
- _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
-
- _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Link;
- _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
- _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
- _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
- }
- }
-
- /**
- * @brief Declare a tile with TileInfo. The tile will be stored in the IdSpace set with @p set_IdSpace()
- *
- * @note The reference name used for declaring the tile should not be previously used in the IdSpace
- *
- * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
- * @param[in] format Tile format use to use
- */
- void insert(const std::string &name, const TileInfo &format)
- {
- assert(_language == GpuTargetLanguage::OpenCL);
- const int32_t key_IdSpace = _IdSpace;
- const std::string key_var_name = name;
- const std::string var_name = generate_tile_name(name);
-
- // First check whether a tile with the same name exists
- IVectorTile *result = (*this)[key_var_name];
- assert(result == nullptr);
- if (result == nullptr)
- {
- std::unique_ptr<ClTile> tile = std::make_unique<ClTile>(var_name, format);
- _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
- _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
-
- _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
- _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
- _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
- _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
- }
- }
-
- /**
- * @brief Declare a constant tile. The content of the tile is passed as a vector of std::string
- *
- * @note The reference name used for declaring the tile should not be previously used in the IdSpace
- *
- * @param[in] name Reference name for the tile. The reference name can be used to retrieve the tile stored in the registry.
- * @param[in] in A 3D std::vector of std::string. From the 3D std::vector we can know the dimensions for the tile
- * @param[in] dt The data type for the elements stored in the 3D std::vector as std::string. It is user's responsibilty to ensure
- * that the data type is aligned with the content of the std::string.
- */
- void insert(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt)
- {
- assert(_language == GpuTargetLanguage::OpenCL);
- const int32_t key_IdSpace = _IdSpace;
- const std::string key_var_name = name;
-
- // First check whether a tile with the same name exists
- IVectorTile *result = (*this)[key_var_name];
- assert(result == nullptr);
- if (result == nullptr)
- {
- std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
- _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
- _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
-
- _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
- _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
- _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
- _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
- }
- }
-
- /**
- * @brief Declare an anonymous constant tile. The content of the tile is passed as a vector of std::string
- *
- * @note This method can be used to declare temporary tiles that need to be accessed only once.
- *
- * @param[in] in A 3D std::vector of std::string. From the 3D std::vector we can know the dimensions for the tile
- * @param[in] dt The data type for the elements stored in the 3D std::vector as std::string. It is user responsibilty to ensure
- * that the data type is aligned with what passed with the std::string.
- *
- * @return IVectorTile* the anonymous constant tile
- */
- IVectorTile *insert(const std::vector<std::vector<std::string>> &in, DataType dt)
- {
- assert(_language == GpuTargetLanguage::OpenCL);
- const int32_t key_IdSpace = _IdSpace;
- const std::string key_var_name = "_" + std::to_string(_anonymous_frag_count++);
-
- // First check whether a tile with the same name exists
- IVectorTile *result = (*this)[key_var_name];
- assert(result == nullptr);
- if (result == nullptr)
- {
- std::unique_ptr<ClConstantTile> tile = std::make_unique<ClConstantTile>(in, dt);
- _frags[key_IdSpace][key_var_name].tile_object = std::move(tile);
- _frags[key_IdSpace][key_var_name].registry_level = _registry_level;
-
- _frag_types[key_IdSpace][key_var_name].tile_type = RegistryTileType::Tile;
- _frag_types[key_IdSpace][key_var_name].tile_name = key_var_name;
- _frag_types[key_IdSpace][key_var_name].registry_idspace = _IdSpace;
- _frag_types[key_IdSpace][key_var_name].registry_level = _registry_level;
- }
-
- return (*this)[key_var_name];
- }
-
- /**
- * @brief Get the tile from the registry. This method searches the tile in the IdSpace provided by the user
- *
- * @param[in] name The name of the tile to retrieve
- * @param[in] IdSpace The IdSpace id where to search the tile
- *
- * @return IVectorTile* The tile
- */
- IVectorTile *get(const std::string &name, int32_t IdSpace)
- {
- const int32_t key_IdSpace = IdSpace;
- const std::string key_var_name = name;
-
- IVectorTile *result = nullptr;
- auto search_IdSpace = _frags.find(key_IdSpace);
- if (search_IdSpace != _frags.end())
- {
- auto search_tile = _frags[key_IdSpace].find(key_var_name);
- if (search_tile != _frags[key_IdSpace].end())
- {
- result = search_tile->second.tile_object.get();
- assert(result != nullptr);
- }
- }
-
- return result;
- }
-
- /**
- * @brief Get the tile from the registry. This method searches the tile in the IdSpace set with @p set_IdSpace()
- *
- * @param[in] name The name of the tile to retrieve
- *
- * @return IVectorTile* The tile
- */
- IVectorTile *operator[](const std::string &name)
- {
- return get(name, _IdSpace);
- }
-
- /**
- * @brief Check whether the tile in the in the IdSpace provided by the user exists
- *
- * @param[in] name Name of the tile to search for
- * @param[in] IdSpace The IdSpace id where to search the tile
- *
- * @return true if the tile exists
- * @return false if the tile does not exist
- */
- bool has_tile(const std::string &name, int32_t IdSpace) const
- {
- const int32_t key_IdSpace = IdSpace;
- const std::string key_var_name = name;
-
- // IVectorTile* result = nullptr;
- auto search_IdSpace = _frags.find(key_IdSpace);
-
- return search_IdSpace != _frags.end();
- }
-
- /**
- * @brief Check whether the tile within the current IdSpace exists
- *
- * @param[in] name Name of the tile to search for
- *
- * @return true if the tile exists
- * @return false if the tile does not exist
- */
- bool has_tile(const std::string &name) const
- {
- return has_tile(name, _IdSpace);
- }
-
- /**
- * @brief Get all the tiles declared within the IdSpace provided by the user
- *
- * @param[in] IdSpace IdSpace where to retrieve all the declared tiles
- *
- * @return std::vector<IVectorTile*> A vector with all the declared tiles in the IdSpace provided by the user
- */
- std::vector<IVectorTile *> tile_declarations(int32_t IdSpace)
- {
- std::vector<IVectorTile *> tiles;
-
- std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it = _frag_types[IdSpace].begin();
-
- while (it != _frag_types[IdSpace].end())
- {
- // The following line should be enabled. However, we cannot at this stage
- // because it used to retrieve the output tile produced by each component.
- // However, this method should NOT be used to retrieve the output tile
- //if(it->second.tile_type == RegistryTileType::Tile)
- {
- tiles.push_back(get(it->second.tile_name, it->second.registry_idspace));
- }
- it++;
- }
-
- return tiles;
- }
-
- /**
- * @brief Increase the level of stack.
- *
- */
- void increment_registry_level()
- {
- _registry_level++;
- }
-
- /**
- * @brief Remove all the tiles declared at the current stack level and decrease the level of the stack.
- *
- */
- void decrement_registry_level()
- {
- assert(_registry_level >= 0);
-
- // Remove all variables in the local scope
- std::map<RegistryTileName, RegistryTileTableEntry>::iterator it = _frags[_IdSpace].begin();
-
- while (it != _frags[_IdSpace].end())
- {
- if (it->second.registry_level == _registry_level)
- {
- it = _frags[_IdSpace].erase(it);
- }
- else
- {
- it++;
- }
- }
-
- std::map<RegistryTileName, RegistryTileTypeTableEntry>::iterator it_type = _frag_types[_IdSpace].begin();
-
- while (it_type != _frag_types[_IdSpace].end())
- {
- if (it_type->second.registry_level == _registry_level)
- {
- it_type = _frag_types[_IdSpace].erase(it_type);
- }
- else
- {
- it_type++;
- }
- }
-
- _registry_level--;
- }
-
- /**
- * @brief Get the level of the stack
- *
- */
- int32_t level() const
- {
- return _registry_level;
- }
-
-private:
- // This method ensures that the key is unique among different components
- std::string generate_tile_name(const std::string &name)
- {
- assert(_IdSpace >= 0);
- if (_registry_level == 0)
- {
- return "_G" + std::to_string(_IdSpace) + "_" + name;
- }
- else
- {
- return name;
- }
- }
-
- RegistryTileTable _frags{};
- RegistryTileTypeTable _frag_types{};
- RegistryLevel _registry_level{0};
- RegistryIdSpace _IdSpace{-1};
- int32_t _anonymous_frag_count{0}; // Counter used to create the anonymous tiles
- GpuTargetLanguage _language{GpuTargetLanguage::Unknown}; // Gpu programming language
-};
-
-using TensorEntry = std::unique_ptr<IGpuTensorArgument>;
-
-/**
- * @brief Data structure that contains the tensors consumed by the components.
- * Since different components may use the same name as reference for a tensor, the registry adopts the IdSpace concept, an @p id to prevent name collisions
- * when declaring tensors among different components.
- *
- */
-class GpuTensorArgumentRegistry
-{
-public:
- /**
- * @brief Construct a new Gpu Tensor Registry object
- *
- */
- GpuTensorArgumentRegistry()
- {
- _language = GpuTargetLanguage::Unknown;
- }
-
- /**
- * @brief Construct a new Gpu Tensor Registry object
- *
- * @param[in] language Gpu programming language to use
- */
- GpuTensorArgumentRegistry(GpuTargetLanguage language)
- {
- _language = language;
- }
-
- /**
- * @brief Default destructor. Destroy the Gpu Tensor Registry object
- *
- */
- ~GpuTensorArgumentRegistry() = default;
-
- /**
- * @brief Set the working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors.
- * Therefore, the IdSpace should be set before declaring any tensors.
- *
- * @param[in] id The IdSpace id
- */
- void set_IdSpace(int32_t id)
- {
- _IdSpace = id;
- }
-
- /**
- * @brief Get the current working IdSpace for the tensor registry. IdSpace is used to prevent name collisions when declaring tensors
- *
- * @return The IdSpace id
- */
- int32_t IdSpace() const
- {
- return _IdSpace;
- }
-
- /**
- * @brief Gets all the IdSpace declarations defined in the tensor registry.
- *
- * @return all the IdSpace declarations defined in the tensor registry as std::vector<int32_t>. It returns an empty vector if there are no IdSpace declarations.
- */
- std::vector<int32_t> IdSpace_declarations() const
- {
- std::vector<int32_t> x;
-
- auto it = _refs.begin();
-
- while (it != _refs.end())
- {
- x.push_back(it->first);
-
- it++;
- }
-
- return x;
- }
-
- /**
- * @brief Declare a tensor with TensorInfo. The tensor will be stored in the IdSpace set with @p set_IdSpace()
- *
- * @note The reference name used for declaring the tensor should not be previously used in the IdSpace
- *
- * @param[in] name Reference name for the tensor. The reference name can be used to retrieve the tensor stored in the registry.
- * @param[in] x Pair of tensor info and tensor id
- * @param[in] return_by_value_when_possible True if we want the value stored in the tensor components
- */
- void insert(const std::string &name, const TensorInfo &x, bool return_by_value_when_possible)
- {
- assert(_language == GpuTargetLanguage::OpenCL);
- const int32_t key_IdSpace = _IdSpace;
- const int32_t tensor_id = x.id;
- const std::string key_var_name = name;
- const std::string var_name = generate_tensor_name(name, tensor_id);
-
- // First, check whether the tensor has already a reference. If so, trigger an assert
- assert(!has_tensor_argument(name));
-
- // Check whether a tensor with that tensorID exists
- auto result = _tensor_arguments.find(tensor_id);
- if (result == _tensor_arguments.end())
- {
- // It means that we haven't added a tensor with that tensor_id yet. Create a IGpuTensorArgument before creating the reference
- std::unique_ptr<ClTensorArgument> arg =
- std::make_unique<ClTensorArgument>(var_name, x, return_by_value_when_possible);
- _tensor_arguments[tensor_id] = std::move(arg);
- }
-
- _refs[key_IdSpace][key_var_name] = tensor_id;
- }
-
- /**
- * @brief Get the tensor from the registry. This method searches the tensor in the IdSpace set with @p set_IdSpace()
- *
- * @param[in] name The name of the tensor to retrieve
- *
- * @return IGpuTensor* The tensor
- */
- IGpuTensorArgument *operator[](const std::string &name)
- {
- const int32_t key_IdSpace = _IdSpace;
- const std::string key_var_name = name;
-
- IGpuTensorArgument *result = nullptr;
- auto search_IdSpace = _refs.find(key_IdSpace);
- if (search_IdSpace != _refs.end())
- {
- auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
-
- if (search_tensor_id != _refs[key_IdSpace].end())
- {
- const int32_t tensor_id = search_tensor_id->second;
- auto search_tensor_argument = _tensor_arguments.find(tensor_id);
- if (search_tensor_argument != _tensor_arguments.end())
- {
- result = search_tensor_argument->second.get();
- }
- assert(result != nullptr);
- }
- }
-
- return result;
- }
-
- /**
- * @brief Get all the tensors declared in the IdSpace provided by the user
- *
- * @return std::vector<IGpuTensorArgument*> A vector with all the declared tensors
- */
- std::vector<IGpuTensorArgument *> tensor_argument_declarations()
- {
- std::vector<IGpuTensorArgument *> args;
-
- auto it = _tensor_arguments.begin();
-
- while (it != _tensor_arguments.end())
- {
- args.push_back(it->second.get());
- it++;
- }
-
- return args;
- }
-
- /**
- * @brief Check whether the tensor argument in the IdSpace set with @p set_IdSpace() exists
- *
- * @param[in] name Name of the tensor argument to search for
- *
- * @return true if the tensor argument exists
- * @return false if the tensor argument does not exist
- */
- bool has_tensor_argument(const std::string &name)
- {
- const int32_t key_IdSpace = _IdSpace;
- const std::string key_var_name = name;
-
- auto search_IdSpace = _refs.find(key_IdSpace);
-
- if (search_IdSpace != _refs.end())
- {
- auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
-
- return search_tensor_id != _refs[key_IdSpace].end();
- }
- else
- {
- return false;
- }
- }
-
- /**
- * @brief Check whether the tensor argument is in the the IdSpace provided by the user
- *
- * @param[in] name Name of the tensor argument to search for
- * @param[in] IdSpace The IdSpace id where to search the tensor argument
- *
- * @return true if the tile exists
- * @return false if the tile does not exist
- */
- bool has_tensor_argument(const std::string &name, int32_t IdSpace)
- {
- const int32_t key_IdSpace = IdSpace;
- const std::string key_var_name = name;
-
- auto search_IdSpace = _refs.find(key_IdSpace);
-
- if (search_IdSpace != _refs.end())
- {
- auto search_tensor_id = _refs[key_IdSpace].find(key_var_name);
-
- return search_tensor_id != _refs[key_IdSpace].end();
- }
- else
- {
- return false;
- }
- }
-
-private:
- // This method ensures that the key is unique among different components
- std::string generate_tensor_name(const std::string &name, int32_t tensor_id)
- {
- assert(tensor_id >= 0);
-
- return name + std::to_string(tensor_id);
- }
-
- std::map<int32_t, TensorEntry> _tensor_arguments{};
- std::map<int32_t, std::map<std::string, int32_t>> _refs{};
- int32_t _IdSpace{-1};
- GpuTargetLanguage _language{GpuTargetLanguage::Unknown}; // Gpu programming language
-};
-
-enum class OpType : int32_t
-{
- Elementwise = 0x0000,
- Relational = 0x1000,
- Algebra = 0x2000
-};
-
-inline std::string to_string(AssignmentOp op)
-{
- switch (op)
- {
- case AssignmentOp::Decrement:
- return "-=";
- case AssignmentOp::Increment:
- return "+=";
- default:
- assert(false);
- return "";
- }
-}
-
-inline std::string to_string(UnaryOp op)
-{
- switch (op)
- {
- case UnaryOp::LogicalNot:
- return "!";
- case UnaryOp::BitwiseNot:
- return "~";
- case UnaryOp::Negate:
- return "-";
- default:
- assert(false);
- return "";
- }
-}
-
-inline std::string to_string(BinaryOp op)
-{
- switch (op)
- {
- case BinaryOp::Add:
- return "+";
- case BinaryOp::Sub:
- return "-";
- case BinaryOp::Mul:
- return "*";
- case BinaryOp::Div:
- return "/";
- case BinaryOp::Mod:
- return "%";
- case BinaryOp::Equal:
- return "==";
- case BinaryOp::Less:
- return "<";
- case BinaryOp::LessEqual:
- return "<=";
- case BinaryOp::Greater:
- return ">";
- case BinaryOp::GreaterEqual:
- return ">=";
- case BinaryOp::LogicalAnd:
- return "&&";
- case BinaryOp::LogicalOr:
- return "||";
- case BinaryOp::BitwiseXOR:
- return "^";
- default:
- assert(false);
- return "";
- }
-}
-
-inline std::string binary_op_string(BinaryOp op)
-{
- switch (op)
- {
- case BinaryOp::Add:
- return "add";
- case BinaryOp::Sub:
- return "sub";
- case BinaryOp::Mul:
- return "mul";
- case BinaryOp::Div:
- return "div";
- case BinaryOp::Mod:
- return "mod";
- case BinaryOp::Equal:
- return "eq";
- case BinaryOp::Less:
- return "gt";
- case BinaryOp::LessEqual:
- return "gteq";
- case BinaryOp::Greater:
- return "lt";
- case BinaryOp::GreaterEqual:
- return "lte";
- default:
- assert(false);
- return "";
- }
-}
-
-enum class OperandType : int32_t
-{
- Unknown = 0x00000000,
- ScalarFp32 = 0x00001011, // Immediate scalar tile
- ScalarFp16 = 0x00001012, // Immediate scalar tile
- ScalarInt32 = 0x00001021, // Immediate scalar tile
- ScalarInt16 = 0x00001022, // Immediate scalar tile
- ScalarInt8 = 0x00001024, // Immediate scalar tile
- ScalarUInt32 = 0x00001031, // Immediate scalar tile
- ScalarUInt16 = 0x00001032, // Immediate scalar tile
- ScalarUInt8 = 0x00001034, // Immediate scalar tile
- ScalarBool = 0x00001041, // Immediate scalar tile
- ScalarTile = 0x00001050, // Scalar from a tile
- Tile = 0x00010000, // Tile
- TensorStride1 = 0x00100001, // Tensor component
- TensorStride2 = 0x00100002, // Tensor component
- TensorStride3 = 0x00100003, // Tensor component
- TensorStride4 = 0x00100004, // Tensor component
- TensorDim0 = 0x00100010, // Tensor component
- TensorDim1 = 0x00100020, // Tensor component
- TensorDim2 = 0x00100030, // Tensor component
- TensorDim3 = 0x00100040, // Tensor component
- TensorDim4 = 0x00100050, // Tensor component
- TensorC = 0x00100010, // Tensor component
- TensorW = 0x00100020, // Tensor component
- TensorH = 0x00100030, // Tensor component
- TensorD = 0x00100040, // Tensor component
- TensorN = 0x00100050, // Tensor component
- TensorDim1xDim2 = 0x00100100, // Tensor component
- TensorDim1xDim2xDim3 = 0x00100200, // Tensor component
- TensorWxH = 0x00100300, // Tensor component
- TensorWxHxD = 0x00100400, // Tensor component
- TensorDataOffset = 0x00100500, // Tensor component
-};
-
-struct ScalarTileCoord
-{
- ScalarTileCoord()
- {
- }
-
- ScalarTileCoord(int32_t x0, int32_t y0) : x(x0), y(y0)
- {
- }
-
- int32_t x{-1};
- int32_t y{-1};
-};
-
-/**
- * @brief Operand class. This object is used to pass the operands to the operations performed by the writer.
- * Operand can be of three types:
- * -# Scalar immediate: constant expression
- * -# Tile: A tile
- * -# Tensor component: A component (scalar) of a tensor
- *
- */
-class Operand
-{
-public:
- Operand(const std::string &val)
- {
- _str = val;
- _type = OperandType::Tile;
- }
-
- Operand(const std::string &val, const ScalarTileCoord &coord)
- {
- _str = val;
- _type = OperandType::ScalarTile;
- _coord = coord;
- }
-
- Operand(const std::string &val, OperandType type)
- {
- _str = val;
- _type = type;
- }
-
- Operand(const Operand &t)
- {
- _str = t.value();
- _type = t.type();
- }
-
- Operand &operator=(const Operand &t)
- {
- _str = t.value();
- _type = t.type();
- _coord = t.scalar_tile_coordinate();
- return *this;
- }
-
- std::string value() const
- {
- return _str;
- }
-
- OperandType type() const
- {
- return _type;
- }
-
- ScalarTileCoord scalar_tile_coordinate() const
- {
- return _coord;
- }
-
-private:
- std::string _str{};
- OperandType _type{OperandType::Unknown};
- ScalarTileCoord _coord{};
-};
-
-using GpuSamplerTensorStorage = GpuTensorStorage;
-
-struct GpuSampler
-{
- GpuSampler() = default;
-
- TensorSamplerFormat format{TensorSamplerFormat::Unknown};
- GpuSamplerTensorStorage storage{GpuSamplerTensorStorage::Unknown};
- TensorSamplerAddressModeX address_mode_x{TensorSamplerAddressModeX::Unknown};
- TensorSamplerAddressModeY address_mode_y{TensorSamplerAddressModeY::Unknown};
- TensorSamplerAddressModeZ address_mode_z{TensorSamplerAddressModeZ::Unknown};
-};
-
-inline GpuSampler create_simple_sampler(
- const TensorInfo *tensor_info_id, GpuSampler sampler, int32_t step_x, int32_t step_y, int32_t step_z)
-{
- CKW_UNUSED(step_x, step_y, step_z);
-
- auto tensor = tensor_info_id->shape;
-
- GpuSampler dst_sampler;
- dst_sampler.format = sampler.format;
- dst_sampler.storage = GpuSamplerTensorStorage::BufferUint8Ptr;
- dst_sampler.address_mode_x = sampler.address_mode_x;
- dst_sampler.address_mode_y = sampler.address_mode_y;
- dst_sampler.address_mode_z = sampler.address_mode_z;
-
- int32_t dim_x = 0;
- int32_t dim_y = 0;
- int32_t dim_z = 0;
-
- switch (sampler.format)
- {
- case TensorSamplerFormat::C_W_H:
- dim_x = tensor[0];
- dim_y = tensor[1];
- dim_z = tensor[2];
- break;
- case TensorSamplerFormat::C_WH_1:
- dim_x = tensor[0];
- dim_y = tensor[1] * tensor[2];
- dim_z = 1;
- break;
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- break;
- }
-
- if (dim_x == 1)
- {
- assert(step_x == 1);
- dst_sampler.address_mode_x = TensorSamplerAddressModeX::None;
- }
-
- if (dim_y == 1)
- {
- assert(step_y == 1);
- dst_sampler.address_mode_y = TensorSamplerAddressModeY::None;
- }
-
- if (dim_z == 1)
- {
- assert(step_z == 1);
- dst_sampler.address_mode_z = TensorSamplerAddressModeZ::None;
- }
-
- return dst_sampler;
-}
-
-class GpuOutputSampler
-{
-public:
- GpuOutputSampler() = default;
-
- /**
- * @brief Method used to initialize the GpuOutputSampler. The GpuOutputSampler can be initialized only once
- * by the root component. Once initialized, all simpler components will need to used this sampler
- * or a broadcasted version of it
- *
- * @param[in] sampler GpuSampler
- * @param[in] step_x Increment step in the X direction. Not necessarily it is the same of n0 of tile!
- * @param[in] step_y Increment step in the Y direction. Not necessarily it is the same of m0 of tile!
- * @param[in] step_z Increment step in the Z direction. Not necessarily it is the same of d0 of tile!
- */
- void initialize(const TensorInfo *tensor_info_id,
- GpuSamplerTensorStorage tensor_storage,
- TensorSamplerFormat tensor_format,
- int32_t step_x,
- int32_t step_y,
- int32_t step_z)
- {
- assert(_is_initialized == false);
-
- _step_x = step_x;
- _step_y = step_y;
- _step_z = step_z;
- _tensor_info_id = tensor_info_id;
- _sampler = create_sampler(tensor_storage, tensor_format);
- _is_initialized = true;
- };
-
- GpuSampler sampler() const
- {
- return _sampler;
- };
-
- int32_t step_x() const
- {
- return _step_x;
- };
-
- int32_t step_y() const
- {
- return _step_y;
- };
-
- int32_t step_z() const
- {
- return _step_z;
- };
-
-private:
- GpuSampler create_sampler(GpuSamplerTensorStorage tensor_storage, TensorSamplerFormat tensor_format)
- {
- // Output can only be in output mode
- assert(tensor_storage != GpuSamplerTensorStorage::Image2dReadOnly);
- assert(tensor_storage != GpuSamplerTensorStorage::Image3dReadOnly);
-
- auto tensor = _tensor_info_id->shape;
-
- GpuSampler sampler;
- sampler.format = tensor_format;
- sampler.storage = tensor_storage;
- sampler.address_mode_x = TensorSamplerAddressModeX::None;
- sampler.address_mode_y = TensorSamplerAddressModeY::None;
- sampler.address_mode_z = TensorSamplerAddressModeZ::None;
-
- // In the case of texture, we do not need any special checks at the border
- if (tensor_storage == GpuSamplerTensorStorage::BufferUint8Ptr)
- {
- int32_t dim_x = 0;
- int32_t dim_y = 0;
- int32_t dim_z = 0;
-
- switch (tensor_format)
- {
- case TensorSamplerFormat::C_W_H:
- dim_x = tensor[0];
- dim_y = tensor[1];
- dim_z = tensor[2];
- break;
- case TensorSamplerFormat::C_WH_1:
- dim_x = tensor[0];
- dim_y = tensor[1] * tensor[2];
- dim_z = 1;
- break;
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- break;
- }
-
- if ((dim_x % _step_x) != 0 && dim_x != 1)
- {
- sampler.address_mode_x = TensorSamplerAddressModeX::OverlappingMin;
- }
-
- if ((dim_y % _step_y) != 0 && dim_y != 1)
- {
- sampler.address_mode_y = TensorSamplerAddressModeY::ClampToMaxEdgeOnly;
- }
-
- if ((dim_z % _step_z) != 0 && dim_z != 1)
- {
- sampler.address_mode_z = TensorSamplerAddressModeZ::ClampToMaxEdgeOnly;
- }
- }
-
- return sampler;
- }
-
- GpuSampler _sampler{}; // GpuSampler
- int32_t _step_x{1};
- int32_t _step_y{1};
- int32_t _step_z{1};
- const TensorInfo *_tensor_info_id{nullptr};
- bool _is_initialized{false};
-};
-
-/**
- * @brief Tensor operand class. This object is used to pass the operands as tensor to the operations performed by the writer.
- */
-class TensorOperand
-{
-public:
- TensorOperand(const std::string &val, GpuSampler sampler) : _str(val), _sampler(sampler)
- {
- }
-
- TensorOperand &operator=(const TensorOperand &t)
- {
- _str = t.value();
- _sampler = t.sampler();
- return *this;
- }
-
- std::string value() const
- {
- return _str;
- }
-
- GpuSampler sampler() const
- {
- return _sampler;
- }
-
-private:
- std::string _str{};
- GpuSampler _sampler{};
-};
-
-/**
- * @brief Data structure that contains all the necessary information to write the Gpu kernel with the Gpu kernel Writer
- * This data structure must be initialized before being passed to the Gpu Kernel Writer
- *
- */
-class GpuKernelWriterDataHolder
-{
-public:
- /**
- * @brief Construct a new Gpu Kernel Data object. In this phase, we should also store
- * the GPU target and target specific capabilities (extensions). For now, we just initialize the
- * programming language
- *
- * @param[in] language Gpu programming language to use
- */
- GpuKernelWriterDataHolder(GpuTargetLanguage language)
- : tiles(language), arguments(language), code(""), _language(language)
- {
- }
-
- /**
- * @brief Get the Gpu programming language used
- *
- * @return GpuTargetLanguage the Gpu programming language
- */
- GpuTargetLanguage programming_language() const
- {
- return _language;
- }
-
- /**
- * @brief @ref GpuTileRegistry
- *
- */
- GpuTileRegistry tiles{};
- /**
- * @brief @ref GpuTensorArgumentRegistry
- *
- */
- GpuTensorArgumentRegistry arguments{};
- /**
- * @brief @ref GpuOutputSampler.
- *
- */
- GpuOutputSampler output_sampler{};
- /**
- * @brief Source code
- *
- */
- std::string code{};
-
- // GpuExtensionRegistry extensions{};
-private:
- GpuTargetLanguage _language;
-};
-
-struct LWS
-{
- int32_t x{1};
- int32_t y{1};
- int32_t z{1};
-};
-
-/**
- * @brief Utility class used to get the tile from the operand. If the operand is not a tile, @ref OperandUnpacker
- * declare an anonymous tile in the tile registry.
- */
-class OperandUnpacker
-{
-public:
- OperandUnpacker(GpuTileRegistry &tiles, GpuTensorArgumentRegistry &arguments) : _tiles(tiles), _arguments(arguments)
- {
- // Increase the level of the stack to allocate possible temporary tiles
- _tiles.increment_registry_level();
- };
-
- ~OperandUnpacker()
- {
- // Decrease the level of the stack to deallocate any temporary tiles
- _tiles.decrement_registry_level();
- }
-
- IVectorTile *unpack(const Operand &src)
- {
- // Get the tile
- if (src.type() == OperandType::Tile)
- {
- assert(_tiles.has_tile(src.value()));
- return _tiles[src.value()];
- }
- // Create an anonymous tile with a constant
- else if (static_cast<int32_t>(src.type()) & 0x00001000)
- {
- if (src.type() == OperandType::ScalarTile)
- {
- ScalarTileCoord coord = src.scalar_tile_coordinate();
- assert(_tiles.has_tile(src.value()));
- assert(coord.x >= 0);
- assert(coord.y >= 0);
- auto val = _tiles[src.value()]->scalar(coord.x, coord.y);
- return _tiles.insert({{{val.str}}}, val.type.dt);
- }
- else
- {
- return _tiles.insert({{{src.value()}}}, to_tile_data_type(src.type()));
- }
- }
- // Create an anonymous tile with the tensor component
- else
- {
- assert(_arguments.has_tensor_argument(src.value()));
- auto x = _arguments[src.value()];
- const std::string val = x->component(to_tensor_component(src.type()));
- const DataType dt = x->component_data_type();
- return _tiles.insert({{{val}}}, dt);
- }
- }
-
-private:
- DataType to_tile_data_type(OperandType x)
- {
- return static_cast<DataType>(static_cast<int32_t>(x) & 0x00ff);
- }
-
- TensorComponentType to_tensor_component(OperandType x)
- {
- switch (x)
- {
- case OperandType::TensorDim0:
- return TensorComponentType::Dim0;
- case OperandType::TensorDim1:
- return TensorComponentType::Dim1;
- case OperandType::TensorDim2:
- return TensorComponentType::Dim2;
- case OperandType::TensorDim3:
- return TensorComponentType::Dim3;
- case OperandType::TensorDim4:
- return TensorComponentType::Dim4;
- case OperandType::TensorStride1:
- return TensorComponentType::Stride1;
- case OperandType::TensorStride2:
- return TensorComponentType::Stride2;
- case OperandType::TensorStride3:
- return TensorComponentType::Stride3;
- case OperandType::TensorStride4:
- return TensorComponentType::Stride4;
- case OperandType::TensorDim1xDim2:
- return TensorComponentType::Dim1xDim2;
- case OperandType::TensorDim1xDim2xDim3:
- return TensorComponentType::Dim1xDim2xDim3;
- case OperandType::TensorDataOffset:
- return TensorComponentType::OffsetFirstElement;
- default:
- assert(false);
- return TensorComponentType::Unknown;
- }
- }
-
- GpuTileRegistry &_tiles;
- GpuTensorArgumentRegistry &_arguments;
-};
-
-/**
- * @brief Utility class used to get the tensor argument from the operand. If the operand is not a tile, @ref OperandUnpacker
- * declare an anonymous tile in the tile registry.
- * Tensor dimension reduction aims for reducing the tensor data dimension while keeping data's tensor structure.
- */
-class TensorOperandUnpacker
-{
-public:
- TensorOperandUnpacker(GpuTensorArgumentRegistry &arguments) : _arguments(arguments){};
-
- IGpuTensorArgument *unpack(const TensorOperand &src)
- {
- assert(_arguments.has_tensor_argument(src.value()));
- return _arguments[src.value()];
- }
-
-private:
- GpuTensorArgumentRegistry &_arguments;
-};
-
-/**
- * @brief The GpuKernel will be used in three occasions (stages):
- * #- Compilation stage
- * #- Tuning stage
- * #- Dispatch stage
- */
-struct GpuKernel
-{
- // Compilation stage
- std::string code{}; // Source code, required for the compilation stage
- std::vector<GpuExtensions> list_extensions{}; // Extensions, required for the compilation stage
- // Tuning stage
- std::string config_id{}; // Unique id, required for the tuning stage
- std::vector<LWS> list_lws{}; // LWS to test, required for the tuning stage
- // Dispatch stage
- GpuOutputSampler output_sampler{}; // GpuOutputSampler, required for the dispatch stage
- std::vector<std::pair<int32_t, GpuTensorStorage>>
- list_tensor_storages; // List of tensor storages, required for the dispatch stage
- std::vector<std::pair<int32_t, TensorComponentType>>
- list_tensor_components; // List of tensor components (width, stride,..), required for the dispatch stage)
-};
-
-// Generate all extension pragmas (hardcoded for now)
-inline std::string generate_extensions()
-{
- std::string ext = R"(
-#if defined(cl_khr_fp16)
-#pragma OPENCL EXTENSION cl_khr_fp16 : enable
-#endif // defined(cl_khr_fp16)
-
-#if defined(cl_arm_integer_dot_product_int8)
-#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
-#endif // defined(cl_arm_integer_dot_product_int8)
-
-#if defined(cl_arm_integer_dot_product_accumulate_int8)
-#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
-#endif // defined(cl_arm_integer_dot_product_accumulate_int8)
-
-#if defined(cl_arm_printf)
-#pragma OPENCL EXTENSION cl_arm_printf : enable
-#endif // defined(cl_arm_printf);
-)";
- return ext;
-}
-
-// This function should produce an object with the source
-inline std::string generate_code(GpuKernelWriterDataHolder &in, const std::string &name)
-{
- std::string code;
- code += generate_extensions();
- code += "__kernel void ";
- code += name;
- code += "(\n";
-
- auto IdSpaces = in.arguments.IdSpace_declarations();
-
- std::vector<std::string> arg_str;
-
- auto tensor_args = in.arguments.tensor_argument_declarations();
-
- for (auto &i : tensor_args)
- {
- // For each tensor used, get the storage and tensor components
- auto storages = i->storage_declarations();
- auto components = i->component_declarations();
-
- for (auto &y : storages)
- {
- std::string str;
- str += i->storage_type_declaration(y);
- str += " ";
- str += i->storage(y);
- arg_str.push_back(str);
- }
-
- for (auto &y : components)
- {
- std::string str;
- str += i->component_type_declaration();
- str += " ";
- str += i->component(y);
- arg_str.push_back(str);
- }
- }
-
- for (size_t i = 0; i < arg_str.size(); ++i)
- {
- code += arg_str[i];
- if (i + 1 < arg_str.size())
- {
- code += ",\n";
- }
- }
-
- code += ")\n";
- code += "{\n";
- code += in.code;
- code += "}\n";
-
- return code;
-}
-
-/**
- * @brief This class is responsible to map a N-Tensor to a 3d tensor. The mapper needs the GpuSampler to know
- * how to reduce the dimensionality of a tensor
- *
- */
-class GpuTensor3dMapper
-{
-public:
- GpuTensor3dMapper(IGpuTensorArgument *tensor, GpuSampler sampler) : _sampler(sampler), _tensor(tensor){};
-
- std::string tensor_component_x() const
- {
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- case TensorSamplerFormat::C_W_H:
- return _tensor->component(TensorComponentType::Dim0);
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string tensor_component_y() const
- {
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- return _tensor->component(TensorComponentType::Dim1xDim2);
- case TensorSamplerFormat::C_W_H:
- return _tensor->component(TensorComponentType::Dim1);
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string tensor_component_z() const
- {
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- return "1";
- case TensorSamplerFormat::C_W_H:
- return _tensor->component(TensorComponentType::Dim2);
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string tensor_component_stride_y() const
- {
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- case TensorSamplerFormat::C_W_H:
- return _tensor->component(TensorComponentType::Stride1);
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string tensor_component_stride_z() const
- {
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- return "0";
- case TensorSamplerFormat::C_W_H:
- return _tensor->component(TensorComponentType::Stride2);
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string tensor_component_stride_batch() const
- {
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- case TensorSamplerFormat::C_W_H:
- return _tensor->component(TensorComponentType::Stride3);
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- bool is_one_component_x() const
- {
- auto t = _tensor->format();
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- case TensorSamplerFormat::C_W_H:
- return t.shape[0] == 1;
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- bool is_one_component_y() const
- {
- auto t = _tensor->format();
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- return (t.shape[1] * t.shape[2]) == 1;
- case TensorSamplerFormat::C_W_H:
- return t.shape[1] == 1;
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- bool is_one_component_z() const
- {
- auto t = _tensor->format();
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- return true;
- case TensorSamplerFormat::C_W_H:
- return t.shape[2] == 1;
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- bool is_one_component_batch() const
- {
- auto t = _tensor->format();
- const auto format = _sampler.format;
- switch (format)
- {
- case TensorSamplerFormat::C_WH_1:
- case TensorSamplerFormat::C_W_H:
- return t.shape[3] == 1;
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- return "";
- }
- }
-
- GpuSampler gpu_sampler() const
- {
- return _sampler;
- }
-
- IGpuTensorArgument *tensor_argument() const
- {
- return _tensor;
- }
-
-private:
- GpuSampler _sampler;
- IGpuTensorArgument *_tensor;
-};
-
-struct GpuKernelWriterAttribute
-{
- bool return_tensor_component_by_value{false};
-};
-
-enum class RoundingMode
-{
- None,
- Rte,
- Rtz,
- Rtp,
- Rtn
-};
-
-// https://llvm.org/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.html
-class IGpuKernelWriter
-{
-public:
- virtual ~IGpuKernelWriter() = default;
-
- virtual void set_IdSpace(int32_t id) = 0;
-
- virtual void import_tile(const std::string &dst, const IVectorTile *src) = 0;
-
- virtual void declare_argument(const std::string &name, const TensorInfo &tensor) = 0;
-
- virtual void declare_tile(const std::string &name, const TileInfo &info) = 0;
-
- virtual void
- declare_const_tile(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt) = 0;
-
- virtual void write_text(const std::string &x) = 0;
-
- virtual void compound_statement_begin() = 0;
-
- virtual void compound_statement_end() = 0;
-
- // Operations
- virtual void op_get_global_id(const Operand &dst_var, int32_t dim) = 0;
-
- virtual void
- op_get_global_coord(const Operand &dst, const Operand &step, const TensorOperand &tensor, int32_t dim) = 0;
-
- virtual void op_get_global_batch(const Operand &dst, const TensorOperand &tensor) = 0;
-
- virtual void op_get_global_size(const Operand &dst_var, int32_t dim) = 0;
-
- virtual void op_unary_expression(const Operand &dst, UnaryOp op, const Operand &src) = 0;
-
- virtual void op_binary_expression(const Operand &dst, const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
-
- virtual void op_assign(const Operand &dst_name, const Operand &src_name) = 0;
-
- virtual void
- op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) = 0;
-
- virtual void op_binary_elementwise_function(const Operand &dst_name,
- BinaryFunction func,
- const Operand &first_name,
- const Operand &second_name) = 0;
-
- virtual void op_ternary_elementwise_function(const Operand &dst_name,
- TernaryFunction func,
- const Operand &first_name,
- const Operand &second_name,
- const Operand &third_name) = 0;
-
- virtual void op_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
-
- virtual void op_else_if_header(const Operand &lhs, BinaryOp op, const Operand &rhs) = 0;
-
- virtual void op_else_header() = 0;
-
- virtual void op_for_loop_header(const Operand &var_name,
- BinaryOp cond_op,
- const Operand &cond_value,
- const Operand &update_var,
- AssignmentOp update_op,
- const Operand &update_value) = 0;
-
- virtual void op_load_indirect(const TensorOperand &tensor,
- const Operand &dst,
- const Operand &x,
- const Operand &y_indirect,
- const Operand &z,
- const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0;
-
- virtual void op_load_immediate(const TensorOperand &tensor,
- const Operand &dst,
- const Operand &x,
- const Operand &y,
- const Operand &z,
- const Operand &b = Operand("0", OperandType::ScalarInt32),
- const Operand &dilation_y = Operand("1", OperandType::ScalarInt32)) = 0;
-
- virtual void op_store_immediate(const TensorOperand &tensor,
- const Operand &src,
- const Operand &x,
- const Operand &y,
- const Operand &z,
- const Operand &b = Operand("0", OperandType::ScalarInt32)) = 0;
-
- virtual void op_cast_expression(const Operand &dst, const Operand &src, ConvertPolicy policy) = 0;
-
- virtual void op_return() = 0;
-
- // Utils
- // It is the process of converting
- virtual void util_get_indirect_buffer(const Operand &dst,
- const TensorOperand &tensor,
- const Operand &x,
- const Operand &y,
- const Operand &x_off,
- const Operand &y_off) = 0;
-};
-
-enum class GpuLoadStoreType
-{
- Load = 1,
- Store = 2
-};
-
-class IGpuLoadStoreHelperWriter
-{
-public:
- IGpuLoadStoreHelperWriter(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type)
- : _writer(x), _mapper(mapper), _type(type)
- {
- }
-
- IGpuLoadStoreHelperWriter(const IGpuLoadStoreHelperWriter &) = default;
-
- IGpuLoadStoreHelperWriter &operator=(const IGpuLoadStoreHelperWriter &) = default;
-
- virtual ~IGpuLoadStoreHelperWriter() = default;
-
- virtual void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) = 0;
-
- virtual void write(const std::pair<int32_t, std::string> &y) = 0;
-
- virtual void finalize() = 0;
-
-protected:
- IGpuKernelWriter *_writer;
- GpuTensor3dMapper _mapper;
- GpuLoadStoreType _type;
-};
-
-class ClLoadStoreBufferHelperWriter : public IGpuLoadStoreHelperWriter
-{
-public:
- ClLoadStoreBufferHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type)
- : IGpuLoadStoreHelperWriter(x, mapper, type)
- {
- }
-
- ClLoadStoreBufferHelperWriter(const ClLoadStoreBufferHelperWriter &) = default;
-
- ClLoadStoreBufferHelperWriter &operator=(const ClLoadStoreBufferHelperWriter &) = default;
-
- static bool validate(IGpuKernelWriter *x, GpuTensor3dMapper mapper, GpuLoadStoreType type, IVectorTile *dst)
- {
- CKW_UNUSED(x, type, dst);
-
- if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::BufferUint8Ptr)
- {
- return false;
- }
- return true;
- }
-
- void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
- {
- assert(validate(_writer, _mapper, _type, dst));
-
- _dst = dst;
- _ls_width_full = dst->format().w;
-
- _coord_x = x->scalar(0, 0).str;
- _coord_z = z->scalar(0, 0).str;
- _coord_b = b->scalar(0, 0).str;
- _coord_orig_z = _coord_z;
-
- out_of_bound_initialize_x(_coord_x);
- out_of_bound_initialize_z(_coord_z);
-
- /*
- meaning of else:
- - x: partial load/store
- - y: no load/store operation
- - z: no load/store operation
- if(x)
- {
- if(z)
- {
- if(y)
- {
- // full load/store width
- }
- else
- {
- // no load/store
- }
- }
- else
- {
- // no load/store
- }
- }
- else
- {
- if(z)
- {
- if(y)
- {
- // partial load/store width
- }
- else
- {
- // no load/store
- }
- }
- else
- {
- // no load/store
- }
- }
- */
- }
-
- void write(const std::pair<int32_t, std::string> &y) override
- {
- int32_t idx_y = y.first;
- std::string coord_y = y.second;
-
- // The only check required is on Y.
- out_of_bound_initialize_y(coord_y);
-
- const std::string dst = _dst->vector(idx_y).str;
- const std::string address = to_ls_buffer_address(_coord_x, coord_y, _coord_z, _coord_b);
- const std::string ls_buf = to_ls_buffer(_type, _ls_width_full, dst, address);
-
- _writer->write_text(ls_buf);
- _writer->write_text(";\n");
-
- out_of_bound_finalize_y(dst);
-
- // The left over load/store will be written in the finalize stage
- if (_ls_width_part.size() != 0)
- {
- int32_t w = 0;
- for (auto &p : _ls_width_part)
- {
- const std::string dst0 = _dst->vector(w, p, idx_y).str;
- const std::string coord_x = _coord_x + " + " + std::to_string(w);
- const std::string address = to_ls_buffer_address(coord_x, coord_y, _coord_z, _coord_b);
- const std::string ls_buf0 = to_ls_buffer(_type, p, dst0, address);
- _leftovers_x.push_back(std::make_pair(std::make_pair(dst0, coord_y), ls_buf0));
-
- w += p;
- }
- }
- }
-
- void finalize() override
- {
- out_of_bound_finalize_z();
- out_of_bound_finalize_x();
- }
-
-private:
- IVectorTile *_dst{nullptr};
- int32_t _ls_width_full{0};
- std::vector<int32_t> _ls_width_part{};
- std::vector<std::pair<std::pair<std::string, std::string>, std::string>> _leftovers_x{};
- std::string _coord_x{};
- std::string _coord_z{};
- std::string _coord_orig_z{};
- std::string _coord_b{};
-
- void out_of_bound_initialize_x(std::string &coord)
- {
- if (_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
- {
- auto tensor_format = _mapper.tensor_argument()->format();
- auto shape = tensor_format.shape;
-
- _ls_width_part = decompose_leftover_ls_vector_width(shape[0] % _ls_width_full);
- if (_ls_width_part.size() != 0)
- {
- _writer->write_text("if(" + coord + " > 0)\n");
- _writer->compound_statement_begin();
- }
- }
- };
-
- void out_of_bound_finalize_x()
- {
- if (_mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
- {
- if (_ls_width_part.size() != 0)
- {
- _writer->compound_statement_end();
- _writer->write_text("else\n");
- _writer->compound_statement_begin();
-
- out_of_bound_initialize_z(_coord_orig_z);
- for (auto &i : _leftovers_x)
- {
- out_of_bound_initialize_y(i.first.second);
- _writer->write_text(i.second);
- _writer->write_text(";\n");
- out_of_bound_finalize_y(i.first.first);
- }
- out_of_bound_finalize_z();
- _writer->compound_statement_end();
- }
- }
- };
-
- void out_of_bound_initialize_y(std::string &coord)
- {
- std::string max = "";
-
- const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
- switch (address_mode_y)
- {
- case TensorSamplerAddressModeY::Skip:
- case TensorSamplerAddressModeY::ClampToBorder:
- // NOTE: This line should not be moved outside of the switch statement.
- // The reason for that is because when we query the component, the component is marked as used
- // and added to the list of arguments of the kernel. Since, not in all cases this component is required,
- // we should request the component only when used
- max = _mapper.tensor_component_y();
- _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeY::SkipMinEdgeOnly:
- case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
- _writer->write_text("if(" + coord + " >= 0)\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
- case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
- max = _mapper.tensor_component_y();
- _writer->write_text("if(" + coord + " < " + max + ")\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeY::ClampToNearest:
- max = _mapper.tensor_component_y();
- coord = "clamp(" + coord + ", 0, " + max + " - 1)";
- break;
- case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
- max = _mapper.tensor_component_y();
- coord = "min(" + coord + ", " + max + " - 1)";
- break;
- case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
- coord = "max(" + coord + ", 0)";
- break;
- case TensorSamplerAddressModeY::None:
- break;
- default:
- std::cout << "Unsupported address mode for write_out_of_bound_check_yz" << std::endl;
- assert(false);
- }
- };
-
- void out_of_bound_finalize_y(const std::string &dst)
- {
- const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
- switch (address_mode_y)
- {
- case TensorSamplerAddressModeY::ClampToBorder:
- case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
- case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
- case TensorSamplerAddressModeY::Skip:
- case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
- case TensorSamplerAddressModeY::SkipMinEdgeOnly:
- _writer->compound_statement_end();
- break;
- case TensorSamplerAddressModeY::None:
- break;
-
- default:
- assert(false);
- }
-
- switch (address_mode_y)
- {
- case TensorSamplerAddressModeY::ClampToBorder:
- case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
- case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
- _writer->write_text("else\n");
- _writer->compound_statement_begin();
- _writer->write_text(dst);
- _writer->write_text(" = 0.0f;\n");
- _writer->compound_statement_end();
- break;
- case TensorSamplerAddressModeY::None:
- break;
-
- default:
- assert(false);
- }
- };
-
- void out_of_bound_initialize_z(std::string &coord)
- {
- std::string max = "";
-
- const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
-
- switch (address_mode_z)
- {
- case TensorSamplerAddressModeZ::Skip:
- max = _mapper.tensor_component_z();
- _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
- _writer->write_text("if(" + coord + " >= 0)\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
- max = _mapper.tensor_component_z();
- _writer->write_text("if(" + coord + " < " + max + ")\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeZ::ClampToNearest:
- max = _mapper.tensor_component_z();
- coord = "clamp(" + coord + ", 0, " + max + " - 1)";
- break;
- case TensorSamplerAddressModeZ::ClampToMaxEdgeOnly:
- max = _mapper.tensor_component_z();
- coord = "min(" + coord + ", " + max + " - 1)";
- break;
- case TensorSamplerAddressModeZ::ClampToMinEdgeOnly:
- coord = "max(" + coord + ", 0)";
- break;
- case TensorSamplerAddressModeZ::None:
- break;
- default:
- std::cout << "Unsupported address mode for write_out_of_bound_check_yz" << std::endl;
- assert(false);
- }
- };
-
- void out_of_bound_finalize_z()
- {
- const auto address_mode_z = _mapper.gpu_sampler().address_mode_z;
-
- switch (address_mode_z)
- {
- case TensorSamplerAddressModeZ::Skip:
- case TensorSamplerAddressModeZ::SkipMinEdgeOnly:
- case TensorSamplerAddressModeZ::SkipMaxEdgeOnly:
- _writer->compound_statement_end();
- break;
- case TensorSamplerAddressModeZ::None:
- break;
-
- default:
- assert(false);
- }
- };
-
- std::vector<int32_t> decompose_leftover_ls_vector_width(int32_t ls_leftover_vector_width) const
- {
- std::vector<int32_t> x;
-
- switch (ls_leftover_vector_width)
- {
- case 0:
- break;
- case 1:
- case 2:
- case 3:
- case 4:
- case 8:
- case 16:
- x.push_back(ls_leftover_vector_width);
- break;
- case 5:
- x.push_back(4);
- x.push_back(1);
- break;
- case 6:
- x.push_back(4);
- x.push_back(2);
- break;
- case 7:
- x.push_back(4);
- x.push_back(3);
- break;
- case 9:
- x.push_back(8);
- x.push_back(1);
- break;
- case 10:
- x.push_back(8);
- x.push_back(2);
- break;
- case 11:
- x.push_back(8);
- x.push_back(3);
- break;
- case 12:
- x.push_back(8);
- x.push_back(4);
- break;
- case 13:
- x.push_back(8);
- x.push_back(4);
- x.push_back(1);
- break;
- case 14:
- x.push_back(8);
- x.push_back(4);
- x.push_back(2);
- break;
- case 15:
- x.push_back(8);
- x.push_back(4);
- x.push_back(3);
- break;
-
- default:
- assert(false);
- }
- return x;
- }
-
- std::string
- to_ls_buffer(GpuLoadStoreType type, int32_t vector_width, const std::string &data, const std::string &address)
- {
- switch (type)
- {
- case GpuLoadStoreType::Load:
- if (vector_width != 1)
- {
- return data + " = vload" + std::to_string(vector_width) + "(0, " + address + ")";
- }
- else
- {
- return data + " = *(" + address + ")";
- }
- break;
- case GpuLoadStoreType::Store:
- if (vector_width != 1)
- {
- return "vstore" + std::to_string(vector_width) + "(" + data + ", 0, " + address + ")";
- }
- else
- {
- return "*(" + address + ") = " + data;
- }
- break;
- default:
- std::cout << "Unsupported GpuLoadStoreType" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string
- to_ls_buffer_address(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const
- {
- auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
- assert(tensor_storage == GpuTensorStorage::BufferUint8Ptr);
- const std::string ptr_buf = _mapper.tensor_argument()->storage(tensor_storage);
- const std::string dst_type = get_cl_data_type(_dst->format().dt, 1);
-
- std::string address;
- address += "(__global ";
- address += dst_type;
- address += "*)(";
- address += ptr_buf;
- if (x != "0" && (_mapper.is_one_component_x() != true))
- {
- address += " + (";
- address += x + ") * sizeof(" + dst_type + ")";
- }
- if (y != "0")
- {
- const std::string stride_y = _mapper.tensor_component_stride_y();
- address += " + (";
- address += y + ")";
- address += " * ";
- address += stride_y;
- }
- if (z != "0")
- {
- const std::string stride_z = _mapper.tensor_component_stride_z();
- address += " + (";
- address += z + ")";
- address += " * ";
- address += stride_z;
- }
- if (b != "0" && (_mapper.is_one_component_batch() != true))
- {
- const std::string stride_b = _mapper.tensor_component_stride_batch();
- address += " + (";
- address += b + ")";
- address += " * ";
- address += stride_b;
- }
- address += ")";
- return address;
- }
-};
-
-class ClLoadStoreImage2dHelperWriter : public IGpuLoadStoreHelperWriter
-{
-public:
- static bool validate(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type, IVectorTile *dst)
- {
- CKW_UNUSED(x);
-
- if (dst->format().w != 4)
- {
- return false;
- }
- if (mapper.gpu_sampler().address_mode_x != TensorSamplerAddressModeX::None)
- {
- return false;
- }
- if (mapper.gpu_sampler().address_mode_z != TensorSamplerAddressModeZ::None)
- {
- return false;
- }
- if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dReadOnly && type == GpuLoadStoreType::Load)
- {
- return false;
- }
- if (mapper.gpu_sampler().storage != GpuSamplerTensorStorage::Image2dWriteOnly &&
- type == GpuLoadStoreType::Store)
- {
- return false;
- }
- if ((dst->format().dt != DataType::Fp32) && (dst->format().dt != DataType::Fp16))
- {
- return false;
- }
- return true;
- /*
- - x: Only GpuSamplerAddressModeX::None is supported and vector length = 4
- - z: Only GpuSamplerAddressModeZ::None is supported
- */
- }
-
- ClLoadStoreImage2dHelperWriter(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type)
- : IGpuLoadStoreHelperWriter(x, mapper, type)
- {
- }
-
- ClLoadStoreImage2dHelperWriter(const ClLoadStoreImage2dHelperWriter &) = default;
-
- ClLoadStoreImage2dHelperWriter &operator=(const ClLoadStoreImage2dHelperWriter &) = default;
-
- void initialize(IVectorTile *dst, IVectorTile *x, IVectorTile *z, IVectorTile *b) override
- {
- assert(validate(_writer, _mapper, _type, dst));
-
- _dst = dst;
- _ls_width_full = dst->format().w;
- _coord_x = x->scalar(0, 0).str;
- _coord_z = z->scalar(0, 0).str;
- _coord_b = b->scalar(0, 0).str;
-
- /*
- if(y)
- {
- // full load/store width
- }
- else
- {
- // no load/store
- }
- */
- }
-
- void write(const std::pair<int32_t, std::string> &y) override
- {
- int32_t idx_y = y.first;
- std::string coord_y = y.second;
-
- // The only check required is on Y.
- out_of_bound_initialize_y(coord_y);
-
- const std::string dst = _dst->vector(idx_y).str;
- const std::string sampler = to_ls_image2d_sampler();
- const std::string coord = to_ls_image2d_coord(_coord_x, coord_y, _coord_z, _coord_b);
- const std::string ls_buf = to_ls_image2d(_type, _ls_width_full, dst, sampler, coord);
-
- _writer->write_text(ls_buf);
- _writer->write_text(";\n");
-
- out_of_bound_finalize_y(dst);
- }
-
- void finalize() override
- {
- }
-
-private:
- IVectorTile *_dst{nullptr};
- int32_t _ls_width_full{0};
- std::string _coord_x{};
- std::string _coord_z{};
- std::string _coord_b{};
-
- void out_of_bound_initialize_y(std::string &coord)
- {
- std::string max = "";
-
- const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
- switch (address_mode_y)
- {
- case TensorSamplerAddressModeY::Skip:
- max = _mapper.tensor_component_y();
- _writer->write_text("if((" + coord + " >= 0) && (" + coord + " < " + max + "))\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeY::SkipMinEdgeOnly:
- _writer->write_text("if(" + coord + " >= 0)\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
- max = _mapper.tensor_component_y();
- _writer->write_text("if(" + coord + " < " + max + ")\n");
- _writer->compound_statement_begin();
- break;
- case TensorSamplerAddressModeY::ClampToBorder:
- case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
- case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
- case TensorSamplerAddressModeY::ClampToNearest:
- case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
- case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
- case TensorSamplerAddressModeY::None:
- break;
- default:
- std::cout << "Unsupported address mode for write_out_of_bound_check_y" << std::endl;
- assert(false);
- }
- };
-
- void out_of_bound_finalize_y(const std::string &dst)
- {
- CKW_UNUSED(dst);
-
- const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
- switch (address_mode_y)
- {
- case TensorSamplerAddressModeY::Skip:
- case TensorSamplerAddressModeY::SkipMinEdgeOnly:
- case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
- _writer->compound_statement_end();
- break;
-
- default:
- assert(false);
- }
- };
-
- std::string to_ls_image2d(GpuLoadStoreType type,
- int32_t vector_width,
- const std::string &data,
- const std::string &sampler,
- const std::string &coord)
- {
- CKW_UNUSED(vector_width);
-
- auto tensor_storage = static_cast<GpuTensorStorage>(_mapper.gpu_sampler().storage);
- const std::string image2d_obj = _mapper.tensor_argument()->storage(tensor_storage);
- const std::string post_fix = _dst->format().dt == DataType::Fp32 ? "f" : "h";
-
- switch (type)
- {
- case GpuLoadStoreType::Load:
- return data + " = read_image" + post_fix + "(" + image2d_obj + ", " + sampler + ", " + coord + ")";
- break;
- case GpuLoadStoreType::Store:
- return "write_image" + post_fix + "(" + image2d_obj + ", " + coord + ", " + data + ")";
- default:
- assert(false);
- std::cout << "Unsupported GpuLoadStoreType" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string to_ls_image2d_sampler() const
- {
- const auto address_mode_y = _mapper.gpu_sampler().address_mode_y;
-
- switch (address_mode_y)
- {
- case TensorSamplerAddressModeY::None:
- return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST";
- case TensorSamplerAddressModeY::Skip:
- case TensorSamplerAddressModeY::SkipMinEdgeOnly:
- case TensorSamplerAddressModeY::SkipMaxEdgeOnly:
- case TensorSamplerAddressModeY::ClampToBorder:
- case TensorSamplerAddressModeY::ClampToBorderMinEdgeOnly:
- case TensorSamplerAddressModeY::ClampToBorderMaxEdgeOnly:
- return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST";
- case TensorSamplerAddressModeY::ClampToNearest:
- case TensorSamplerAddressModeY::ClampToMaxEdgeOnly:
- case TensorSamplerAddressModeY::ClampToMinEdgeOnly:
- return "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST";
- default:
- std::cout << "Unsupported address_mode_coord" << std::endl;
- assert(false);
- return "";
- }
- }
-
- std::string
- to_ls_image2d_coord(const std::string &x, const std::string &y, const std::string &z, const std::string &b) const
- {
- std::string coord_x = "(" + x + ") >> 2";
- std::string coord_y = "(";
-
- if (y != "0")
- {
- coord_y += y;
- }
- if (z != "0" && (_mapper.is_one_component_z() != true))
- {
- const std::string dim = _mapper.tensor_component_y();
- coord_y += " + (";
- coord_y += z + ")";
- coord_y += " * ";
- coord_y += dim;
- }
- if (b != "0" && (_mapper.is_one_component_batch() != true))
- {
- const std::string dim0 = _mapper.tensor_component_y();
- const std::string dim1 = _mapper.tensor_component_z();
- coord_y += " + (";
- coord_y += b + ")";
- coord_y += " * ";
- coord_y += dim0;
- coord_y += " * ";
- coord_y += dim1;
- }
- coord_y += ")";
- return "(int2)(" + coord_x + ", " + coord_y + ")";
- }
-};
-
-/** IGpuLoadStoreHelperWriter factory class */
-class ClLoadStoreHelperWriterFactory final
-{
-public:
- /** Static method to call the IGpuLoadStoreHelperWriter class accordingly with the tensor storage set in the mapper
- *
- *
- * @return IGpuLoadStoreHelperWriter
- */
- static std::unique_ptr<IGpuLoadStoreHelperWriter>
- create(IGpuKernelWriter *x, const GpuTensor3dMapper &mapper, GpuLoadStoreType type)
- {
- const auto tensor_storage = mapper.gpu_sampler().storage;
- switch (tensor_storage)
- {
- case GpuSamplerTensorStorage::BufferUint8Ptr:
- return std::make_unique<ClLoadStoreBufferHelperWriter>(x, mapper, type);
- case GpuSamplerTensorStorage::Image2dReadOnly:
- case GpuSamplerTensorStorage::Image2dWriteOnly:
- return std::make_unique<ClLoadStoreImage2dHelperWriter>(x, mapper, type);
- default:
- std::cout << "Unsupported Gpu tensor storage" << std::endl;
- assert(false);
- return nullptr;
- }
- }
-};
-
-// This utility method needs to go in utils.h
-inline bool is_tile_scalar(const IVectorTile *x)
-{
- return x->format().w == 1 && x->format().h == 1;
-}
-
-class ClKernelWriter : public IGpuKernelWriter
-{
-public:
- ClKernelWriter(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
- {
- _data = x;
- _attr = attr;
- }
-
- ClKernelWriter(const ClKernelWriter &) = default;
-
- ClKernelWriter &operator=(const ClKernelWriter &) = default;
-
- // A IdSpaced ID is a term used to describe a fragment that is registered in ICode to ensure
- // there are no conflicts or ambiguity in the code
- void set_IdSpace(int32_t id) override
- {
- _data->tiles.set_IdSpace(id);
- _data->arguments.set_IdSpace(id);
- }
-
- void import_tile(const std::string &dst_name, const IVectorTile *src) override
- {
- _data->tiles.insert(dst_name, src);
- }
-
- void declare_argument(const std::string &name, const TensorInfo &tensor) override
- {
- assert(_data->arguments[name] == nullptr);
- _data->arguments.insert(name, tensor, _attr->return_tensor_component_by_value);
- }
-
- void declare_tile(const std::string &name, const TileInfo &format) override
- {
- assert(_data->tiles[name] == nullptr);
- _data->tiles.insert(name, format);
-
- IVectorTile *x = _data->tiles[name];
-
- for (auto &t : x->underlying_source_variables())
- {
- _data->code += t.type.str + " " + t.str + ";\n";
- }
- }
-
- void
- declare_const_tile(const std::string &name, const std::vector<std::vector<std::string>> &in, DataType dt) override
- {
- assert(_data->tiles[name] == nullptr);
- _data->tiles.insert(name, in, dt);
- // Note: A constant does not need to be declared in the code
- }
-
- void write_text(const std::string &x) override
- {
- _data->code += x;
- }
-
- void compound_statement_begin() override
- {
- _data->tiles.increment_registry_level();
- _data->code += "{\n";
- }
-
- void compound_statement_end() override
- {
- _data->tiles.decrement_registry_level();
- _data->code += "}\n";
- }
-
- void op_get_global_id(const Operand &dst_var, int32_t dim) override
- {
- assert(dst_var.type() == OperandType::Tile);
- assert(_data->tiles.has_tile(dst_var.value()));
- assert(_data->tiles[dst_var.value()]->format().w == 1 &&
- _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
-
- auto var = _data->tiles[dst_var.value()];
-
- _data->code += var->scalar(0, 0).str;
- _data->code += " = get_global_id(";
- _data->code += std::to_string(dim);
- _data->code += ");\n";
- };
-
- void op_get_global_coord(const Operand &o_dst,
- const Operand &o_step,
- const TensorOperand &o_tensor,
- int32_t dim) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- auto dst = operands.unpack(o_dst);
- auto step = operands.unpack(o_step);
-
- // Validation: Check that x, y and z are scalar
-
- TensorOperandUnpacker tensor_operands(_data->arguments);
- auto tensor = tensor_operands.unpack(o_tensor);
- auto gpu_sampler = o_tensor.sampler();
-
- GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
- switch (dim)
- {
- case 0:
- if (mapper.is_one_component_x())
- {
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = 0;\n";
- }
- else
- {
- if (mapper.gpu_sampler().address_mode_x == TensorSamplerAddressModeX::OverlappingMin)
- {
- // Validation: Check: fixed tensor shape
- // TO BE CHANGED
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = get_global_id(0) * ";
- _data->code += step->scalar(0, 0).str;
- _data->code += ";\n";
- }
- else
- {
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = get_global_id(0) * ";
- _data->code += step->scalar(0, 0).str;
- _data->code += ";\n";
- }
- }
- break;
- case 1:
- if (mapper.is_one_component_y())
- {
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = 0;\n";
- }
- else
- {
- if (mapper.gpu_sampler().address_mode_y == TensorSamplerAddressModeY::OverlappingMin)
- {
- }
- else
- {
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = get_global_id(1) * ";
- _data->code += step->scalar(0, 0).str;
- _data->code += ";\n";
- }
- }
- break;
- case 2:
- if (mapper.is_one_component_z())
- {
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = 0;\n";
- }
- else
- {
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = get_global_id(2) * ";
- _data->code += step->scalar(0, 0).str;
- _data->code += ";\n";
- }
- break;
- default:
- break;
- }
- };
-
- void op_get_global_batch(const Operand &o_dst, const TensorOperand &o_tensor) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *dst = operands.unpack(o_dst);
-
- TensorOperandUnpacker tensor_operands(_data->arguments);
- IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor);
- auto gpu_sampler = o_tensor.sampler();
-
- GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
- if (mapper.is_one_component_batch())
- {
- _data->code += dst->scalar(0, 0).str;
- _data->code += " = 0;\n";
- }
- else
- {
- std::cout << "Unsupported batched computation" << std::endl;
- assert(false);
- }
- };
-
- void op_get_global_size(const Operand &dst_var, int32_t dim) override
- {
- assert(dst_var.type() == OperandType::Tile);
- assert(_data->tiles.has_tile(dst_var.value()));
- assert(_data->tiles[dst_var.value()]->format().w == 1 &&
- _data->tiles[dst_var.value()]->format().h == 1); // It must be a scalar variable
-
- auto var = _data->tiles[dst_var.value()];
-
- _data->code += var->scalar(0, 0).str;
- _data->code += " = get_global_size(";
- _data->code += std::to_string(dim);
- _data->code += ");\n";
- }
-
- void op_unary_expression(const Operand &dst_name, UnaryOp op, const Operand &src_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *src = operands.unpack(src_name);
- const IVectorTile *dst = operands.unpack(dst_name);
-
- const int32_t dst_w = dst->format().w;
- const int32_t dst_h = dst->format().h;
- const int32_t src_w = src->format().w;
- const std::string dt = dst->underlying_source_variables()[0].type.str;
-
- const bool broadcast_src_x = dst_w != 1 && src_w == 1;
-
- const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : "";
-
- // Broadcasting on Y is automatic
- for (int32_t y = 0; y < dst_h; ++y)
- {
- _data->code += dst->vector(y).str;
- _data->code += " = ";
- _data->code += to_string(op);
- _data->code += src_prefix + src->vector(y).str;
- _data->code += ";\n";
- }
- }
-
- void op_binary_expression(const Operand &dst_name,
- const Operand &lhs_name,
- BinaryOp op,
- const Operand &rhs_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *lhs = operands.unpack(lhs_name);
- const IVectorTile *rhs = operands.unpack(rhs_name);
- const IVectorTile *dst = operands.unpack(dst_name);
-
- const int32_t dst_w = dst->format().w;
- const int32_t dst_h = dst->format().h;
- assert(lhs != nullptr);
- const int32_t lhs_w = lhs->format().w;
- const int32_t rhs_w = rhs->format().w;
-
- if (op == BinaryOp::MatMul_Nt_T)
- {
- assert((dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16));
- for (int32_t y = 0; y < dst_h; ++y)
- {
- for (int32_t x = 0; x < dst_w; ++x)
- {
- for (int32_t k = 0; k < lhs_w; ++k)
- {
- _data->code += dst->scalar(x, y).str;
- _data->code += " = fma(";
- _data->code += lhs->scalar(k, y).str;
- _data->code += ", ";
- _data->code += rhs->scalar(k, x).str;
- _data->code += ", ";
- _data->code += dst->scalar(x, y).str;
- _data->code += ");\n";
- }
- }
- }
-
- return;
- }
-
- const bool broadcast_lhs_x = dst_w != 1 && lhs_w == 1;
- const bool broadcast_rhs_x = dst_w != 1 && rhs_w == 1;
-
- const std::string lhs_prefix =
- broadcast_lhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
- const std::string rhs_prefix =
- broadcast_rhs_x ? "(" + dst->underlying_source_variables()[0].type.str + ")" : "";
- const std::string op_str = to_string(op);
-
- // Broadcasting on Y is automatic
- for (int32_t y = 0; y < dst_h; ++y)
- {
- _data->code += dst->vector(y).str;
- _data->code += " = ";
- _data->code += lhs_prefix + lhs->vector(y).str;
- _data->code += " ";
- _data->code += op_str;
- _data->code += " ";
- _data->code += rhs_prefix + rhs->vector(y).str;
- _data->code += ";\n";
- }
- };
-
- void op_cast_expression(const Operand &o_dst, const Operand &o_src, ConvertPolicy policy) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *src = operands.unpack(o_src);
- const IVectorTile *dst = operands.unpack(o_dst);
- // const int32_t dst_w = dst->format().w;
- const int32_t dst_h = dst->format().h;
- const std::string dt = dst->underlying_source_variables()[0].type.str;
- const bool is_float = (dst->format().dt == DataType::Fp32) || (dst->format().dt == DataType::Fp16);
- const std::string sat = ((policy == ConvertPolicy::Saturate && !is_float) ? "_sat" : "");
-
- // Broadcasting on Y is automatic
- for (int32_t y = 0; y < dst_h; ++y)
- {
- _data->code += dst->vector(y).str;
- _data->code += " = convert_" + dt + sat + "(";
- _data->code += src->vector(y).str;
- _data->code += ");\n";
- }
- };
-
- void op_assign(const Operand &dst_name, const Operand &src_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *src = operands.unpack(src_name);
- const IVectorTile *dst = operands.unpack(dst_name);
-
- const int32_t dst_w = dst->format().w;
- const int32_t dst_h = dst->format().h;
- const int32_t src_w = src->format().w;
- const std::string dt = dst->underlying_source_variables()[0].type.str;
-
- const bool broadcast_src_x = dst_w != 1 && src_w == 1;
-
- const std::string src_prefix = broadcast_src_x ? "(" + dt + ")" : "";
-
- // Broadcasting on Y is automatic
- for (int32_t y = 0; y < dst_h; ++y)
- {
- _data->code += dst->vector(y).str;
- _data->code += " = ";
- _data->code += src_prefix + src->vector(y).str;
- _data->code += ";\n";
- }
- }
-
- void op_unary_elementwise_function(const Operand &dst_name, UnaryFunction func, const Operand &src_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *src = operands.unpack(src_name);
- const IVectorTile *dst = operands.unpack(dst_name);
-
- const int32_t dst_h = dst->format().h;
- const std::string dt = dst->underlying_source_variables()[0].type.str;
-
- // Always perform an explicit cast. This automatically covers at least the 2 scenarios:
- // 1. Widen a scalar into a vector type. This enables scalar-vector broadcasting
- // 2. Ensure non-ambiguity over function overloads.
- // E.g. a constant tile may be accidentally initialized with a double literal. By casting it to single float,
- // it avoids ambiguous function calls
- const std::string src_prefix = "(" + dt + ")";
-
- // Broadcasting on Y is automatic
- for (int32_t y = 0; y < dst_h; ++y)
- {
- _data->code += dst->vector(y).str;
- _data->code += " = ";
-
- switch (func)
- {
- case UnaryFunction::Exp:
- _data->code += "exp(";
- break;
- case UnaryFunction::Tanh:
- _data->code += "tanh(";
- break;
- case UnaryFunction::Sqrt:
- _data->code += "sqrt(";
- break;
- case UnaryFunction::Erf:
- _data->code += "erf(";
- break;
- case UnaryFunction::Fabs:
- _data->code += "fabs(";
- break;
- case UnaryFunction::Log:
- _data->code += "log(";
- break;
- case UnaryFunction::SizeOf:
- _data->code += "sizeof(";
- break;
- case UnaryFunction::Round:
- _data->code += "round(";
- break;
- case UnaryFunction::Floor:
- _data->code += "floor(";
- break;
- default:
- CKW_ASSERT_MSG(false, "Unexpected UnaryFunction used.");
- }
-
- _data->code += src_prefix + src->vector(y).str;
- _data->code += ");\n";
- }
- }
-
- void op_binary_elementwise_function(const Operand &dst_name,
- BinaryFunction func,
- const Operand &first_name,
- const Operand &second_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *first = operands.unpack(first_name);
- const IVectorTile *second = operands.unpack(second_name);
- const IVectorTile *dst = operands.unpack(dst_name);
-
- const int32_t dst_h = dst->format().h;
- const auto datatype = dst->underlying_source_variables()[0].type;
- const std::string datatype_str = datatype.str;
-
- // Always perform an explicit cast. See similar comments in op_unary_elementwise_function
- const std::string first_prefix = "(" + datatype_str + ")";
- const std::string second_prefix = "(" + datatype_str + ")";
-
- const bool is_float = (datatype.dt == DataType::Fp32 || datatype.dt == DataType::Fp16);
-
- // Broadcasting on Y is automatic
- for (int32_t y = 0; y < dst_h; ++y)
- {
- _data->code += dst->vector(y).str;
- _data->code += " = ";
-
- switch (func)
- {
- case BinaryFunction::Min:
- _data->code += is_float ? "fmin(" : "min(";
- break;
- case BinaryFunction::Max:
- _data->code += is_float ? "fmax(" : "max(";
- break;
- default:
- CKW_ASSERT_MSG(false, "Unexpected BinaryFunction used.");
- }
-
- _data->code += first_prefix + first->vector(y).str;
- _data->code += ", ";
- _data->code += second_prefix + second->vector(y).str;
- _data->code += ");\n";
- }
- }
-
- void op_ternary_elementwise_function(const Operand &dst_name,
- TernaryFunction func,
- const Operand &first_name,
- const Operand &second_name,
- const Operand &third_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *first = operands.unpack(first_name);
- const IVectorTile *second = operands.unpack(second_name);
- const IVectorTile *third = operands.unpack(third_name);
- const IVectorTile *dst = operands.unpack(dst_name);
-
- const int32_t dst_h = dst->format().h;
- const std::string dt = dst->underlying_source_variables()[0].type.str;
-
- // Always perform an explicit cast. See similar comments in op_unary_elementwise_function
- const std::string first_prefix = "(" + dt + ")";
- const std::string second_prefix = "(" + dt + ")";
- const std::string third_prefix = "(" + dt + ")";
-
- // Broadcasting on Y is automatic
- for (int32_t y = 0; y < dst_h; ++y)
- {
- _data->code += dst->vector(y).str;
- _data->code += " = ";
-
- switch (func)
- {
- case TernaryFunction::Select:
- _data->code += "select(";
- break;
- case TernaryFunction::Clamp:
- _data->code += "clamp(";
- break;
- default:
- CKW_ASSERT_MSG(false, "Unexpected TernaryFunction used.");
- }
-
- _data->code += first_prefix + first->vector(y).str;
- _data->code += ", ";
- _data->code += second_prefix + second->vector(y).str;
- _data->code += ", ";
- _data->code += third_prefix + third->vector(y).str;
- _data->code += ");\n";
- }
- }
-
- void op_if_header(const Operand &o_lhs, BinaryOp op, const Operand &o_rhs) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *lhs = operands.unpack(o_lhs);
- const IVectorTile *rhs = operands.unpack(o_rhs);
-
- assert(is_tile_scalar(lhs));
- assert(is_tile_scalar(rhs));
-
- _data->code += "if(";
- _data->code += lhs->scalar(0, 0).str;
- _data->code += " ";
- _data->code += to_string(op);
- _data->code += " ";
- _data->code += rhs->scalar(0, 0).str;
- _data->code += ")\n";
- }
-
- void op_else_if_header(const Operand &o_lhs, BinaryOp op, const Operand &o_rhs) override
- {
- _data->code += "else ";
- op_if_header(o_lhs, op, o_rhs);
- }
-
- void op_else_header() override
- {
- _data->code += "else\n";
- }
-
- void op_for_loop_header(const Operand &var_name,
- BinaryOp cond_op,
- const Operand &cond_value_name,
- const Operand &update_var_name,
- AssignmentOp update_op,
- const Operand &update_value_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *var = operands.unpack(var_name);
- const IVectorTile *cond_value = operands.unpack(cond_value_name);
- const IVectorTile *update_var = operands.unpack(update_var_name);
- const IVectorTile *update_value = operands.unpack(update_value_name);
-
- const int32_t dst_w = var->format().w;
- const int32_t dst_h = var->format().h;
-
- // It must be a scalar variable
- CKW_UNUSED(dst_w, dst_h);
- assert(dst_w == 1);
- assert(dst_h == 1);
-
- _data->code += "for(; ";
- _data->code += var->scalar(0, 0).str;
- _data->code += " ";
- _data->code += to_string(cond_op);
- _data->code += " " + cond_value->scalar(0, 0).str + "; ";
- _data->code += update_var->scalar(0, 0).str;
- _data->code += " ";
- _data->code += to_string(update_op);
- _data->code += " " + update_value->scalar(0, 0).str + ")";
- _data->code += "\n";
- }
-
- void op_load_immediate(const TensorOperand &o_tensor,
- const Operand &o_dst,
- const Operand &o_x,
- const Operand &o_y,
- const Operand &o_z,
- const Operand &o_batch_idx,
- const Operand &dilation_y) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
-
- // Not const as it requires changes to 'load_writer'.
- IVectorTile *dst = operands.unpack(o_dst);
- IVectorTile *x = operands.unpack(o_x);
- IVectorTile *y = operands.unpack(o_y);
- IVectorTile *z = operands.unpack(o_z);
- IVectorTile *dil_y = operands.unpack(dilation_y);
- IVectorTile *b = operands.unpack(o_batch_idx);
-
- TensorOperandUnpacker tensor_operands(_data->arguments);
- IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor);
- auto gpu_sampler = o_tensor.sampler();
-
- GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
- auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);
-
- // Initialize the constant part
- load_writer->initialize(dst, x, z, b);
-
- for (int i = 0; i < dst->format().h; ++i)
- {
- std::string coord_y = y->scalar(0, 0).str + " + " + std::to_string(i);
- if (dil_y->scalar(0, 0).str != "1")
- {
- coord_y += " * " + dil_y->scalar(0, 0).str;
- }
- load_writer->write(std::make_pair(i, coord_y));
- }
-
- load_writer->finalize();
- }
-
- void op_load_indirect(const TensorOperand &o_tensor,
- const Operand &o_dst,
- const Operand &o_x,
- const Operand &o_indirect_h,
- const Operand &o_z,
- const Operand &o_batch_idx) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
-
- // Not const as it requires changes to 'load_writer'.
- IVectorTile *dst = operands.unpack(o_dst);
- IVectorTile *x = operands.unpack(o_x);
- IVectorTile *y_ind = operands.unpack(o_indirect_h);
- IVectorTile *z = operands.unpack(o_z);
- IVectorTile *b = operands.unpack(o_batch_idx);
-
- TensorOperandUnpacker tensor_operands(_data->arguments);
- IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor);
- auto gpu_sampler = o_tensor.sampler();
-
- GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
- auto load_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Load);
-
- // Initialize the constant part
- load_writer->initialize(dst, x, z, b);
-
- for (int i = 0; i < dst->format().h; ++i)
- {
- load_writer->write(std::make_pair(i, y_ind->scalar(0, i).str));
- }
-
- load_writer->finalize();
- }
-
- void op_store_immediate(const TensorOperand &tensor_name,
- const Operand &src_name,
- const Operand &x_name,
- const Operand &y_name,
- const Operand &z_name,
- const Operand &batch_index_name) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
-
- // Not const as it requires changes to 'load_writer'.
- IVectorTile *src = operands.unpack(src_name);
- IVectorTile *x = operands.unpack(x_name);
- IVectorTile *y = operands.unpack(y_name);
- IVectorTile *z = operands.unpack(z_name);
- IVectorTile *b = operands.unpack(batch_index_name);
-
- TensorOperandUnpacker tensor_operands(_data->arguments);
- IGpuTensorArgument *tensor = tensor_operands.unpack(tensor_name);
- auto gpu_sampler = tensor_name.sampler();
-
- GpuTensor3dMapper mapper(tensor, gpu_sampler);
-
- auto store_writer = ClLoadStoreHelperWriterFactory::create(this, mapper, GpuLoadStoreType::Store);
-
- // Initialize the constant part
- store_writer->initialize(src, x, z, b);
-
- int32_t tile_h = src->format().h;
-
- for (int m0 = tile_h - 1; m0 >= 0; m0--)
- {
- store_writer->write(std::make_pair(m0, y->scalar(0, 0).str + " + " + std::to_string(m0)));
- }
-
- store_writer->finalize();
- }
-
- void op_return() override
- {
- _data->code += "return;\n";
- }
-
- void util_get_indirect_buffer(const Operand &o_dst,
- const TensorOperand &o_tensor,
- const Operand &o_x,
- const Operand &o_y,
- const Operand &o_x_off,
- const Operand &o_y_off) override
- {
- OperandUnpacker operands(_data->tiles, _data->arguments);
- const IVectorTile *dst = operands.unpack(o_dst);
- const IVectorTile *x = operands.unpack(o_x);
- const IVectorTile *y = operands.unpack(o_y);
- const IVectorTile *x_off = operands.unpack(o_x_off);
- const IVectorTile *y_off = operands.unpack(o_y_off);
-
- TensorOperandUnpacker tensor_operands(_data->arguments);
- IGpuTensorArgument *tensor = tensor_operands.unpack(o_tensor);
-
- assert(dst->format().w == 1);
- assert(x->format().w == 1);
- assert(y->format().w == 1);
- assert(x_off->format().w == 1);
- assert(y_off->format().w == 1);
- assert(dst->format().dt == DataType::Int32);
- assert(x->format().dt == DataType::Int32);
- assert(y->format().dt == DataType::Int32);
- assert(x_off->format().dt == DataType::Int32);
- assert(y_off->format().dt == DataType::Int32);
-
- const std::string width = tensor->component(TensorComponentType::Dim1);
- const std::string height = tensor->component(TensorComponentType::Dim2);
- const std::string wxh = tensor->component(TensorComponentType::Dim1xDim2);
- /*
- int x_s;
- int y_s;
- x_s = (xi_0 + x_k);
- y_s = (yi_0 + y_k);
- mi_0 = x_s + y_s * width + b * widthxheight;
- mi_0 = select(-1, mi_0, x_s >= 0);
- mi_0 = select(-1, mi_0, y_s >= 0);
- mi_0 = select(-1, mi_0, x_s < 128);
- mi_0 = select(-1, mi_0, y_s < 128);
- */
- compound_statement_begin();
- declare_tile("_x_s", TileInfo(DataType::Int32));
- declare_tile("_y_s", TileInfo(DataType::Int32));
- auto x_s = operands.unpack(Operand("_x_s"));
- auto y_s = operands.unpack(Operand("_y_s"));
- for (int i = 0; i < dst->format().h; ++i)
- {
- // x_s = (xi_0 + x_k);
- // y_s = (yi_0 + y_k);
- _data->code += x_s->scalar(0, i).str;
- _data->code += " = (";
- _data->code += x->scalar(0, i).str;
- _data->code += " + ";
- _data->code += x_off->scalar(0, i).str;
- _data->code += ");\n";
- _data->code += y_s->scalar(0, i).str;
- _data->code += " = (";
- _data->code += y->scalar(0, i).str;
- _data->code += " + ";
- _data->code += y_off->scalar(0, i).str;
- _data->code += ");\n";
- // mi_0 = x_s + y_s * width;
- _data->code += dst->scalar(0, i).str;
- _data->code += " = ";
- _data->code += x_s->scalar(0, i).str;
- _data->code += " + ";
- _data->code += y_s->scalar(0, i).str;
- _data->code += " * " + width + ";\n";
- // mi_0 = select(wxh, mi_0, x_s >= 0);
- _data->code += dst->scalar(0, i).str;
- _data->code += " = select(-1, ";
- _data->code += dst->scalar(0, i).str;
- _data->code += ", ";
- _data->code += x_s->scalar(0, i).str;
- _data->code += " >= 0);\n";
- // mi_0 = select(wxh, mi_0, x_s < width);
- _data->code += dst->scalar(0, i).str;
- _data->code += " = select(-1, ";
- _data->code += dst->scalar(0, i).str;
- _data->code += ", ";
- _data->code += x_s->scalar(0, i).str;
- _data->code += " < ";
- _data->code += width + ");\n";
- // mi_0 = select(wxh, mi_0, y_s >= 0);
- _data->code += dst->scalar(0, i).str;
- _data->code += " = select(-1, ";
- _data->code += dst->scalar(0, i).str;
- _data->code += ", ";
- _data->code += y_s->scalar(0, i).str;
- _data->code += " >= 0);\n";
- // mi_0 = select(wxh, mi_0, y_s < height);
- _data->code += dst->scalar(0, i).str;
- _data->code += " = select(-1, ";
- _data->code += dst->scalar(0, i).str;
- _data->code += ", ";
- _data->code += y_s->scalar(0, i).str;
- _data->code += " < ";
- _data->code += height + ");\n";
- }
- compound_statement_end();
- }
-
-private:
- GpuKernelWriterDataHolder *_data{nullptr};
- GpuKernelWriterAttribute *_attr{nullptr};
-};
-
-/** IGpuKernelWriter factory class */
-class GpuKernelWriterFactory final
-{
-public:
- /** Static method to call the IGpuKernelWriter class accordingly with the Gpu programming language
- *
- * @param[in] gpu GPU target
- *
- * @return IGpuKernelWriter
- */
- static std::unique_ptr<IGpuKernelWriter> create(GpuKernelWriterAttribute *attr, GpuKernelWriterDataHolder *x)
- {
- switch (x->programming_language())
- {
- case GpuTargetLanguage::OpenCL:
- return std::make_unique<ClKernelWriter>(attr, x);
- default:
- std::cout << "Unsupported Gpu programming language" << std::endl;
- assert(false);
- return nullptr;
- }
- }
-};
-
-inline int32_t
-adjust_step(TensorSamplerFormat tensor_format, int32_t step, const TensorInfo *tensor_info_id, int32_t idx)
-{
- auto tensor = tensor_info_id->shape;
-
- int32_t dim[3] = {0};
-
- switch (tensor_format)
- {
- case TensorSamplerFormat::C_W_H:
- dim[0] = tensor[0];
- dim[1] = tensor[1];
- dim[2] = tensor[2];
- break;
- case TensorSamplerFormat::C_WH_1:
- dim[0] = tensor[0];
- dim[1] = tensor[1] * tensor[2];
- dim[2] = 1;
- break;
- default:
- std::cout << "Unsupported tensor format" << std::endl;
- assert(false);
- break;
- }
-
- return std::min(step, dim[idx]);
-}
-
-} // namespace prototype
-} // namespace ckw
-
-#endif // CKW_PROTOTYPE_SRC_PROTOTYPE_H