diff options
-rw-r--r-- | examples/cl_sgemm.cpp | 212 | ||||
-rw-r--r-- | utils/GraphUtils.cpp | 1 | ||||
-rw-r--r-- | utils/Utils.cpp | 37 | ||||
-rw-r--r-- | utils/Utils.h | 378 |
4 files changed, 583 insertions, 45 deletions
diff --git a/examples/cl_sgemm.cpp b/examples/cl_sgemm.cpp new file mode 100644 index 0000000000..8808f7ebf5 --- /dev/null +++ b/examples/cl_sgemm.cpp @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CL /* Needed by Utils.cpp to handle OpenCL exceptions properly */ +#error "This example needs to be built with -DARM_COMPUTE_CL" +#endif /* ARM_COMPUTE_CL */ + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLFunctions.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/CLTuner.h" +#include "utils/Utils.h" + +using namespace arm_compute; +using namespace utils; + +void main_cl_sgemm(int argc, const char **argv) +{ + NPYLoader npy0, npy1, npy2; + CLImage src0, src1, src2, dst; + int alpha = 1, beta = 0; + + CLTuner tuner; + CLScheduler::get().default_init(&tuner); + + std::ifstream stream; + if(argc > 1) + { + stream.open(argv[1], std::fstream::in); + } + + if(argc < 3 || (argc < 4 && stream.bad())) + { + // Print help + std::cout << "Usage: 1) ./build/cl_sgemm input_matrix_1.npy input_matrix_2.npy [input_matrix_3.npy] [alpha = 1] [beta = 0]\n"; + std::cout << " 2) ./build/cl_sgemm M N K [alpha = 1] [beta = 0]\n\n"; + std::cout << "Too few or no input_matrices provided, creating random 5x7, 3x5 and 3x7 matrices\n\n"; + + src0.allocator()->init(TensorInfo(5, 7, Format::F32)); + src1.allocator()->init(TensorInfo(3, 5, Format::F32)); + src2.allocator()->init(TensorInfo(3, 7, Format::F32)); + } + else + { + if(stream.good()) /* case file1.npy file2.npy [file3.npy] [alpha = 1] [beta = 0] */ + { + npy0.open(argv[1]); + npy0.init_tensor(src0, Format::F32); + npy1.open(argv[2]); + npy1.init_tensor(src1, Format::F32); + + if(argc > 3) + { + stream.close(); + stream.clear(); + stream.open(argv[3], std::fstream::in); + if(stream.good()) /* case with third file */ + { + npy2.open(argv[3]); + npy2.init_tensor(src2, Format::F32); + + if(argc > 4) + { + alpha = strtol(argv[4], nullptr, 10); + + if(argc > 5) + { + beta = strtol(argv[5], nullptr, 10); + } + } + } + else /* case without third file */ + { + alpha = strtol(argv[3], nullptr, 10); + + if(argc > 4) + { + beta = strtol(argv[4], nullptr, 
10); + } + } + } + } + else /* case M N K [alpha = 1] [beta = 0] */ + { + size_t M = strtol(argv[1], nullptr, 10); + size_t N = strtol(argv[2], nullptr, 10); + size_t K = strtol(argv[3], nullptr, 10); + + src0.allocator()->init(TensorInfo(K, M, Format::F32)); + src1.allocator()->init(TensorInfo(N, K, Format::F32)); + src2.allocator()->init(TensorInfo(N, M, Format::F32)); + + if(argc > 4) + { + alpha = strtol(argv[4], nullptr, 10); + + if(argc > 5) + { + beta = strtol(argv[5], nullptr, 10); + } + } + } + } + + init_sgemm_output(dst, src0, src1, Format::F32); + + // Configure function + CLGEMM sgemm; + sgemm.configure(&src0, &src1, (src2.info()->total_size() > 0) ? &src2 : nullptr, &dst, alpha, beta); + + // Allocate all the images + src0.allocator()->allocate(); + src1.allocator()->allocate(); + dst.allocator()->allocate(); + + // Fill the input images with either the data provided or random data + if(npy0.is_open()) + { + npy0.fill_tensor(src0); + npy1.fill_tensor(src1); + + if(npy2.is_open()) + { + src2.allocator()->allocate(); + npy2.fill_tensor(src2); + } + } + else + { + src2.allocator()->allocate(); + + fill_random_tensor(src0, -1.f, 1.f); + fill_random_tensor(src1, -1.f, 1.f); + fill_random_tensor(src2, -1.f, 1.f); + } + + // Dummy run for CLTuner + sgemm.run(); + + auto start = std::chrono::high_resolution_clock::now(); + + // Execute the function + sgemm.run(); + + // Make sure all the OpenCL jobs are done executing: + CLScheduler::get().sync(); + + auto stop = std::chrono::high_resolution_clock::now(); + + if(!npy0.is_open()) /* If the inputs were not files, print the results */ + { + std::cout << "\nMatrix 1:" << std::endl; + src0.map(true); + src0.print(std::cout, IOFormatInfo()); + src0.unmap(); + + std::cout << "Matrix 2:" << std::endl; + src1.map(true); + src1.print(std::cout, IOFormatInfo()); + src1.unmap(); + + std::cout << "Matrix 3:" << std::endl; + src2.map(true); + src2.print(std::cout, IOFormatInfo()); + src2.unmap(); + + std::cout << "Alpha:" 
<< alpha << "\n\n"; + std::cout << "Beta:" << beta << "\n\n"; + + std::cout << "Output Matrix:" << std::endl; + dst.map(true); + dst.print(std::cout, IOFormatInfo()); + dst.unmap(); + } + else /* Save to .npy file */ + { + save_to_npy(dst, "sgemm_out.npy", npy0.is_fortran()); + } + + auto delta = std::chrono::duration_cast<std::chrono::microseconds>(stop - start); + std::cout << "Time elapsed: " << delta.count() << "us." << std::endl; +} + +/** Main program for sgemm test + * + * @param[in] argc Number of arguments + * @param[in] argv Arguments ( [optional] Matrix A, [optional] Matrix B, [optional] Matrix C, [optional] alpha, [optional] beta ) + */ +int main(int argc, const char **argv) +{ + return utils::run_example(argc, argv, main_cl_sgemm); +} diff --git a/utils/GraphUtils.cpp b/utils/GraphUtils.cpp index bcfc0f7994..15767632c8 100644 --- a/utils/GraphUtils.cpp +++ b/utils/GraphUtils.cpp @@ -32,7 +32,6 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/PixelValue.h" -#include "libnpy/npy.hpp" #include <algorithm> #include <iomanip> diff --git a/utils/Utils.cpp b/utils/Utils.cpp index 5316690a3d..b99afb441c 100644 --- a/utils/Utils.cpp +++ b/utils/Utils.cpp @@ -167,5 +167,42 @@ std::tuple<unsigned int, unsigned int, int> parse_ppm_header(std::ifstream &fs) return std::make_tuple(width, height, max_val); } + +std::tuple<std::vector<unsigned long>, bool, std::string> parse_npy_header(std::ifstream &fs) //NOLINT +{ + std::vector<unsigned long> shape; // NOLINT + + // Check magic bytes and version number + unsigned char v_major = 0; + unsigned char v_minor = 0; + npy::read_magic(fs, &v_major, &v_minor); + + // Read header + std::string header; + if(v_major == 1 && v_minor == 0) + { + header = npy::read_header_1_0(fs); + } + else if(v_major == 2 && v_minor == 0) + { + header = npy::read_header_2_0(fs); + } + else + { + ARM_COMPUTE_ERROR("Unsupported file format version"); + } + + // Parse header + bool fortran_order = false; + std::string typestr; + 
npy::ParseHeader(header, typestr, &fortran_order, shape); + + if(!fortran_order) + { + std::reverse(shape.begin(), shape.end()); + } + + return std::make_tuple(shape, fortran_order, typestr); +} } // namespace utils } // namespace arm_compute diff --git a/utils/Utils.h b/utils/Utils.h index c88de0e16b..1f3d971917 100644 --- a/utils/Utils.h +++ b/utils/Utils.h @@ -30,6 +30,7 @@ #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include "arm_compute/runtime/Tensor.h" +#include "libnpy/npy.hpp" #include "support/ToolchainSupport.h" #ifdef ARM_COMPUTE_CL @@ -41,6 +42,10 @@ #include <cstring> #include <fstream> #include <iostream> +#include <random> +#include <string> +#include <tuple> +#include <vector> namespace arm_compute { @@ -80,6 +85,66 @@ void draw_detection_rectangle(arm_compute::ITensor *tensor, const arm_compute::D */ std::tuple<unsigned int, unsigned int, int> parse_ppm_header(std::ifstream &fs); +/** Parse the npy header from an input file stream. At the end of the execution, + * the file position pointer will be located at the first data element stored in the npy file + * + * @param[in] fs Input file stream to parse + * + * @return A tuple holding the array shape (reversed when the file is not in fortran order), the fortran-order flag and the numpy type string read from the header of the NPY file + */ +std::tuple<std::vector<unsigned long>, bool, std::string> parse_npy_header(std::ifstream &fs); + +/** Obtain numpy type string from DataType. + * + * @param[in] data_type Data type. + * + * @return numpy type string. 
+ */ +inline std::string get_typestring(DataType data_type) +{ + // Check endianness + const unsigned int i = 1; + const char *c = reinterpret_cast<const char *>(&i); + std::string endianness; + if(*c == 1) + { + endianness = std::string("<"); + } + else + { + endianness = std::string(">"); + } + const std::string no_endianness("|"); + + switch(data_type) + { + case DataType::U8: + return no_endianness + "u" + support::cpp11::to_string(sizeof(uint8_t)); + case DataType::S8: + return no_endianness + "i" + support::cpp11::to_string(sizeof(int8_t)); + case DataType::U16: + return endianness + "u" + support::cpp11::to_string(sizeof(uint16_t)); + case DataType::S16: + return endianness + "i" + support::cpp11::to_string(sizeof(int16_t)); + case DataType::U32: + return endianness + "u" + support::cpp11::to_string(sizeof(uint32_t)); + case DataType::S32: + return endianness + "i" + support::cpp11::to_string(sizeof(int32_t)); + case DataType::U64: + return endianness + "u" + support::cpp11::to_string(sizeof(uint64_t)); + case DataType::S64: + return endianness + "i" + support::cpp11::to_string(sizeof(int64_t)); + case DataType::F32: + return endianness + "f" + support::cpp11::to_string(sizeof(float)); + case DataType::F64: + return endianness + "f" + support::cpp11::to_string(sizeof(double)); + case DataType::SIZET: + return endianness + "u" + support::cpp11::to_string(sizeof(size_t)); + default: + ARM_COMPUTE_ERROR("NOT SUPPORTED!"); + } +} + /** Maps a tensor if needed * * @param[in] tensor Tensor to be mapped @@ -350,6 +415,159 @@ private: unsigned int _width, _height; }; +class NPYLoader +{ +public: + NPYLoader() + : _fs(), _shape(), _fortran_order(false), _typestring() + { + } + + /** Open a NPY file and reads its metadata + * + * @param[in] npy_filename File to open + */ + void open(const std::string &npy_filename) + { + ARM_COMPUTE_ERROR_ON(is_open()); + try + { + _fs.exceptions(std::ifstream::failbit | std::ifstream::badbit); + _fs.open(npy_filename, std::ios::in | 
std::ios::binary); + + std::tie(_shape, _fortran_order, _typestring) = parse_npy_header(_fs); + } + catch(const std::ifstream::failure &e) + { + ARM_COMPUTE_ERROR("Accessing %s: %s", npy_filename.c_str(), e.what()); + } + } + /** Return true if a NPY file is currently open */ + bool is_open() + { + return _fs.is_open(); + } + + /** Return true if a NPY file is in fortran order */ + bool is_fortran() + { + return _fortran_order; + } + + /** Initialise an image's metadata with the dimensions of the NPY file currently open + * + * @param[out] tensor Tensor to initialise + * @param[in] format Format to use for the image + */ + template <typename T> + void init_tensor(T &tensor, arm_compute::Format format) + { + ARM_COMPUTE_ERROR_ON(!is_open()); + ARM_COMPUTE_ERROR_ON(format != arm_compute::Format::F32); + + // Use the size of the input NPY tensor + TensorShape shape; + shape.set_num_dimensions(_shape.size()); + for(size_t i = 0; i < _shape.size(); ++i) + { + shape.set(i, _shape.at(i)); + } + + arm_compute::TensorInfo tensor_info(shape, format); + tensor.allocator()->init(tensor_info); + } + + /** Fill a tensor with the content of the currently open NPY file. + * + * @note If the tensor is a CLTensor, the function maps and unmaps the tensor + * + * @param[in,out] tensor Tensor to fill (Must be allocated, and of matching dimensions with the opened NPY). 
+ */ + template <typename T> + void fill_tensor(T &tensor) + { + ARM_COMPUTE_ERROR_ON(!is_open()); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(&tensor, arm_compute::Format::F32); + try + { + // Map buffer if creating a CLTensor + map(tensor, true); + + // Check if the file is large enough to fill the tensor + const size_t current_position = _fs.tellg(); + _fs.seekg(0, std::ios_base::end); + const size_t end_position = _fs.tellg(); + _fs.seekg(current_position, std::ios_base::beg); + + ARM_COMPUTE_ERROR_ON_MSG((end_position - current_position) < tensor.info()->tensor_shape().total_size() * tensor.info()->element_size(), + "Not enough data in file"); + ARM_COMPUTE_UNUSED(end_position); + + // Check if the typestring matches the given one + std::string expect_typestr = get_typestring(tensor.info()->data_type()); + ARM_COMPUTE_ERROR_ON_MSG(_typestring != expect_typestr, "Typestrings mismatch"); + + // Validate tensor shape + ARM_COMPUTE_ERROR_ON_MSG(_shape.size() != tensor.shape().num_dimensions(), "Tensor ranks mismatch"); + if(_fortran_order) + { + for(size_t i = 0; i < _shape.size(); ++i) + { + ARM_COMPUTE_ERROR_ON_MSG(tensor.shape()[i] != _shape[i], "Tensor dimensions mismatch"); + } + } + else + { + for(size_t i = 0; i < _shape.size(); ++i) + { + ARM_COMPUTE_ERROR_ON_MSG(tensor.shape()[i] != _shape[_shape.size() - i - 1], "Tensor dimensions mismatch"); + } + } + + switch(tensor.info()->format()) + { + case arm_compute::Format::F32: + { + // Read data + if(tensor.info()->padding().empty()) + { + // If tensor has no padding read directly from stream. + _fs.read(reinterpret_cast<char *>(tensor.buffer()), tensor.info()->total_size()); + } + else + { + // If tensor has padding accessing tensor elements through execution window. 
+ Window window; + window.use_tensor_dimensions(tensor.info()->tensor_shape()); + + execute_window_loop(window, [&](const Coordinates & id) + { + _fs.read(reinterpret_cast<char *>(tensor.ptr_to_element(id)), tensor.info()->element_size()); + }); + } + + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported format"); + } + + // Unmap buffer if creating a CLTensor + unmap(tensor); + } + catch(const std::ifstream::failure &e) + { + ARM_COMPUTE_ERROR("Loading NPY file: %s", e.what()); + } + } + +private: + std::ifstream _fs; + std::vector<unsigned long> _shape; + bool _fortran_order; + std::string _typestring; +}; + /** Template helper function to save a tensor image to a PPM file. * * @note Only U8 and RGB888 formats supported. @@ -430,6 +648,83 @@ void save_to_ppm(T &tensor, const std::string &ppm_filename) } } +/** Template helper function to save a tensor image to a NPY file. + * + * @note Only F32 format supported. + * @note Only works with 2D tensors. + * @note If the input tensor is a CLTensor, the function maps and unmaps the image + * + * @param[in] tensor The tensor to save as NPY file + * @param[in] npy_filename Filename of the file to create. + * @param[in] fortran_order If true, save matrix in fortran order. 
+ */ +template <typename T> +void save_to_npy(T &tensor, const std::string &npy_filename, bool fortran_order) +{ + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(&tensor, arm_compute::Format::F32); + ARM_COMPUTE_ERROR_ON(tensor.info()->num_dimensions() > 2); + + std::ofstream fs; + + try + { + fs.exceptions(std::ofstream::failbit | std::ofstream::badbit | std::ofstream::eofbit); + fs.open(npy_filename, std::ios::out | std::ios::binary); + + const unsigned int width = tensor.info()->tensor_shape()[0]; + const unsigned int height = tensor.info()->tensor_shape()[1]; + unsigned long shape[2]; + + if(!fortran_order) + { + shape[0] = height, shape[1] = width; + } + else + { + shape[0] = width, shape[1] = height; + } + + // Map buffer if creating a CLTensor + map(tensor, true); + + switch(tensor.info()->format()) + { + case arm_compute::Format::F32: + { + std::vector<float> tmp; /* Used only to get the typestring */ + npy::Typestring typestring_o{ tmp }; + std::string typestring = typestring_o.str(); + + std::ofstream stream(npy_filename, std::ofstream::binary); + npy::WriteHeader(stream, typestring, fortran_order, 2, shape); + + arm_compute::Window window; + window.set(arm_compute::Window::DimX, arm_compute::Window::Dimension(0, width, 1)); + window.set(arm_compute::Window::DimY, arm_compute::Window::Dimension(0, height, 1)); + + arm_compute::Iterator in(&tensor, window); + + arm_compute::execute_window_loop(window, [&](const arm_compute::Coordinates & id) + { + stream.write(reinterpret_cast<const char *>(in.ptr()), sizeof(float)); + }, + in); + + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported format"); + } + + // Unmap buffer if creating a CLTensor + unmap(tensor); + } + catch(const std::ofstream::failure &e) + { + ARM_COMPUTE_ERROR("Writing %s: (%s)", npy_filename.c_str(), e.what()); + } +} + /** Load the tensor with pre-trained data from a binary file * * @param[in] tensor The tensor to be filled. Data type supported: F32. 
@@ -484,56 +779,51 @@ void load_trained_data(T &tensor, const std::string &filename) } } -/** Obtain numpy type string from DataType. - * - * @param[in] data_type Data type. - * - * @return numpy type string. - */ -inline std::string get_typestring(DataType data_type) +template <typename T> +void fill_random_tensor(T &tensor, float lower_bound, float upper_bound) { - // Check endianness - const unsigned int i = 1; - const char *c = reinterpret_cast<const char *>(&i); - std::string endianness; - if(*c == 1) - { - endianness = std::string("<"); - } - else - { - endianness = std::string(">"); - } - const std::string no_endianness("|"); + std::random_device rd; + std::mt19937 gen(rd()); - switch(data_type) + TensorShape shape(tensor.info()->dimension(0), tensor.info()->dimension(1)); + + Window window; + window.set(Window::DimX, Window::Dimension(0, shape.x(), 1)); + window.set(Window::DimY, Window::Dimension(0, shape.y(), 1)); + + map(tensor, true); + + Iterator it(&tensor, window); + + switch(tensor.info()->format()) { - case DataType::U8: - return no_endianness + "u" + support::cpp11::to_string(sizeof(uint8_t)); - case DataType::S8: - return no_endianness + "i" + support::cpp11::to_string(sizeof(int8_t)); - case DataType::U16: - return endianness + "u" + support::cpp11::to_string(sizeof(uint16_t)); - case DataType::S16: - return endianness + "i" + support::cpp11::to_string(sizeof(int16_t)); - case DataType::U32: - return endianness + "u" + support::cpp11::to_string(sizeof(uint32_t)); - case DataType::S32: - return endianness + "i" + support::cpp11::to_string(sizeof(int32_t)); - case DataType::U64: - return endianness + "u" + support::cpp11::to_string(sizeof(uint64_t)); - case DataType::S64: - return endianness + "i" + support::cpp11::to_string(sizeof(int64_t)); - case DataType::F32: - return endianness + "f" + support::cpp11::to_string(sizeof(float)); - case DataType::F64: - return endianness + "f" + support::cpp11::to_string(sizeof(double)); - case DataType::SIZET: 
- return endianness + "u" + support::cpp11::to_string(sizeof(size_t)); + case arm_compute::Format::F32: + { + std::uniform_real_distribution<float> dist(lower_bound, upper_bound); + + execute_window_loop(window, [&](const Coordinates & id) + { + *reinterpret_cast<float *>(it.ptr()) = dist(gen); + }, + it); + + break; + } default: - ARM_COMPUTE_ERROR("NOT SUPPORTED!"); + { + ARM_COMPUTE_ERROR("Unsupported format"); + } } + + unmap(tensor); +} + +template <typename T> +void init_sgemm_output(T &dst, T &src0, T &src1, arm_compute::Format format) +{ + dst.allocator()->init(TensorInfo(src1.info()->dimension(0), src0.info()->dimension(1), format)); } + } // namespace utils } // namespace arm_compute #endif /* __UTILS_UTILS_H__*/ |