diff options
author | Louis Verhaard <louis.verhaard@arm.com> | 2020-12-03 12:26:25 +0100 |
---|---|---|
committer | Louis Verhaard <louis.verhaard@arm.com> | 2020-12-11 12:24:25 +0100 |
commit | 9bfe0f86ea525055954b160a3c678024743030ec (patch) | |
tree | 384bac88fcc613abd357fbbf266ea30e34cbe526 /ethosu | |
parent | 93719a9b8c160de3acf047eacb9196f13167bddb (diff) | |
download | ethos-u-vela-9bfe0f86ea525055954b160a3c678024743030ec.tar.gz |
MLBEDSW-1373: Added search based allocator
Added a new tensor allocator that is based on searching,
implemented in C++ (C++11 compatible).
Change-Id: Ie96e9fcfc8e6c58d1fa53911f37de290eeba88cf
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
Diffstat (limited to 'ethosu')
-rw-r--r-- | ethosu/tensor_allocator/makefile | 50 | ||||
-rw-r--r-- | ethosu/tensor_allocator/search_allocator.cpp | 266 | ||||
-rw-r--r-- | ethosu/tensor_allocator/search_allocator.h | 185 | ||||
-rw-r--r-- | ethosu/tensor_allocator/tensor_allocator_main.cpp | 77 | ||||
-rw-r--r-- | ethosu/tensor_allocator/tensor_allocatormodule.cpp | 92 | ||||
-rw-r--r-- | ethosu/tensor_allocator/test/test_tensor_allocator.py | 30 | ||||
-rw-r--r-- | ethosu/vela/greedy_allocation.py | 26 | ||||
-rw-r--r-- | ethosu/vela/nn_graph.py | 1 | ||||
-rw-r--r-- | ethosu/vela/tensor_allocation.py | 50 | ||||
-rw-r--r-- | ethosu/vela/vela.py | 2 |
10 files changed, 751 insertions, 28 deletions
diff --git a/ethosu/tensor_allocator/makefile b/ethosu/tensor_allocator/makefile new file mode 100644 index 00000000..9636eb94 --- /dev/null +++ b/ethosu/tensor_allocator/makefile @@ -0,0 +1,50 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +# Description: +# Makefile to build tensor_allocator_main + +UNAME=$(shell uname -o) + +CXXFLAGS=--std=c++11 -pedantic-errors -Wall -Werror -Wdate-time +CXXFLAGS+=-fwrapv -fstack-protector-strong -flto -fuse-linker-plugin -ffat-lto-objects -fPIC + +ifeq ($(DEBUG),1) + CXXFLAGS+=-g -O0 -DDEBUG +else + CXXFLAGS+=-O2 +endif + +LIBSRCS=tensor_allocator_main.cpp search_allocator.cpp +LIBHDRS=search_allocator.h + +ifeq ($(UNAME),Cygwin) + TENSOR_ALLOCATOR_EXE=tensor_allocator_main.exe +else + TENSOR_ALLOCATOR_EXE=tensor_allocator_main +endif + +all: tensor_allocator_exe + +.PHONY: tensor_allocator_exe +tensor_allocator_exe: $(TENSOR_ALLOCATOR_EXE) + +clean: + rm -f $(TENSOR_ALLOCATOR_EXE) + +$(TENSOR_ALLOCATOR_EXE): $(LIBSRCS) $(LIBHDRS) makefile + g++ $(CXXFLAGS) $(LIBSRCS) -o $(TENSOR_ALLOCATOR_EXE) diff --git a/ethosu/tensor_allocator/search_allocator.cpp b/ethosu/tensor_allocator/search_allocator.cpp new file mode 100644 index 00000000..ce5c46de --- /dev/null +++ b/ethosu/tensor_allocator/search_allocator.cpp @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2020 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Description: + * Implementation of the search-based allocator. + */ + +#include <algorithm> +#include <cstdint> +#include <set> +#include <vector> + +#include "search_allocator.h" + +SearchAllocator::SearchAllocator(const std::vector<LiveRange> &live_ranges, uint32_t size_limit) { + lrs = live_ranges; + uint32_t max_end_time = 0; + for (size_t i = 0; i < lrs.size(); ++i) { + auto &lr = lrs[i]; + lr.id = i; + max_end_time = std::max(max_end_time, lr.end_time); + } + lrs_at_time.resize(max_end_time + 1); + size_at_time.resize(max_end_time + 1); + neighbours.resize(lrs.size()); + // Calculate which live ranges are active at every timestamp + for (size_t t = 0; t <= max_end_time; ++t) { + lrs_at_time[t].clear(); + } + for (auto &lr : lrs) { + for (auto t = lr.start_time; t <= lr.end_time; ++t) { + lrs_at_time[t].push_back(&lr); + } + } + min_required_size = 0; + for (size_t t = 0; t <= max_end_time; ++t) { + // Calculate minimum needed size at each timestamp + uint32_t size_at_t = 0; + for (auto &lr : lrs_at_time[t]) { + size_at_t += lr->size; + } + size_at_time[t] = size_at_t; + min_required_size = std::max(size_at_t, min_required_size); + // Calculate all neighbours + for (size_t i = 0; i < lrs_at_time[t].size(); ++i) { + auto lr1 = lrs_at_time[t][i]; + auto &nb1 = neighbours[lr1->id]; + for (size_t j = i + 1; j < lrs_at_time[t].size(); ++j) { + auto lr2 = lrs_at_time[t][j]; + if (find(nb1.begin(), nb1.end(), lr2) == nb1.end()) { + nb1.push_back(lr2); + neighbours[lr2->id].push_back(lr1); + } + } + } + } + target_size = std::max(min_required_size, available_size); + // Calculate the urgency of each live range + lr_urgency.resize(lrs.size()); + for (size_t i = 0; i < lrs.size(); ++i) { + auto &lr = lrs[i]; + uint32_t urgency = 0; + for (size_t t = lr.start_time; t <= lr.end_time; ++t) { + urgency = std::max(size_at_time[t], urgency); + } + lr_urgency[i] = urgency; + } + best_size = UINT32_MAX; +} + +uint32_t SearchAllocator::allocate(std::vector<uint32_t> &output) { + output.clear(); + nr_iterations = 0; + std::vector<size_t> indices; + // Initial solution, using a heuristic allocator + for (size_t i = 0; i < lrs.size(); ++i) { + indices.push_back(i); + } + sort_indices_on_prio(indices); + // Allocate the initial solution + best_size = UINT32_MAX; + best_size = allocate_indices(indices); + if (best_size <= target_size) { + // The heuristic allocation returned an optimal solution. + // No need to search. + } else { + // Try to improve the heuristic allocation + search(indices, best_size, MAX_ITERATIONS); + } + output.clear(); + for (auto &lr : lrs) { + output.push_back(lr.address); + } + return best_size; +} + +void SearchAllocator::allocate_lr(LiveRange &lr) const { + uint32_t address = 0; + int predecessor = NO_PREDECESSOR; + bool fits = false; + while (!fits) { + fits = true; + // Find neighbours that overlap with address + for (auto lr2_p : neighbours[lr.id]) { + if (lr2_p->address == NOT_ALLOCATED || lr2_p->end_address <= address) { + continue; + } + if (lr2_p->overlaps(address, lr.size)) { + // Overlap found; increase address + fits = false; + address = lr2_p->end_address; + predecessor = lr2_p->id; + } + } + } + lr.address = address; + lr.end_address = address + lr.size; + lr.predecessor = predecessor; +} + +uint32_t SearchAllocator::allocate_indices(const std::vector<size_t> &indices) { + ++nr_iterations; + std::vector<size_t> count(indices.size()); + for (auto &lr : lrs) { + lr.address = NOT_ALLOCATED; + } + uint32_t size = 0; + for (size_t turn = 0; size <= best_size && turn < indices.size(); ++turn) { + auto &lr = lrs[indices[turn]]; + allocate_lr(lr); + lr.turn = turn; + size = std::max(size, lr.end_address); + } + return size; +} + +void SearchAllocator::sort_indices_on_prio(std::vector<size_t> &indices) const { + std::sort(indices.begin(), indices.end(), + [this] (size_t const& a, size_t const& b) { + if (lr_urgency[a] != lr_urgency[b]) { + return lr_urgency[a] > lr_urgency[b]; + } + auto &lr1 = lrs[a]; + auto &lr2 = lrs[b]; + auto duration1 = lr1.end_time - lr1.start_time; + auto duration2 = lr2.end_time - lr2.start_time; + if (duration1 != duration2) { + return duration1 > duration2; + } + if (lr1.start_time != lr2.start_time) { + return lr1.start_time < lr2.start_time; + } + if (lr1.size != lr2.size) { + return lr1.size > lr2.size; + } + return lr1.id < lr2.id; + }); +} + +void SearchAllocator::add_predecessor_turns(std::set<size_t> &turns, const LiveRange &lr) const { + turns.insert(lr.turn); + int id = lr.id; + while (lrs[id].predecessor != NO_PREDECESSOR) { + id = lrs[id].predecessor; + turns.insert(lrs[id].turn); + } +} + +void SearchAllocator::attempt_bottleneck_fix(std::vector<size_t> &indices) { + // Find the bottleneck + LiveRange *max_lr = &lrs[0]; + for (auto &lr : lrs) { + if (lr.end_address > max_lr->end_address) { + max_lr = &lr; + } + } + // Find all live ranges that affected the placement of the bottleneck live range. + // This consists of two types of live ranges: + // - direct neighbours of the bottleneck live range + // - direct and indirect predecessors of these neighbours + bottleneck + // The turns at which these live ranges were allocated are put in the turns vector. + std::set<size_t> turns; + add_predecessor_turns(turns, *max_lr); + for (auto lr_p : neighbours[max_lr->id]) { + add_predecessor_turns(turns, *lr_p); + } + // Non-direct neighbours that interfere with the allocation of the bottleneck are the + // immediate cause for gaps in the allocation, and are selected with higher probability. + std::vector<size_t> turn_list; + std::vector<size_t> non_nb_turn_list; + for (auto turn : turns) { + turn_list.push_back(turn); + auto &lr = lrs[indices[turn]]; + if (!max_lr->is_neighbour(lr)) { + non_nb_turn_list.push_back(turn); + } + } + size_t ix1; + if (rng() % 100 < 30 && !non_nb_turn_list.empty()) { + // Pick a live range from the "non-neighbour list" + ix1 = non_nb_turn_list[rng() % non_nb_turn_list.size()]; + } else { + // Pick any affecting live range. + ix1 = turn_list[rng() % turn_list.size()]; + } + size_t ix2 = turn_list[rng() % turn_list.size() - 1]; + if (ix1 == ix2) { + ix2 = turn_list[turn_list.size() - 1]; + } + // Swap indices + std::swap(indices[ix1], indices[ix2]); +} + +void SearchAllocator::search(std::vector<size_t> &indices, uint32_t initial_size, int iterations) { + std::vector<size_t> best_indices = indices; + std::vector<LiveRange> best_lrs = lrs; + for (int i = 0; i < iterations; ++i) { + // Reorder the indices + attempt_bottleneck_fix(indices); + // Allocate the reordered indices and check if it gave an improvement + auto new_size = allocate_indices(indices); + if (new_size <= best_size) { + // The new allocation produced a new best result; remember it + best_size = new_size; + best_indices = indices; + best_lrs = lrs; + if (best_size <= target_size) { + // Target reached; stop + return; + } + } else { + // The new allocation produced worse result; undo the change + indices = best_indices; + lrs = best_lrs; + } + } + lrs = best_lrs; +} + +uint32_t allocate(const std::vector<uint32_t> &input, int available_size, std::vector<uint32_t> &output) { + // Convert input to vector of live ranges + std::vector<LiveRange> lrs; + for (size_t i = 0; i < input.size(); i += 3) { + LiveRange lr; + lr.start_time = input[i]; + lr.end_time = input[i+1]; + lr.size = input[i+2]; + lrs.push_back(lr); + } + SearchAllocator allocator(lrs, available_size); + return allocator.allocate(output); +} diff --git a/ethosu/tensor_allocator/search_allocator.h b/ethosu/tensor_allocator/search_allocator.h new file mode 100644 index 00000000..0ef8688e --- /dev/null +++ b/ethosu/tensor_allocator/search_allocator.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2020 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Description: + * Declaration of the search-based allocator. + */ + +#ifndef __SEARCH_ALLOCATOR_H +#define __SEARCH_ALLOCATOR_H + +#include <algorithm> +#include <cstdint> +#include <random> +#include <set> +#include <vector> + +/** + * Live range + */ +struct LiveRange { + /** Start time (input to the allocator algorithm) */ + uint32_t start_time; + /** End time, inclusive (input to the allocator algorithm) */ + uint32_t end_time; + /** Size in bytes (input to the allocator algorithm) */ + uint32_t size; + /** Index of this live range */ + int id; + /** Allocated address (the main output from the allocator algorithm) */ + uint32_t address; + /** End address, exclusive */ + uint32_t end_address; + /** id of predecessor live range (predecessor's end address == this lr's address) */ + int predecessor; + /** Turn at which the live range was allocated */ + size_t turn; + + bool overlaps(uint32_t addr2, uint32_t size2) const { + return address < addr2 + size2 && addr2 < end_address; + } + bool is_neighbour(const LiveRange &lr) const { + return start_time <= lr.end_time && lr.start_time <= end_time; + } +}; + +/** + * Implements tensor allocator using state space exploration. + * + * The basic algorithm is: + * + * Use a heuristic allocator to find an initial allocation + * while allocation is not optimal and iterations < MAX_ITERATIONS { + * find the "bottleneck": the live range with highest end address + * find all live ranges that affected the allocation of the bottleneck + * swap the order of any two affecting live ranges + * reallocate tensors using the reordered live ranges + * if the new allocation is better: keep it, else set allocation to previous allocation + * } + */ +class SearchAllocator { +private: + static constexpr int MAX_ITERATIONS = 500; + static constexpr uint32_t NOT_ALLOCATED = UINT32_MAX; + /** Used for live ranges allocated at address 0 */ + static constexpr int NO_PREDECESSOR = -1; + /** Contains the live ranges */ + std::vector<LiveRange> lrs; + /** Contains active live ranges at each timestamp */ + std::vector<std::vector<LiveRange*>> lrs_at_time; + /** + * Contains neighbours of each live range (indexed by lr.id), i.e. + * live ranges with overlapping start/end time. + */ + std::vector<std::vector<LiveRange*>> neighbours; + /** + * At each timestamp: accumulated size of active live ranges + */ + std::vector<uint32_t> size_at_time; + /** + * For each live range: max value of size_at_time (only used in the heuristic allocation) + */ + std::vector<uint32_t> lr_urgency; + /** + * The minimum possible size, assuming all live ranges can be perfectly allocated + */ + uint32_t min_required_size; + /** + * The available size (input to algorithm). + */ + uint32_t available_size; + /** The algorithm stops once the target size has been achieved */ + uint32_t target_size; + /** The highest end address of the best found allocation */ + uint32_t best_size; + /** Number of performed iterations */ + size_t nr_iterations = 0; + /** Random number generator; use default seed (which is well-defined) */ + std::mt19937 rng; +public: + SearchAllocator(const std::vector<LiveRange> &live_ranges, uint32_t size_limit); + /** + * Runs the allocation algorithm. Finishes when the target size has been + * reached or when maximum iterations have been run. + * The allocated addresses are placed in the output vector, in the same + * order as the input vector. + * + * Implementation note: the algorithm produces reproduceable results by using + * a well-defined random number generator with well-defined default seed, + * and using a fixed number of iterations. + */ + uint32_t allocate(std::vector<uint32_t> &output); + uint32_t get_min_required_size() const { + return min_required_size; + } + size_t get_nr_iterations() const { + return nr_iterations; + } +private: + /** + * Allocates the given live range at the smallest possible address + */ + void allocate_lr(LiveRange &lr) const; + /** + * Allocates the live ranges in the order indicated by the indices; + * allocates each live range at the lowest possible address. + */ + uint32_t allocate_indices(const std::vector<size_t> &indices); + /** Sorts live ranges based on heuristics, used for the initial allocation */ + void sort_indices_on_prio(std::vector<size_t> &indices) const; + /** Adds the given live range + predecessors to the turns vector */ + void add_predecessor_turns(std::set<size_t> &turns, const LiveRange &lr) const; + /** + * Finds the "bottleneck", the live range with highest end address, and reorders the indices + * such that a next allocation might lower the memory usage. + * + * --------- + * | | + * | D | + * | | + * ---------------------------------- + * | B | + * ------------------------------- + * | | + * |A| --- + * | | |C| + * | | | | + * --------------------------------------- + * + * In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck. + * The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C), + * and all direct and indirect predecessors of D and its neighbours + * (i.e. A, which is the predecessor of B, and indirect predecessor of D). + * + * By permuting the order in which the affecting live ranges are allocated, the bottleneck might + * be lowered. In the above example, almost any permutation would lower the bottleneck. + * + * Note that there is room to improve the efficiency of the algorithm. + * One way could be to first allocate all direct neighbours of the bottleneck + * (i.e. B, C, D) and then the other affecting live ranges (i.e. A). The algorithm currently does + * not actively try this, as it may lead to allocation loops (A could become the new bottle-neck); + * it just uses a higher probability of selecting A. + */ + void attempt_bottleneck_fix(std::vector<size_t> &indices); + /** Search for a solution, using the given indices as initial solution. */ + void search(std::vector<size_t> &indices, uint32_t initial_size, int iterations); +}; + +/** Wrapper function to perform live range allocation */ +uint32_t allocate(const std::vector<uint32_t> &input, int available_size, std::vector<uint32_t> &output); + +#endif // __SEARCH_ALLOCATOR_H diff --git a/ethosu/tensor_allocator/tensor_allocator_main.cpp b/ethosu/tensor_allocator/tensor_allocator_main.cpp new file mode 100644 index 00000000..27d96ef7 --- /dev/null +++ b/ethosu/tensor_allocator/tensor_allocator_main.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2020 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstdint> +#include <iostream> +#include <vector> + +#include "search_allocator.h" + +using namespace std; + +/** + * Reads live ranges from the input, and then performs allocation. + * The input has format: + +<number of live ranges> +<start_time> <end_time> <size> +... + + * e.g.: +4 +0 20 4096 +2 8 16000 +4 10 800 +6 20 1024 + */ +int main() { + int lr_count; + cin >> lr_count; + cin.ignore(); + vector<uint32_t> input; + vector<uint32_t> output; + for (int i = 0; i < lr_count; ++i) { + LiveRange lr; + cin >> lr.start_time >> lr.end_time >> lr.size; + lr.id = i; + cin.ignore(); + input.push_back(lr.start_time); + input.push_back(lr.end_time); + input.push_back(lr.size); + } + vector<LiveRange> lrs; + for (size_t i = 0; i < input.size(); i += 3) { + LiveRange lr; + lr.start_time = input[i]; + lr.end_time = input[i+1]; + lr.size = input[i+2]; + lrs.push_back(lr); + } + SearchAllocator allocator(lrs, 0); + uint32_t result = allocator.allocate(output); + printf("Output:\n"); + for (int i = 0; i < lr_count; ++i) { + printf("%4d: %6d %4d-%4d size %6d\n", i, output[i], input[3*i], input[3*i+1], input[3*i+2]); + } + uint32_t min_size = allocator.get_min_required_size(); + double overhead = 100.0 * (result - min_size)/(double)min_size; + printf("Total size: %d (%1.1f K), minimum required size: %d, overhead: %1.2f%%\n", + result, result/1024.0, min_size, overhead); + printf("Search used %ld iterations\n", (long)allocator.get_nr_iterations()); + return 0; +} diff --git a/ethosu/tensor_allocator/tensor_allocatormodule.cpp b/ethosu/tensor_allocator/tensor_allocatormodule.cpp new file mode 100644 index 00000000..79ee95ad --- /dev/null +++ b/ethosu/tensor_allocator/tensor_allocatormodule.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2020 Arm Limited. All rights reserved. + * + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the License); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an AS IS BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define PY_SSIZE_T_CLEAN +#include <Python.h> + +#include <vector> +#include "search_allocator.h" + + + +/** + * C++ extension wrapper for allocate + * + * This method is exposed directly in python with the arguments with a + * prototype of the form: + * + * output = tensor_allocator.allocate(input, available_size=0) + * + * input: [int] + * available_size: int + * output: [int] + */ +static PyObject *method_allocate (PyObject *self, PyObject *args) +{ + /* Object to hold the input integer list. */ + PyObject *input_list_object; + + /* Object to hold the available size */ + int available_size = 0; + + /* Arguments to the method are delivered as a tuple, unpack the + * tuple to get the individual arguments, note the second is + * optional. + */ + if (!PyArg_ParseTuple(args, "O|i", &input_list_object, &available_size)) { + return NULL; + } + + /* Unpack the length of the input integer list. */ + int input_length = PyObject_Length (input_list_object); + if (input_length < 0) { + input_length = 0; + } + std::vector<uint32_t> input; + std::vector<uint32_t> output; + for (int i = 0; i < input_length; ++i) { + PyObject *obj = PyList_GetItem(input_list_object, i); + uint32_t value = (uint32_t)PyLong_AsLong(obj); + input.push_back(value); + } + allocate(input, available_size, output); + PyObject *output_list = PyList_New(output.size()); + for (size_t i = 0; i < output.size(); ++i) { + PyList_SetItem(output_list, i, PyLong_FromLong(output[i])); + } + return output_list; +} + +/** tensor_allocator method descriptors. */ +static PyMethodDef tensor_allocator_methods[] = { + {"allocate", method_allocate, METH_VARARGS, "Python interface for allocate"}, + {NULL, NULL, 0, NULL} +}; + +/** tensor_allocator module descriptor. */ +static struct PyModuleDef tensor_allocatormodule = { + PyModuleDef_HEAD_INIT, + "tensor_allocator", + "Python interface for tensor_allocator", + -1, + tensor_allocator_methods +}; + +PyMODINIT_FUNC PyInit_tensor_allocator(void) { + return PyModule_Create(&tensor_allocatormodule); +} diff --git a/ethosu/tensor_allocator/test/test_tensor_allocator.py b/ethosu/tensor_allocator/test/test_tensor_allocator.py new file mode 100644 index 00000000..b9b547fc --- /dev/null +++ b/ethosu/tensor_allocator/test/test_tensor_allocator.py @@ -0,0 +1,30 @@ +# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Description: +# Unit tests for tensor_allocator. +from ethosu import tensor_allocator + + +def test_allocate(): + """Tests the search allocator""" + # Create an input that requires a search to produce a perfect allocation. + # Should result in size 16016 + lrs = [(0, 100, 8000), (0, 1, 8016), (100, 110, 2000), (108, 110, 4000), (109, 110, 6000)] + input = [x for lr in lrs for x in lr] + res = tensor_allocator.allocate(input, 0) + assert len(res) == len(lrs) + assert max(addr + lr[2] for addr, lr in zip(res, lrs)) == 16016 diff --git a/ethosu/vela/greedy_allocation.py b/ethosu/vela/greedy_allocation.py index 58d948c2..b0395def 100644 --- a/ethosu/vela/greedy_allocation.py +++ b/ethosu/vela/greedy_allocation.py @@ -16,7 +16,6 @@ # Description: # Allocate tensor addresses using a greedy algorithm. from . import numeric_util -from .errors import AllocationError class GreedyAllocator: @@ -70,33 +69,8 @@ class GreedyAllocator: self.alloc(new_lr) - self.verify_allocation(alignment) return self.memory_required - def verify_allocation(self, alignment): - lrs = list(self.live_ranges.ranges.values()) - for n in lrs: - for tens in n.tensors: - if not all(op and op.run_on_npu for op in tens.ops + tens.consumer_list): - # This is a CPU tensor, verify alignment - if tens.address % alignment != 0: - raise AllocationError("Tensor {} not aligned to {} bytes".format(tens.name, alignment)) - - for m in lrs: - if n != m and n.overlaps_ranges(m): - overlap, tens_n, tens_m = n.overlaps_address(m) - if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address): - raise AllocationError( - "Overlapping buffers: {}: {} -> {} and {}: {} -> {}".format( - n.name, - tens_n.address, - tens_n.address + n.size, - m.name, - tens_m.address, - tens_m.address + m.size, - ) - ) - def allocate_live_ranges(nng, arch, live_ranges, mem_area, alignment, verbose_allocation=False): g = GreedyAllocator(nng, arch, live_ranges, mem_area) diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py index b2877851..0ae3de9a 100644 --- a/ethosu/vela/nn_graph.py +++ b/ethosu/vela/nn_graph.py @@ -36,6 +36,7 @@ class PassPlacement(enum.Enum): class TensorAllocator(enum.Enum): LinearAlloc = 1 Greedy = 2 + Search = 3 def __str__(self): return self.name diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py index d1a33728..9736ca22 100644 --- a/ethosu/vela/tensor_allocation.py +++ b/ethosu/vela/tensor_allocation.py @@ -24,11 +24,13 @@ from . import live_range from . import numeric_util from .errors import AllocationError from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges +from .live_range import LiveRangeGraph from .nn_graph import TensorAllocator from .tensor import MemArea from .tensor import MemType from .tensor import Tensor from .tensor import TensorPurpose +from ethosu import tensor_allocator def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum): @@ -61,7 +63,29 @@ def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.Allocation return total_sz -def verify_alignment(live_ranges, alignment): +def search_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int: + # Allocates using the search-based allocator (implemented in C++) + input = [] + lrs = [] + lr_set = set() + for lr in live_ranges.ranges.values(): + lr_set.add((lr.start_time, lr.end_time, lr)) + lr_list = sorted(lr_set) + # Create a single array of ints containing start/end/size of the live ranges + for start, end, lr in lr_list: + input += [start, end, numeric_util.round_up(lr.size, alloc_granularity)] + lrs.append(lr) + addresses = tensor_allocator.allocate(input, 0) + # The result is a list containing the allocated addresses + total_sz = 0 + for lr, address in zip(lrs, addresses): + total_sz = max(total_sz, address + lr.size) + lr.set_address(address) + verify_allocation(live_ranges, alloc_granularity) + return total_sz + + +def verify_alignment(live_ranges: LiveRangeGraph, alignment: int): for lr in live_ranges.ranges.values(): for tens in lr.tensors: if not all(op and op.run_on_npu for op in tens.ops + tens.consumer_list): @@ -70,6 +94,27 @@ def verify_alignment(live_ranges, alignment): raise AllocationError("Tensor {} not aligned to {} bytes".format(tens.name, alignment)) +def verify_allocation(live_ranges: LiveRangeGraph, alignment: int): + lrs = list(live_ranges.ranges.values()) + for n in lrs: + verify_alignment(live_ranges, alignment) + + for m in lrs: + if n != m and n.overlaps_ranges(m): + overlap, tens_n, tens_m = n.overlaps_address(m) + if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address): + raise AllocationError( + "Overlapping buffers: {}: {} -> {} and {}: {} -> {}".format( + n.name, + tens_n.address, + tens_n.address + n.size, + m.name, + tens_m.address, + tens_m.address + m.size, + ) + ) + + def mark_sram_used_for_cascaded_passes(sg, lrs): end_pos = max(ps.time for ps in sg.cascaded_passes) + 2 mem_usage = np.zeros(end_pos, dtype=np.int64) @@ -137,8 +182,11 @@ def allocate_tensors( tens_alloc = tensor_allocator if tens_alloc == TensorAllocator.Greedy: total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment, verbose_allocation) + verify_allocation(lrs, cpu_tensor_alignment) elif tens_alloc == TensorAllocator.LinearAlloc: total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment) + elif tens_alloc == TensorAllocator.Search: + total_sz = search_allocate_live_ranges(lrs, cpu_tensor_alignment) else: assert 0 alloc_ok = max_size is None or total_sz <= max_size diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py index 37de1ed2..d27eef0e 100644 --- a/ethosu/vela/vela.py +++ b/ethosu/vela/vela.py @@ -282,7 +282,7 @@ def main(args=None): ) parser.add_argument( "--tensor-allocator", - default=TensorAllocator.Greedy, + default=TensorAllocator.Search, type=lambda s: TensorAllocator[s], choices=list(TensorAllocator), help="Tensor Allocator algorithm (default: %(default)s)", |