From d70025250fc49997801ea3a6ce83f2fa29a09d78 Mon Sep 17 00:00:00 2001 From: Louis Verhaard Date: Wed, 20 Jan 2021 17:23:54 +0100 Subject: MLBEDSW-3808: Ported search allocator to python - Straight port of the C++ implementation to python. - Renamed the allocator from "Search" to "HillClimb" Change-Id: I50797d541f326d0264daf79bf7866aef32350a60 Signed-off-by: Louis Verhaard --- OPTIONS.md | 4 +- README.md | 6 +- ethosu/tensor_allocator/makefile | 50 ---- ethosu/tensor_allocator/search_allocator.cpp | 267 ------------------ ethosu/tensor_allocator/search_allocator.h | 181 ------------ ethosu/tensor_allocator/tensor_allocator_main.cpp | 77 ----- ethosu/tensor_allocator/tensor_allocatormodule.cpp | 105 ------- .../tensor_allocator/test/test_tensor_allocator.py | 63 ----- ethosu/vela/hillclimb_allocation.py | 310 +++++++++++++++++++++ ethosu/vela/nn_graph.py | 2 +- ethosu/vela/tensor_allocation.py | 28 +- ethosu/vela/test/test_hillclimb_allocation.py | 63 +++++ ethosu/vela/vela.py | 2 +- setup.py | 9 +- 14 files changed, 391 insertions(+), 776 deletions(-) delete mode 100644 ethosu/tensor_allocator/makefile delete mode 100644 ethosu/tensor_allocator/search_allocator.cpp delete mode 100644 ethosu/tensor_allocator/search_allocator.h delete mode 100644 ethosu/tensor_allocator/tensor_allocator_main.cpp delete mode 100644 ethosu/tensor_allocator/tensor_allocatormodule.cpp delete mode 100644 ethosu/tensor_allocator/test/test_tensor_allocator.py create mode 100644 ethosu/vela/hillclimb_allocation.py create mode 100644 ethosu/vela/test/test_hillclimb_allocation.py diff --git a/OPTIONS.md b/OPTIONS.md index e191d4ee..1293f17d 100644 --- a/OPTIONS.md +++ b/OPTIONS.md @@ -183,8 +183,8 @@ vela network.tflite --config my_vela_cfg.ini --memory-mode My_Mem_Mode Specify which allocator algorithm to use for non-constant NPU and CPU tensor allocation. **Type: String** -**Default: Search** -**Choices: [Greedy, LinearAlloc, Search]** +**Default: HillClimb** +**Choices: [Greedy, LinearAlloc, HillClimb]** ```bash vela network.tflite --tensor-allocator=LinearAlloc diff --git a/README.md b/README.md index 81bb0a08..1d6d073e 100644 --- a/README.md +++ b/README.md @@ -54,9 +54,9 @@ source code from [ML Platform](https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ethos-u-vela). Both methods will automatically install all the required dependencies. -**Note:** For installing on Microsoft Windows you need to have a C99 and C++11 -capable toolchain installed. The recommended and tested toolchain is Microsoft -Visual C++ 14.x Build Tools, see +**Note:** For installing on Microsoft Windows you need to have a C99 capable +toolchain installed. The recommended and tested toolchain is Microsoft Visual +C++ 14.x Build Tools, see ### PyPi diff --git a/ethosu/tensor_allocator/makefile b/ethosu/tensor_allocator/makefile deleted file mode 100644 index 9636eb94..00000000 --- a/ethosu/tensor_allocator/makefile +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -# Description: -# Makefile to build tensor_allocator_main - -UNAME=$(shell uname -o) - -CXXFLAGS=--std=c++11 -pedantic-errors -Wall -Werror -Wdate-time -CXXFLAGS+=-fwrapv -fstack-protector-strong -flto -fuse-linker-plugin -ffat-lto-objects -fPIC - -ifeq ($(DEBUG),1) - CXXFLAGS+=-g -O0 -DDEBUG -else - CXXFLAGS+=-O2 -endif - -LIBSRCS=tensor_allocator_main.cpp search_allocator.cpp -LIBHDRS=search_allocator.h - -ifeq ($(UNAME),Cygwin) - TENSOR_ALLOCATOR_EXE=tensor_allocator_main.exe -else - TENSOR_ALLOCATOR_EXE=tensor_allocator_main -endif - -all: tensor_allocator_exe - -.PHONY: tensor_allocator_exe -tensor_allocator_exe: $(TENSOR_ALLOCATOR_EXE) - -clean: - rm -f $(TENSOR_ALLOCATOR_EXE) - -$(TENSOR_ALLOCATOR_EXE): $(LIBSRCS) $(LIBHDRS) makefile - g++ $(CXXFLAGS) $(LIBSRCS) -o $(TENSOR_ALLOCATOR_EXE) diff --git a/ethosu/tensor_allocator/search_allocator.cpp b/ethosu/tensor_allocator/search_allocator.cpp deleted file mode 100644 index 5c7492b4..00000000 --- a/ethosu/tensor_allocator/search_allocator.cpp +++ /dev/null @@ -1,267 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * Description: - * Implementation of the search-based allocator. - */ - -#include -#include -#include -#include - -#include "search_allocator.h" - -SearchAllocator::SearchAllocator(const std::vector &live_ranges, uint32_t size_limit) { - lrs = live_ranges; - uint32_t max_end_time = 0; - for (size_t i = 0; i < lrs.size(); ++i) { - auto &lr = lrs[i]; - lr.id = static_cast(i); - max_end_time = std::max(max_end_time, lr.end_time); - } - lrs_at_time.resize(max_end_time + 1); - size_at_time.resize(max_end_time + 1); - neighbours.resize(lrs.size()); - // Calculate which live ranges are active at every timestamp - for (size_t t = 0; t <= max_end_time; ++t) { - lrs_at_time[t].clear(); - } - for (auto &lr : lrs) { - for (auto t = lr.start_time; t <= lr.end_time; ++t) { - lrs_at_time[t].push_back(&lr); - } - } - min_required_size = 0; - for (size_t t = 0; t <= max_end_time; ++t) { - // Calculate minimum needed size at each timestamp - uint32_t size_at_t = 0; - for (auto &lr : lrs_at_time[t]) { - size_at_t += lr->size; - } - size_at_time[t] = size_at_t; - min_required_size = std::max(size_at_t, min_required_size); - // Calculate all neighbours - for (size_t i = 0; i < lrs_at_time[t].size(); ++i) { - auto lr1 = lrs_at_time[t][i]; - auto &nb1 = neighbours[lr1->id]; - for (size_t j = i + 1; j < lrs_at_time[t].size(); ++j) { - auto lr2 = lrs_at_time[t][j]; - if (find(nb1.begin(), nb1.end(), lr2) == nb1.end()) { - nb1.push_back(lr2); - neighbours[lr2->id].push_back(lr1); - } - } - } - } - target_size = std::max(min_required_size, size_limit); - // Calculate the urgency of each live range - lr_urgency.resize(lrs.size()); - for (size_t i = 0; i < lrs.size(); ++i) { - auto &lr = lrs[i]; - uint32_t urgency = 0; - for (size_t t = lr.start_time; t <= lr.end_time; ++t) { - urgency = std::max(size_at_time[t], urgency); - } - lr_urgency[i] = urgency; - } - best_size = UINT32_MAX; -} - -uint32_t SearchAllocator::allocate(std::vector &output) { - output.clear(); - nr_iterations = 0; - std::vector indices; - // Initial solution, using a heuristic allocator - for (size_t i = 0; i < lrs.size(); ++i) { - indices.push_back(i); - } - sort_indices_on_prio(indices); - // Allocate the initial solution - best_size = UINT32_MAX; - best_size = allocate_indices(indices); - if (best_size <= target_size) { - // The heuristic allocation returned an optimal solution. - // No need to search. - } else { - // Try to improve the heuristic allocation - search(indices, best_size, MAX_ITERATIONS); - } - output.clear(); - for (auto &lr : lrs) { - output.push_back(lr.address); - } - return best_size; -} - -void SearchAllocator::allocate_lr(LiveRange &lr) const { - uint32_t address = 0; - int predecessor = NO_PREDECESSOR; - bool fits = false; - while (!fits) { - fits = true; - // Find neighbours that overlap with address - for (auto lr2_p : neighbours[lr.id]) { - if (lr2_p->address == NOT_ALLOCATED || lr2_p->end_address <= address) { - continue; - } - if (lr2_p->overlaps(address, lr.size)) { - // Overlap found; increase address - fits = false; - address = lr2_p->end_address; - predecessor = lr2_p->id; - } - } - } - lr.address = address; - lr.end_address = address + lr.size; - lr.predecessor = predecessor; -} - -uint32_t SearchAllocator::allocate_indices(const std::vector &indices) { - ++nr_iterations; - std::vector count(indices.size()); - for (auto &lr : lrs) { - lr.address = NOT_ALLOCATED; - } - uint32_t size = 0; - for (size_t turn = 0; size <= best_size && turn < indices.size(); ++turn) { - auto &lr = lrs[indices[turn]]; - allocate_lr(lr); - lr.turn = turn; - size = std::max(size, lr.end_address); - } - return size; -} - -void SearchAllocator::sort_indices_on_prio(std::vector &indices) const { - std::sort(indices.begin(), indices.end(), - [this] (size_t const& a, size_t const& b) { - if (lr_urgency[a] != lr_urgency[b]) { - return lr_urgency[a] > lr_urgency[b]; - } - auto &lr1 = lrs[a]; - auto &lr2 = lrs[b]; - auto duration1 = lr1.end_time - lr1.start_time; - auto duration2 = lr2.end_time - lr2.start_time; - if (duration1 != duration2) { - return duration1 > duration2; - } - if (lr1.start_time != lr2.start_time) { - return lr1.start_time < lr2.start_time; - } - if (lr1.size != lr2.size) { - return lr1.size > lr2.size; - } - return lr1.id < lr2.id; - }); -} - -void SearchAllocator::add_predecessor_turns(std::set &turns, const LiveRange &lr) const { - turns.insert(lr.turn); - int id = lr.id; - while (lrs[id].predecessor != NO_PREDECESSOR) { - id = lrs[id].predecessor; - turns.insert(lrs[id].turn); - } -} - -void SearchAllocator::attempt_bottleneck_fix(std::vector &indices) { - // Find the bottleneck - LiveRange *max_lr = &lrs[0]; - for (auto &lr : lrs) { - if (lr.end_address > max_lr->end_address) { - max_lr = &lr; - } - } - // Find all live ranges that affected the placement of the bottleneck live range. - // This consists of two types of live ranges: - // - direct neighbours of the bottleneck live range - // - direct and indirect predecessors of these neighbours + bottleneck - // The turns at which these live ranges were allocated are put in the turns vector. - std::set turns; - add_predecessor_turns(turns, *max_lr); - for (auto lr_p : neighbours[max_lr->id]) { - add_predecessor_turns(turns, *lr_p); - } - // Non-direct neighbours that interfere with the allocation of the bottleneck are the - // immediate cause for gaps in the allocation, and are selected with higher probability. - std::vector turn_list; - std::vector non_nb_turn_list; - for (auto turn : turns) { - turn_list.push_back(turn); - auto &lr = lrs[indices[turn]]; - if (!max_lr->is_neighbour(lr)) { - non_nb_turn_list.push_back(turn); - } - } - size_t ix1; - if (rng() % 100 < 30 && !non_nb_turn_list.empty()) { - // Pick a live range from the "non-neighbour list" - ix1 = non_nb_turn_list[rng() % non_nb_turn_list.size()]; - } else { - // Pick any affecting live range. - ix1 = turn_list[rng() % turn_list.size()]; - } - // Note: turn_list has always at least 2 elements for bottlenecks - size_t ix2 = turn_list[rng() % (turn_list.size() - 1)]; - if (ix1 == ix2) { - ix2 = turn_list[turn_list.size() - 1]; - } - // Swap indices - std::swap(indices[ix1], indices[ix2]); -} - -void SearchAllocator::search(std::vector &indices, uint32_t initial_size, int iterations) { - std::vector best_indices = indices; - std::vector best_lrs = lrs; - for (int i = 0; i < iterations; ++i) { - // Reorder the indices - attempt_bottleneck_fix(indices); - // Allocate the reordered indices and check if it gave an improvement - auto new_size = allocate_indices(indices); - if (new_size <= best_size) { - // The new allocation produced a new best result; remember it - best_size = new_size; - best_indices = indices; - best_lrs = lrs; - if (best_size <= target_size) { - // Target reached; stop - return; - } - } else { - // The new allocation produced worse result; undo the change - indices = best_indices; - lrs = best_lrs; - } - } - lrs = best_lrs; -} - -uint32_t allocate(const std::vector &input, int available_size, std::vector &output) { - // Convert input to vector of live ranges - std::vector lrs; - for (size_t i = 0; i < input.size(); i += 3) { - LiveRange lr; - lr.start_time = input[i]; - lr.end_time = input[i+1]; - lr.size = input[i+2]; - lrs.push_back(lr); - } - SearchAllocator allocator(lrs, available_size); - return allocator.allocate(output); -} diff --git a/ethosu/tensor_allocator/search_allocator.h b/ethosu/tensor_allocator/search_allocator.h deleted file mode 100644 index 6c750151..00000000 --- a/ethosu/tensor_allocator/search_allocator.h +++ /dev/null @@ -1,181 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * Description: - * Declaration of the search-based allocator. - */ - -#ifndef __SEARCH_ALLOCATOR_H -#define __SEARCH_ALLOCATOR_H - -#include -#include -#include -#include -#include - -/** - * Live range - */ -struct LiveRange { - /** Start time (input to the allocator algorithm) */ - uint32_t start_time; - /** End time, inclusive (input to the allocator algorithm) */ - uint32_t end_time; - /** Size in bytes (input to the allocator algorithm) */ - uint32_t size; - /** Index of this live range */ - int id; - /** Allocated address (the main output from the allocator algorithm) */ - uint32_t address; - /** End address, exclusive */ - uint32_t end_address; - /** id of predecessor live range (predecessor's end address == this lr's address) */ - int predecessor; - /** Turn at which the live range was allocated */ - size_t turn; - - bool overlaps(uint32_t addr2, uint32_t size2) const { - return address < addr2 + size2 && addr2 < end_address; - } - bool is_neighbour(const LiveRange &lr) const { - return start_time <= lr.end_time && lr.start_time <= end_time; - } -}; - -/** - * Implements tensor allocator using state space exploration. - * - * The basic algorithm is: - * - * Use a heuristic allocator to find an initial allocation - * while allocation is not optimal and iterations < MAX_ITERATIONS { - * find the "bottleneck": the live range with highest end address - * find all live ranges that affected the allocation of the bottleneck - * swap the order of any two affecting live ranges - * reallocate tensors using the reordered live ranges - * if the new allocation is better: keep it, else set allocation to previous allocation - * } - */ -class SearchAllocator { -private: - static constexpr int MAX_ITERATIONS = 500; - static constexpr uint32_t NOT_ALLOCATED = UINT32_MAX; - /** Used for live ranges allocated at address 0 */ - static constexpr int NO_PREDECESSOR = -1; - /** Contains the live ranges */ - std::vector lrs; - /** Contains active live ranges at each timestamp */ - std::vector> lrs_at_time; - /** - * Contains neighbours of each live range (indexed by lr.id), i.e. - * live ranges with overlapping start/end time. - */ - std::vector> neighbours; - /** - * At each timestamp: accumulated size of active live ranges - */ - std::vector size_at_time; - /** - * For each live range: max value of size_at_time (only used in the heuristic allocation) - */ - std::vector lr_urgency; - /** - * The minimum possible size, assuming all live ranges can be perfectly allocated - */ - uint32_t min_required_size; - /** The algorithm stops once the target size has been achieved */ - uint32_t target_size; - /** The highest end address of the best found allocation */ - uint32_t best_size; - /** Number of performed iterations */ - size_t nr_iterations = 0; - /** Random number generator; use default seed (which is well-defined) */ - std::mt19937 rng; -public: - SearchAllocator(const std::vector &live_ranges, uint32_t size_limit); - /** - * Runs the allocation algorithm. Finishes when the target size has been - * reached or when maximum iterations have been run. - * The allocated addresses are placed in the output vector, in the same - * order as the input vector. - * - * Implementation note: the algorithm produces reproduceable results by using - * a well-defined random number generator with well-defined default seed, - * and using a fixed number of iterations. - */ - uint32_t allocate(std::vector &output); - uint32_t get_min_required_size() const { - return min_required_size; - } - size_t get_nr_iterations() const { - return nr_iterations; - } -private: - /** - * Allocates the given live range at the smallest possible address - */ - void allocate_lr(LiveRange &lr) const; - /** - * Allocates the live ranges in the order indicated by the indices; - * allocates each live range at the lowest possible address. - */ - uint32_t allocate_indices(const std::vector &indices); - /** Sorts live ranges based on heuristics, used for the initial allocation */ - void sort_indices_on_prio(std::vector &indices) const; - /** Adds the given live range + predecessors to the turns vector */ - void add_predecessor_turns(std::set &turns, const LiveRange &lr) const; - /** - * Finds the "bottleneck", the live range with highest end address, and reorders the indices - * such that a next allocation might lower the memory usage. - * - * --------- - * | | - * | D | - * | | - * ---------------------------------- - * | B | - * ------------------------------- - * | | - * |A| --- - * | | |C| - * | | | | - * --------------------------------------- - * - * In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck. - * The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C), - * and all direct and indirect predecessors of D and its neighbours - * (i.e. A, which is the predecessor of B, and indirect predecessor of D). - * - * By permuting the order in which the affecting live ranges are allocated, the bottleneck might - * be lowered. In the above example, almost any permutation would lower the bottleneck. - * - * Note that there is room to improve the efficiency of the algorithm. - * One way could be to first allocate all direct neighbours of the bottleneck - * (i.e. B, C, D) and then the other affecting live ranges (i.e. A). The algorithm currently does - * not actively try this, as it may lead to allocation loops (A could become the new bottle-neck); - * it just uses a higher probability of selecting A. - */ - void attempt_bottleneck_fix(std::vector &indices); - /** Search for a solution, using the given indices as initial solution. */ - void search(std::vector &indices, uint32_t initial_size, int iterations); -}; - -/** Wrapper function to perform live range allocation */ -uint32_t allocate(const std::vector &input, int available_size, std::vector &output); - -#endif // __SEARCH_ALLOCATOR_H diff --git a/ethosu/tensor_allocator/tensor_allocator_main.cpp b/ethosu/tensor_allocator/tensor_allocator_main.cpp deleted file mode 100644 index 27d96ef7..00000000 --- a/ethosu/tensor_allocator/tensor_allocator_main.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include - -#include "search_allocator.h" - -using namespace std; - -/** - * Reads live ranges from the input, and then performs allocation. - * The input has format: - - - -... - - * e.g.: -4 -0 20 4096 -2 8 16000 -4 10 800 -6 20 1024 - */ -int main() { - int lr_count; - cin >> lr_count; - cin.ignore(); - vector input; - vector output; - for (int i = 0; i < lr_count; ++i) { - LiveRange lr; - cin >> lr.start_time >> lr.end_time >> lr.size; - lr.id = i; - cin.ignore(); - input.push_back(lr.start_time); - input.push_back(lr.end_time); - input.push_back(lr.size); - } - vector lrs; - for (size_t i = 0; i < input.size(); i += 3) { - LiveRange lr; - lr.start_time = input[i]; - lr.end_time = input[i+1]; - lr.size = input[i+2]; - lrs.push_back(lr); - } - SearchAllocator allocator(lrs, 0); - uint32_t result = allocator.allocate(output); - printf("Output:\n"); - for (int i = 0; i < lr_count; ++i) { - printf("%4d: %6d %4d-%4d size %6d\n", i, output[i], input[3*i], input[3*i+1], input[3*i+2]); - } - uint32_t min_size = allocator.get_min_required_size(); - double overhead = 100.0 * (result - min_size)/(double)min_size; - printf("Total size: %d (%1.1f K), minimum required size: %d, overhead: %1.2f%%\n", - result, result/1024.0, min_size, overhead); - printf("Search used %ld iterations\n", (long)allocator.get_nr_iterations()); - return 0; -} diff --git a/ethosu/tensor_allocator/tensor_allocatormodule.cpp b/ethosu/tensor_allocator/tensor_allocatormodule.cpp deleted file mode 100644 index 52f1c690..00000000 --- a/ethosu/tensor_allocator/tensor_allocatormodule.cpp +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2020 Arm Limited. All rights reserved. - * - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the License); you may - * not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an AS IS BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#define PY_SSIZE_T_CLEAN -#include -#include - -#include -#include "search_allocator.h" - - - -/** - * C++ extension wrapper for allocate - * - * This method is exposed directly in python with the arguments with a - * prototype of the form: - * - * output = tensor_allocator.allocate(input, available_size=0) - * - * input: [int] - * available_size: int - * output: [int] - */ -static PyObject *method_allocate (PyObject *self, PyObject *args) -{ - /* Object to hold the input integer list. */ - PyObject *input_list_object; - - /* Object to hold the available size */ - int available_size = 0; - - /* Arguments to the method are delivered as a tuple, unpack the - * tuple to get the individual arguments, note the second is - * optional. - */ - if (!PyArg_ParseTuple(args, "O|i", &input_list_object, &available_size)) { - return NULL; - } - - /* Unpack the length of the input integer list. */ - auto input_length = PyObject_Length(input_list_object); - if (input_length < 0) { - return NULL; - } - if (input_length % 3 != 0) { - PyErr_SetString(PyExc_ValueError, "Input length must be multiple of 3"); - return NULL; - } - std::vector input; - std::vector output; - for (int i = 0; i < input_length; ++i) { - PyObject *obj = PyList_GetItem(input_list_object, i); - if (!PyLong_Check(obj)) { - PyErr_SetString(PyExc_ValueError, "Illegal value in input"); - return NULL; - } - auto value = PyLong_AsLong(obj); - if (value < 0 || value > UINT32_MAX) { - PyErr_SetString(PyExc_ValueError, "Input value out of bounds"); - return NULL; - } - input.push_back(value); - } - allocate(input, available_size, output); - PyObject *output_list = PyList_New(output.size()); - for (size_t i = 0; i < output.size(); ++i) { - PyList_SetItem(output_list, i, PyLong_FromLong(output[i])); - } - return output_list; -} - -/** tensor_allocator method descriptors. */ -static PyMethodDef tensor_allocator_methods[] = { - {"allocate", method_allocate, METH_VARARGS, "Python interface for allocate"}, - {NULL, NULL, 0, NULL} -}; - -/** tensor_allocator module descriptor. */ -static struct PyModuleDef tensor_allocatormodule = { - PyModuleDef_HEAD_INIT, - "tensor_allocator", - "Python interface for tensor_allocator", - -1, - tensor_allocator_methods -}; - -PyMODINIT_FUNC PyInit_tensor_allocator(void) { - return PyModule_Create(&tensor_allocatormodule); -} diff --git a/ethosu/tensor_allocator/test/test_tensor_allocator.py b/ethosu/tensor_allocator/test/test_tensor_allocator.py deleted file mode 100644 index 1011279c..00000000 --- a/ethosu/tensor_allocator/test/test_tensor_allocator.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. -# -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the License); you may -# not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an AS IS BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Description: -# Unit tests for tensor_allocator. -import pytest - -from ethosu import tensor_allocator - -test_data = [ - ([(0, 100, 8000), (0, 1, 8016), (100, 110, 2000), (108, 110, 4000), (109, 110, 6000)], 16016), - ( - [ - (0, 23, 131072), - (4, 5, 65568), - (4, 9, 8192), - (8, 30, 15360), - (10, 11, 65568), - (10, 15, 4096), - (16, 17, 65552), - (16, 21, 2048), - (22, 23, 32784), - (22, 27, 1024), - ], - 216096, - ), -] - - -@pytest.mark.parametrize("lrs, expected_size", test_data) -def test_allocate(lrs, expected_size): - """Tests the search allocator""" - input = [x for lr in lrs for x in lr] - res = tensor_allocator.allocate(input, 0) - assert len(res) == len(lrs) - assert max(addr + lr[2] for addr, lr in zip(res, lrs)) == expected_size - - -def test_allocate_empty_input(): - assert [] == tensor_allocator.allocate([], 0) - - -invalid_input_test_data = [None, 3, [1, 2, 16, 9, 15], [1, 5, None], [-1, 0, 16], [1, 2, 10000000000]] - - -@pytest.mark.parametrize("input", invalid_input_test_data) -def test_allocate_invalid_input(input): - """Tests the search allocator with invalid input data""" - with pytest.raises(Exception): - tensor_allocator.allocate(input, 0) diff --git a/ethosu/vela/hillclimb_allocation.py b/ethosu/vela/hillclimb_allocation.py new file mode 100644 index 00000000..de53ab83 --- /dev/null +++ b/ethosu/vela/hillclimb_allocation.py @@ -0,0 +1,310 @@ +# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Description: +# Tensor allocator based on a hill-climb search +import random +from typing import List +from typing import Set + +from .live_range import LiveRange + + +class LiveRangeInfo: + def __init__(self, id: int, start_time: int, end_time: int, size: int): + # Index of this live range + self.id = id + # Start time (input to the allocator algorithm) + self.start_time = start_time + # End time, inclusive (input to the allocator algorithm) + self.end_time = end_time + # Size in bytes (input to the allocator algorithm) + self.size = size + # Allocated address (the main output from the allocator algorithm) + self.address: int = 0 + # End address, exclusive + self.end_address: int = 0 + # id of predecessor live range (predecessor's end address == this lr's address) + self.predecessor: int = 0 + # Turn at which the live range was allocated + self.turn: int = 0 + # Max value of size_at_time (only used in the heuristic allocation) + self.urgency = 0 + self.neighbours: List["LiveRangeInfo"] = [] + + def overlaps(self, addr2: int, size2: int) -> int: + return self.address < addr2 + size2 and addr2 < self.end_address + + def is_neighbour(self, lr: "LiveRangeInfo") -> bool: + return self.start_time <= lr.end_time and lr.start_time <= self.end_time + + def __str__(self): + return "".format( + self.id, self.start_time, self.end_time, self.size, self.address + ) + + def __lt__(self, other) -> bool: + if self.urgency != other.urgency: + return self.urgency > other.urgency + duration1 = self.end_time - self.start_time + duration2 = other.end_time - other.start_time + if duration1 != duration2: + return duration1 > duration2 + + if self.start_time != other.start_time: + return self.start_time < other.start_time + + if self.size != other.size: + return self.size > other.size + + return self.id < other.id + + +class HillClimbAllocator: + """ + Implements tensor allocator using a hill climb search. + + The basic algorithm is: + + Use a heuristic allocator to find an initial allocation + while allocation is not optimal and iterations < MAX_ITERATIONS: + find the "bottleneck": the live range with highest end address + find all live ranges that affected the allocation of the bottleneck + swap the order of any two affecting live ranges + reallocate tensors using the reordered live ranges + if the new allocation is better: keep it, else set allocation to previous allocation + """ + + MAX_ITERATIONS = 500 + NOT_ALLOCATED = -1 + # Used for live ranges allocated at address 0 + NO_PREDECESSOR = -1 + + def __init__(self, live_ranges: List[LiveRange]): + # Contains the live ranges + self.lrs: List[LiveRangeInfo] = [ + LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size) for id, lr in enumerate(live_ranges) + ] + self.lrs_at_time = [] + # The available size (input to algorithm). + self.available_size: int = 0 + # The algorithm stops once the target size has been achieved + self.target_size: int = 0 + # The highest end address of the best found allocation + self.best_size: int = 1 << 63 + # For each live range: max value of size_at_time (only used in the heuristic allocation) + self.lr_urgency = len(self.lrs) * [0] + nr_time_slots = 1 + max(lr.end_time for lr in self.lrs) + # Contains active live ranges at each timestamp + self.lrs_at_time = [[] for i in range(nr_time_slots)] + for lr in self.lrs: + for t in range(lr.start_time, lr.end_time + 1): + self.lrs_at_time[t].append(lr) + # At each timestamp: accumulated size of active live ranges + size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)] + # The minimum possible size, assuming all live ranges can be perfectly allocated + self.min_required_size: int = max(size_at_time) + # Calculate all neighbours + the urgency of each live range + for lr in self.lrs: + lr.urgency = 0 + lr.neighbours = [] + neighbours = set() + for t in range(lr.start_time, lr.end_time + 1): + lr.urgency = max(size_at_time[t], lr.urgency) + for lr2 in self.lrs_at_time[t]: + if lr2 not in neighbours and lr != lr2: + neighbours.add(lr2) + lr.neighbours.append(lr2) + + def allocate_lr(self, lr: LiveRangeInfo): + """ + Allocates the given live range at the smallest possible address + """ + address = 0 + predecessor = HillClimbAllocator.NO_PREDECESSOR + fits = False + while not fits: + fits = True + # Find neighbours that overlap with address + for lr2 in lr.neighbours: + if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address: + continue + if lr2.overlaps(address, lr.size): + # Overlap found increase address + fits = False + address = lr2.end_address + predecessor = lr2.id + lr.address = address + lr.end_address = address + lr.size + lr.predecessor = predecessor + + def allocate_indices(self, indices: List[int]): + """ + Allocates the live ranges in the order indicated by the indices; + allocates each live range at the lowest possible address. + """ + for lr in self.lrs: + lr.address = HillClimbAllocator.NOT_ALLOCATED + size = 0 + for turn, index in enumerate(indices): + lr = self.lrs[index] + self.allocate_lr(lr) + lr.turn = turn + size = max(size, lr.end_address) + if size > self.best_size: + # This allocation is worse than the best known allocation; + # no need to continue + break + return size + + def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo): + """ + Adds the given live range + predecessors to the turns vector. + Note: the turn_set is for quick detection of duplicates, + the turn_list is to get reproduceable results + """ + if lr.turn not in turn_set: + turn_set.add(lr.turn) + turn_list.append(lr.turn) + id = lr.id + while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR: + id = self.lrs[id].predecessor + turn = self.lrs[id].turn + if turn not in turn_set: + turn_set.add(turn) + turn_list.append(turn) + + def attempt_bottleneck_fix(self, indices: List[int]): + """ + Finds the "bottleneck", the live range with highest end address, and reorders the indices + such that a next allocation might lower the memory usage. + + --------- + | | + | D | + | | + ---------------------------------- + | B | + ------------------------------- + | | + |A| --- + | | |C| + | | | | + --------------------------------------- + + In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck. + The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C), + and all direct and indirect predecessors of D and its neighbours + (i.e. A, which is the predecessor of B, and indirect predecessor of D). + + By permuting the order in which the affecting live ranges are allocated, the bottleneck might + be lowered. In the above example, almost any permutation would lower the bottleneck. + """ + # Find the bottleneck + max_lr = self.lrs[0] + for lr in self.lrs[1:]: + if lr.end_address > max_lr.end_address: + max_lr = lr + + # Find all live ranges that affected the placement of the bottleneck live range. + # This consists of two types of live ranges: + # - direct neighbours of the bottleneck live range + # - direct and indirect predecessors of these neighbours + bottleneck + # The turns at which these live ranges were allocated are put in the turns set. + turn_set = set() + turn_list = list() + self.add_predecessor_turns(turn_set, turn_list, max_lr) + for lr2 in max_lr.neighbours: + self.add_predecessor_turns(turn_set, turn_list, lr2) + + # Non-direct neighbours that interfere with the allocation of the bottleneck are the + # immediate cause for gaps in the allocation, and are selected with higher probability. + non_nb_turn_list = [] + for turn in turn_list: + lr = self.lrs[indices[turn]] + if not max_lr.is_neighbour(lr): + non_nb_turn_list.append(turn) + assert turn_list + # Pick from non-neighbour list with 30% probability + # (magic number based on tuning) + if random.randint(0, 100) < 30 and non_nb_turn_list: + # Pick a live range from the "non-neighbour list" + ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)] + else: + # Pick any affecting live range. + ix1 = turn_list[random.randint(0, len(turn_list) - 1)] + + ix2 = turn_list[random.randint(0, len(turn_list) - 2)] + if ix1 == ix2: + ix2 = turn_list[-1] + # Swap indices + indices[ix1], indices[ix2] = indices[ix2], indices[ix1] + + def search(self, indices: List[int], iterations: int): + """ + Search for a solution, using the given indices as initial solution. + """ + best_indices = indices[:] + for _ in range(iterations): + # Reorder the indices + self.attempt_bottleneck_fix(indices) + # Allocate the reordered indices and check if it gave an improvement + new_size = self.allocate_indices(indices) + if new_size <= self.best_size: + # The new allocation produced a new best result remember it + self.best_size = new_size + best_indices = indices[:] + self.allocated_addresses = [lr.address for lr in self.lrs] + if self.best_size <= self.min_required_size: + # Target reached stop + return + else: + # The new allocation produced worse result undo the change + indices = best_indices[:] + + def allocate(self) -> List[int]: + """ + Runs the allocation algorithm. Finishes when an optimal solution has been + found or when maximum iterations have been run. + The allocated addresses are placed in the output vector, in the same + order as the input vector. + + Implementation note: the algorithm produces reproduceable results by using + a well-defined random number generator with well-defined default seed, + and using a fixed number of iterations. + """ + random.seed(1) + # Sort indices on priority. Note: self.lrs must be left unchanged + indices = [lr.id for lr in sorted(self.lrs)] + # Allocate the initial solution + self.best_size = self.allocate_indices(indices) + self.allocated_addresses = [lr.address for lr in self.lrs] + if self.best_size > self.min_required_size: + # Try to improve the heuristic allocation + self.search(indices, HillClimbAllocator.MAX_ITERATIONS) + # else the heuristic allocation returned an optimal solution; no search needed + return self.allocated_addresses + + +def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]: + """ + Allocates live ranges using a search based allocator. + Returns the list of allocated addresses (one for each live range) + """ + if not lrs: + return [] + allocator = HillClimbAllocator(lrs) + return allocator.allocate() diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py index db878bc3..c45d0e3e 100644 --- a/ethosu/vela/nn_graph.py +++ b/ethosu/vela/nn_graph.py @@ -38,7 +38,7 @@ class PassPlacement(enum.Enum): class TensorAllocator(enum.Enum): LinearAlloc = 1 Greedy = 2 - Search = 3 + HillClimb = 3 def __str__(self): return self.name diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py index 0a7da5da..b7057f0b 100644 --- a/ethosu/vela/tensor_allocation.py +++ b/ethosu/vela/tensor_allocation.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. +# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 # @@ -20,6 +20,7 @@ import math import numpy as np +from . import hillclimb_allocation from . import live_range from . import numeric_util from .errors import AllocationError @@ -30,7 +31,6 @@ from .tensor import MemArea from .tensor import MemType from .tensor import Tensor from .tensor import TensorPurpose -from ethosu import tensor_allocator def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum): @@ -63,22 +63,14 @@ def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.Allocation return total_sz -def search_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int: - # Allocates using the search-based allocator (implemented in C++) - input = [] - lrs = [] - lr_set = set() - for lr in live_ranges.ranges.values(): - lr_set.add((lr.start_time, lr.end_time, lr)) - lr_list = sorted(lr_set) - # Create a single array of ints containing start/end/size of the live ranges - for start, end, lr in lr_list: - input += [start, end, numeric_util.round_up(lr.size, alloc_granularity)] - lrs.append(lr) - addresses = tensor_allocator.allocate(input, 0) +def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int: + # Allocates using the hill climb allocator + lr_set = {(lr.start_time, lr.end_time, lr) for lr in live_ranges.ranges.values()} + lr_list = [lr for _, _, lr in lr_set] + addresses = hillclimb_allocation.allocate_live_ranges(lr_list) # The result is a list containing the allocated addresses total_sz = 0 - for lr, address in zip(lrs, addresses): + for lr, address in zip(lr_list, addresses): total_sz = max(total_sz, address + lr.size) lr.set_address(address) verify_allocation(live_ranges, alloc_granularity) @@ -179,8 +171,8 @@ def allocate_tensors( verify_allocation(lrs, cpu_tensor_alignment) elif tens_alloc == TensorAllocator.LinearAlloc: total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment) - elif tens_alloc == TensorAllocator.Search: - total_sz = search_allocate_live_ranges(lrs, cpu_tensor_alignment) + elif tens_alloc == TensorAllocator.HillClimb: + total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment) else: assert 0 alloc_ok = max_size is None or total_sz <= max_size diff --git a/ethosu/vela/test/test_hillclimb_allocation.py b/ethosu/vela/test/test_hillclimb_allocation.py new file mode 100644 index 00000000..8a56c3f2 --- /dev/null +++ b/ethosu/vela/test/test_hillclimb_allocation.py @@ -0,0 +1,63 @@ +# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Description: +# Unit tests for hillclimb_allocator. +import pytest + +from ethosu.vela.hillclimb_allocation import allocate_live_ranges +from ethosu.vela.live_range import LiveRange + + +test_data = [ + ([(0, 100, 8000), (0, 1, 8016), (100, 110, 2000), (108, 110, 4000), (109, 110, 6000)], 16016), + ( + [ + (0, 23, 131072), + (4, 5, 65568), + (4, 9, 8192), + (8, 30, 15360), + (10, 11, 65568), + (10, 15, 4096), + (16, 17, 65552), + (16, 21, 2048), + (22, 23, 32784), + (22, 27, 1024), + ], + 216096, + ), +] + + +def live_range(start_time, end_time, size): + lr = LiveRange(None, 1) + lr.start_time = start_time + lr.end_time = end_time + lr.size = size + return lr + + +@pytest.mark.parametrize("lrs, expected_size", test_data) +def test_allocate(lrs, expected_size): + """Tests the search allocator""" + lr_list = [live_range(start, end, size) for start, end, size in lrs] + res = allocate_live_ranges(lr_list) + assert len(res) == len(lrs) + assert max(addr + lr[2] for addr, lr in zip(res, lrs)) == expected_size + + +def test_allocate_empty_input(): + assert [] == allocate_live_ranges([]) diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py index c4510b18..72710025 100644 --- a/ethosu/vela/vela.py +++ b/ethosu/vela/vela.py @@ -317,7 +317,7 @@ def main(args=None): ) parser.add_argument( "--tensor-allocator", - default=TensorAllocator.Search, + default=TensorAllocator.HillClimb, type=lambda s: TensorAllocator[s], choices=list(TensorAllocator), help="Tensor Allocator algorithm (default: %(default)s)", diff --git a/setup.py b/setup.py index f67d42e7..98dfe525 100644 --- a/setup.py +++ b/setup.py @@ -44,12 +44,6 @@ mlw_module = Extension( ["ethosu/mlw_codec/mlw_encode.c", "ethosu/mlw_codec/mlw_decode.c", "ethosu/mlw_codec/mlw_codecmodule.c"], ) -tensor_allocator_module = Extension( - "ethosu.tensor_allocator", - ["ethosu/tensor_allocator/tensor_allocatormodule.cpp", "ethosu/tensor_allocator/search_allocator.cpp"], - depends=["ethosu/tensor_allocator/search_allocator.h"], -) - setup( name="ethos-u-vela", use_scm_version=True, @@ -66,7 +60,6 @@ setup( "Operating System :: POSIX :: Linux", "Operating System :: Microsoft :: Windows", "Programming Language :: C", - "Programming Language :: C++ :: C++11", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", @@ -84,6 +77,6 @@ setup( "lxml>=4.5.1", ], entry_points={"console_scripts": ["vela = ethosu.vela.vela:main"]}, - ext_modules=[mlw_module, tensor_allocator_module], + ext_modules=[mlw_module], setup_requires=["setuptools_scm"], ) -- cgit v1.2.1