aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLouis Verhaard <louis.verhaard@arm.com>2020-12-03 12:26:25 +0100
committerLouis Verhaard <louis.verhaard@arm.com>2020-12-11 12:24:25 +0100
commit9bfe0f86ea525055954b160a3c678024743030ec (patch)
tree384bac88fcc613abd357fbbf266ea30e34cbe526
parent93719a9b8c160de3acf047eacb9196f13167bddb (diff)
downloadethos-u-vela-9bfe0f86ea525055954b160a3c678024743030ec.tar.gz
MLBEDSW-1373: Added search based allocator
Added a new tensor allocator that is based on searching, implemented in C++ (C++11 compatible). Change-Id: Ie96e9fcfc8e6c58d1fa53911f37de290eeba88cf Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
-rw-r--r--OPTIONS.md4
-rw-r--r--ethosu/tensor_allocator/makefile50
-rw-r--r--ethosu/tensor_allocator/search_allocator.cpp266
-rw-r--r--ethosu/tensor_allocator/search_allocator.h185
-rw-r--r--ethosu/tensor_allocator/tensor_allocator_main.cpp77
-rw-r--r--ethosu/tensor_allocator/tensor_allocatormodule.cpp92
-rw-r--r--ethosu/tensor_allocator/test/test_tensor_allocator.py30
-rw-r--r--ethosu/vela/greedy_allocation.py26
-rw-r--r--ethosu/vela/nn_graph.py1
-rw-r--r--ethosu/vela/tensor_allocation.py50
-rw-r--r--ethosu/vela/vela.py2
-rw-r--r--setup.py9
12 files changed, 761 insertions, 31 deletions
diff --git a/OPTIONS.md b/OPTIONS.md
index a97ba4ea..22538a6b 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -183,8 +183,8 @@ vela network.tflite --config my_vela_cfg.ini --memory-mode My_Mem_Mode
Specify which allocator algorithm to use for non-constant NPU and CPU tensor
allocation.
**Type: String**
-**Default: Greedy**
-**Choices: [Greedy, LinearAlloc]**
+**Default: Search**
+**Choices: [Greedy, LinearAlloc, Search]**
```bash
vela network.tflite --tensor-allocator=LinearAlloc
diff --git a/ethosu/tensor_allocator/makefile b/ethosu/tensor_allocator/makefile
new file mode 100644
index 00000000..9636eb94
--- /dev/null
+++ b/ethosu/tensor_allocator/makefile
@@ -0,0 +1,50 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Makefile to build tensor_allocator_main
+
# OS detection: on Cygwin the produced binary needs a .exe suffix
UNAME=$(shell uname -o)

# Strict warnings-as-errors, LTO and stack protection; -fPIC so the objects
# can also be linked into the Python extension module
CXXFLAGS=--std=c++11 -pedantic-errors -Wall -Werror -Wdate-time
CXXFLAGS+=-fwrapv -fstack-protector-strong -flto -fuse-linker-plugin -ffat-lto-objects -fPIC

# Build with DEBUG=1 for an unoptimized build with debug info
ifeq ($(DEBUG),1)
    CXXFLAGS+=-g -O0 -DDEBUG
else
    CXXFLAGS+=-O2
endif

LIBSRCS=tensor_allocator_main.cpp search_allocator.cpp
LIBHDRS=search_allocator.h

ifeq ($(UNAME),Cygwin)
    TENSOR_ALLOCATOR_EXE=tensor_allocator_main.exe
else
    TENSOR_ALLOCATOR_EXE=tensor_allocator_main
endif

all: tensor_allocator_exe

.PHONY: tensor_allocator_exe
tensor_allocator_exe: $(TENSOR_ALLOCATOR_EXE)

clean:
	rm -f $(TENSOR_ALLOCATOR_EXE)

# Rebuild whenever the sources, headers or this makefile change
$(TENSOR_ALLOCATOR_EXE): $(LIBSRCS) $(LIBHDRS) makefile
	g++ $(CXXFLAGS) $(LIBSRCS) -o $(TENSOR_ALLOCATOR_EXE)
diff --git a/ethosu/tensor_allocator/search_allocator.cpp b/ethosu/tensor_allocator/search_allocator.cpp
new file mode 100644
index 00000000..ce5c46de
--- /dev/null
+++ b/ethosu/tensor_allocator/search_allocator.cpp
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Description:
+ * Implementation of the search-based allocator.
+ */
+
+#include <algorithm>
+#include <cstdint>
+#include <set>
+#include <vector>
+
+#include "search_allocator.h"
+
+SearchAllocator::SearchAllocator(const std::vector<LiveRange> &live_ranges, uint32_t size_limit) {
+ lrs = live_ranges;
+ uint32_t max_end_time = 0;
+ for (size_t i = 0; i < lrs.size(); ++i) {
+ auto &lr = lrs[i];
+ lr.id = i;
+ max_end_time = std::max(max_end_time, lr.end_time);
+ }
+ lrs_at_time.resize(max_end_time + 1);
+ size_at_time.resize(max_end_time + 1);
+ neighbours.resize(lrs.size());
+ // Calculate which live ranges are active at every timestamp
+ for (size_t t = 0; t <= max_end_time; ++t) {
+ lrs_at_time[t].clear();
+ }
+ for (auto &lr : lrs) {
+ for (auto t = lr.start_time; t <= lr.end_time; ++t) {
+ lrs_at_time[t].push_back(&lr);
+ }
+ }
+ min_required_size = 0;
+ for (size_t t = 0; t <= max_end_time; ++t) {
+ // Calculate minimum needed size at each timestamp
+ uint32_t size_at_t = 0;
+ for (auto &lr : lrs_at_time[t]) {
+ size_at_t += lr->size;
+ }
+ size_at_time[t] = size_at_t;
+ min_required_size = std::max(size_at_t, min_required_size);
+ // Calculate all neighbours
+ for (size_t i = 0; i < lrs_at_time[t].size(); ++i) {
+ auto lr1 = lrs_at_time[t][i];
+ auto &nb1 = neighbours[lr1->id];
+ for (size_t j = i + 1; j < lrs_at_time[t].size(); ++j) {
+ auto lr2 = lrs_at_time[t][j];
+ if (find(nb1.begin(), nb1.end(), lr2) == nb1.end()) {
+ nb1.push_back(lr2);
+ neighbours[lr2->id].push_back(lr1);
+ }
+ }
+ }
+ }
+ target_size = std::max(min_required_size, available_size);
+ // Calculate the urgency of each live range
+ lr_urgency.resize(lrs.size());
+ for (size_t i = 0; i < lrs.size(); ++i) {
+ auto &lr = lrs[i];
+ uint32_t urgency = 0;
+ for (size_t t = lr.start_time; t <= lr.end_time; ++t) {
+ urgency = std::max(size_at_time[t], urgency);
+ }
+ lr_urgency[i] = urgency;
+ }
+ best_size = UINT32_MAX;
+}
+
+uint32_t SearchAllocator::allocate(std::vector<uint32_t> &output) {
+ output.clear();
+ nr_iterations = 0;
+ std::vector<size_t> indices;
+ // Initial solution, using a heuristic allocator
+ for (size_t i = 0; i < lrs.size(); ++i) {
+ indices.push_back(i);
+ }
+ sort_indices_on_prio(indices);
+ // Allocate the initial solution
+ best_size = UINT32_MAX;
+ best_size = allocate_indices(indices);
+ if (best_size <= target_size) {
+ // The heuristic allocation returned an optimal solution.
+ // No need to search.
+ } else {
+ // Try to improve the heuristic allocation
+ search(indices, best_size, MAX_ITERATIONS);
+ }
+ output.clear();
+ for (auto &lr : lrs) {
+ output.push_back(lr.address);
+ }
+ return best_size;
+}
+
+void SearchAllocator::allocate_lr(LiveRange &lr) const {
+ uint32_t address = 0;
+ int predecessor = NO_PREDECESSOR;
+ bool fits = false;
+ while (!fits) {
+ fits = true;
+ // Find neighbours that overlap with address
+ for (auto lr2_p : neighbours[lr.id]) {
+ if (lr2_p->address == NOT_ALLOCATED || lr2_p->end_address <= address) {
+ continue;
+ }
+ if (lr2_p->overlaps(address, lr.size)) {
+ // Overlap found; increase address
+ fits = false;
+ address = lr2_p->end_address;
+ predecessor = lr2_p->id;
+ }
+ }
+ }
+ lr.address = address;
+ lr.end_address = address + lr.size;
+ lr.predecessor = predecessor;
+}
+
+uint32_t SearchAllocator::allocate_indices(const std::vector<size_t> &indices) {
+ ++nr_iterations;
+ std::vector<size_t> count(indices.size());
+ for (auto &lr : lrs) {
+ lr.address = NOT_ALLOCATED;
+ }
+ uint32_t size = 0;
+ for (size_t turn = 0; size <= best_size && turn < indices.size(); ++turn) {
+ auto &lr = lrs[indices[turn]];
+ allocate_lr(lr);
+ lr.turn = turn;
+ size = std::max(size, lr.end_address);
+ }
+ return size;
+}
+
+void SearchAllocator::sort_indices_on_prio(std::vector<size_t> &indices) const {
+ std::sort(indices.begin(), indices.end(),
+ [this] (size_t const& a, size_t const& b) {
+ if (lr_urgency[a] != lr_urgency[b]) {
+ return lr_urgency[a] > lr_urgency[b];
+ }
+ auto &lr1 = lrs[a];
+ auto &lr2 = lrs[b];
+ auto duration1 = lr1.end_time - lr1.start_time;
+ auto duration2 = lr2.end_time - lr2.start_time;
+ if (duration1 != duration2) {
+ return duration1 > duration2;
+ }
+ if (lr1.start_time != lr2.start_time) {
+ return lr1.start_time < lr2.start_time;
+ }
+ if (lr1.size != lr2.size) {
+ return lr1.size > lr2.size;
+ }
+ return lr1.id < lr2.id;
+ });
+}
+
+void SearchAllocator::add_predecessor_turns(std::set<size_t> &turns, const LiveRange &lr) const {
+ turns.insert(lr.turn);
+ int id = lr.id;
+ while (lrs[id].predecessor != NO_PREDECESSOR) {
+ id = lrs[id].predecessor;
+ turns.insert(lrs[id].turn);
+ }
+}
+
+void SearchAllocator::attempt_bottleneck_fix(std::vector<size_t> &indices) {
+ // Find the bottleneck
+ LiveRange *max_lr = &lrs[0];
+ for (auto &lr : lrs) {
+ if (lr.end_address > max_lr->end_address) {
+ max_lr = &lr;
+ }
+ }
+ // Find all live ranges that affected the placement of the bottleneck live range.
+ // This consists of two types of live ranges:
+ // - direct neighbours of the bottleneck live range
+ // - direct and indirect predecessors of these neighbours + bottleneck
+ // The turns at which these live ranges were allocated are put in the turns vector.
+ std::set<size_t> turns;
+ add_predecessor_turns(turns, *max_lr);
+ for (auto lr_p : neighbours[max_lr->id]) {
+ add_predecessor_turns(turns, *lr_p);
+ }
+ // Non-direct neighbours that interfere with the allocation of the bottleneck are the
+ // immediate cause for gaps in the allocation, and are selected with higher probability.
+ std::vector<size_t> turn_list;
+ std::vector<size_t> non_nb_turn_list;
+ for (auto turn : turns) {
+ turn_list.push_back(turn);
+ auto &lr = lrs[indices[turn]];
+ if (!max_lr->is_neighbour(lr)) {
+ non_nb_turn_list.push_back(turn);
+ }
+ }
+ size_t ix1;
+ if (rng() % 100 < 30 && !non_nb_turn_list.empty()) {
+ // Pick a live range from the "non-neighbour list"
+ ix1 = non_nb_turn_list[rng() % non_nb_turn_list.size()];
+ } else {
+ // Pick any affecting live range.
+ ix1 = turn_list[rng() % turn_list.size()];
+ }
+ size_t ix2 = turn_list[rng() % turn_list.size() - 1];
+ if (ix1 == ix2) {
+ ix2 = turn_list[turn_list.size() - 1];
+ }
+ // Swap indices
+ std::swap(indices[ix1], indices[ix2]);
+}
+
+void SearchAllocator::search(std::vector<size_t> &indices, uint32_t initial_size, int iterations) {
+ std::vector<size_t> best_indices = indices;
+ std::vector<LiveRange> best_lrs = lrs;
+ for (int i = 0; i < iterations; ++i) {
+ // Reorder the indices
+ attempt_bottleneck_fix(indices);
+ // Allocate the reordered indices and check if it gave an improvement
+ auto new_size = allocate_indices(indices);
+ if (new_size <= best_size) {
+ // The new allocation produced a new best result; remember it
+ best_size = new_size;
+ best_indices = indices;
+ best_lrs = lrs;
+ if (best_size <= target_size) {
+ // Target reached; stop
+ return;
+ }
+ } else {
+ // The new allocation produced worse result; undo the change
+ indices = best_indices;
+ lrs = best_lrs;
+ }
+ }
+ lrs = best_lrs;
+}
+
+uint32_t allocate(const std::vector<uint32_t> &input, int available_size, std::vector<uint32_t> &output) {
+ // Convert input to vector of live ranges
+ std::vector<LiveRange> lrs;
+ for (size_t i = 0; i < input.size(); i += 3) {
+ LiveRange lr;
+ lr.start_time = input[i];
+ lr.end_time = input[i+1];
+ lr.size = input[i+2];
+ lrs.push_back(lr);
+ }
+ SearchAllocator allocator(lrs, available_size);
+ return allocator.allocate(output);
+}
diff --git a/ethosu/tensor_allocator/search_allocator.h b/ethosu/tensor_allocator/search_allocator.h
new file mode 100644
index 00000000..0ef8688e
--- /dev/null
+++ b/ethosu/tensor_allocator/search_allocator.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Description:
+ * Declaration of the search-based allocator.
+ */
+
+#ifndef __SEARCH_ALLOCATOR_H
+#define __SEARCH_ALLOCATOR_H
+
+#include <algorithm>
+#include <cstdint>
+#include <random>
+#include <set>
+#include <vector>
+
/**
 * A single live range: the time interval [start_time, end_time] during
 * which a buffer of the given size must be resident, together with the
 * bookkeeping fields produced by the allocator.
 */
struct LiveRange {
    /** Start time (input to the allocator algorithm) */
    uint32_t start_time;
    /** End time, inclusive (input to the allocator algorithm) */
    uint32_t end_time;
    /** Size in bytes (input to the allocator algorithm) */
    uint32_t size;
    /** Index of this live range */
    int id;
    /** Allocated address (the main output from the allocator algorithm) */
    uint32_t address;
    /** End address, exclusive */
    uint32_t end_address;
    /** id of predecessor live range (predecessor's end address == this lr's address) */
    int predecessor;
    /** Turn at which the live range was allocated */
    size_t turn;

    /** True if [addr2, addr2 + size2) intersects [address, end_address) */
    bool overlaps(uint32_t addr2, uint32_t size2) const {
        bool other_below = addr2 + size2 <= address;  // other ends at/before our start
        bool other_above = end_address <= addr2;      // other starts at/after our end
        return !(other_below || other_above);
    }
    /** True if both live ranges are active at some common timestamp */
    bool is_neighbour(const LiveRange &lr) const {
        bool ends_before_us = lr.end_time < start_time;
        bool starts_after_us = end_time < lr.start_time;
        return !(ends_before_us || starts_after_us);
    }
};
+
+/**
+ * Implements tensor allocator using state space exploration.
+ *
+ * The basic algorithm is:
+ *
+ * Use a heuristic allocator to find an initial allocation
+ * while allocation is not optimal and iterations < MAX_ITERATIONS {
+ * find the "bottleneck": the live range with highest end address
+ * find all live ranges that affected the allocation of the bottleneck
+ * swap the order of any two affecting live ranges
+ * reallocate tensors using the reordered live ranges
+ * if the new allocation is better: keep it, else set allocation to previous allocation
+ * }
+ */
class SearchAllocator {
private:
    /** Iteration budget for the search phase */
    static constexpr int MAX_ITERATIONS = 500;
    /** Sentinel address marking a live range as not yet allocated */
    static constexpr uint32_t NOT_ALLOCATED = UINT32_MAX;
    /** Used for live ranges allocated at address 0 */
    static constexpr int NO_PREDECESSOR = -1;
    /** Contains the live ranges */
    std::vector<LiveRange> lrs;
    /** Contains active live ranges at each timestamp */
    std::vector<std::vector<LiveRange*>> lrs_at_time;
    /**
     * Contains neighbours of each live range (indexed by lr.id), i.e.
     * live ranges with overlapping start/end time.
     */
    std::vector<std::vector<LiveRange*>> neighbours;
    /**
     * At each timestamp: accumulated size of active live ranges
     */
    std::vector<uint32_t> size_at_time;
    /**
     * For each live range: max value of size_at_time (only used in the heuristic allocation)
     */
    std::vector<uint32_t> lr_urgency;
    /**
     * The minimum possible size, assuming all live ranges can be perfectly allocated
     */
    uint32_t min_required_size;
    /**
     * The available size (input to algorithm).
     */
    uint32_t available_size;
    /** The algorithm stops once the target size has been achieved */
    uint32_t target_size;
    /** The highest end address of the best found allocation */
    uint32_t best_size;
    /** Number of performed iterations */
    size_t nr_iterations = 0;
    /** Random number generator; use default seed (which is well-defined) */
    std::mt19937 rng;
public:
    SearchAllocator(const std::vector<LiveRange> &live_ranges, uint32_t size_limit);
    /**
     * Runs the allocation algorithm. Finishes when the target size has been
     * reached or when maximum iterations have been run.
     * The allocated addresses are placed in the output vector, in the same
     * order as the input vector.
     *
     * Implementation note: the algorithm produces reproduceable results by using
     * a well-defined random number generator with well-defined default seed,
     * and using a fixed number of iterations.
     */
    uint32_t allocate(std::vector<uint32_t> &output);
    /** Lower bound on any achievable allocation size (perfect packing) */
    uint32_t get_min_required_size() const {
        return min_required_size;
    }
    /** Number of allocation passes performed by the last allocate() call */
    size_t get_nr_iterations() const {
        return nr_iterations;
    }
private:
    /**
     * Allocates the given live range at the smallest possible address
     */
    void allocate_lr(LiveRange &lr) const;
    /**
     * Allocates the live ranges in the order indicated by the indices;
     * allocates each live range at the lowest possible address.
     */
    uint32_t allocate_indices(const std::vector<size_t> &indices);
    /** Sorts live ranges based on heuristics, used for the initial allocation */
    void sort_indices_on_prio(std::vector<size_t> &indices) const;
    /** Adds the given live range + predecessors to the turns vector */
    void add_predecessor_turns(std::set<size_t> &turns, const LiveRange &lr) const;
    /**
     * Finds the "bottleneck", the live range with highest end address, and reorders the indices
     * such that a next allocation might lower the memory usage.
     *
     * ---------
     * | |
     * | D |
     * | |
     * ----------------------------------
     * | B |
     * -------------------------------
     * | |
     * |A| ---
     * | | |C|
     * | | | |
     * ---------------------------------------
     *
     * In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
     * The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
     * and all direct and indirect predecessors of D and its neighbours
     * (i.e. A, which is the predecessor of B, and indirect predecessor of D).
     *
     * By permuting the order in which the affecting live ranges are allocated, the bottleneck might
     * be lowered. In the above example, almost any permutation would lower the bottleneck.
     *
     * Note that there is room to improve the efficiency of the algorithm.
     * One way could be to first allocate all direct neighbours of the bottleneck
     * (i.e. B, C, D) and then the other affecting live ranges (i.e. A). The algorithm currently does
     * not actively try this, as it may lead to allocation loops (A could become the new bottle-neck);
     * it just uses a higher probability of selecting A.
     */
    void attempt_bottleneck_fix(std::vector<size_t> &indices);
    /** Search for a solution, using the given indices as initial solution. */
    // NOTE(review): initial_size is not consulted by the implementation in
    // search_allocator.cpp — consider removing it or documenting the intent.
    void search(std::vector<size_t> &indices, uint32_t initial_size, int iterations);
};
+
+/** Wrapper function to perform live range allocation */
+uint32_t allocate(const std::vector<uint32_t> &input, int available_size, std::vector<uint32_t> &output);
+
+#endif // __SEARCH_ALLOCATOR_H
diff --git a/ethosu/tensor_allocator/tensor_allocator_main.cpp b/ethosu/tensor_allocator/tensor_allocator_main.cpp
new file mode 100644
index 00000000..27d96ef7
--- /dev/null
+++ b/ethosu/tensor_allocator/tensor_allocator_main.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <iostream>
+#include <vector>
+
+#include "search_allocator.h"
+
+using namespace std;
+
+/**
+ * Reads live ranges from the input, and then performs allocation.
+ * The input has format:
+
+<number of live ranges>
+<start_time> <end_time> <size>
+...
+
+ * e.g.:
+4
+0 20 4096
+2 8 16000
+4 10 800
+6 20 1024
+ */
+int main() {
+ int lr_count;
+ cin >> lr_count;
+ cin.ignore();
+ vector<uint32_t> input;
+ vector<uint32_t> output;
+ for (int i = 0; i < lr_count; ++i) {
+ LiveRange lr;
+ cin >> lr.start_time >> lr.end_time >> lr.size;
+ lr.id = i;
+ cin.ignore();
+ input.push_back(lr.start_time);
+ input.push_back(lr.end_time);
+ input.push_back(lr.size);
+ }
+ vector<LiveRange> lrs;
+ for (size_t i = 0; i < input.size(); i += 3) {
+ LiveRange lr;
+ lr.start_time = input[i];
+ lr.end_time = input[i+1];
+ lr.size = input[i+2];
+ lrs.push_back(lr);
+ }
+ SearchAllocator allocator(lrs, 0);
+ uint32_t result = allocator.allocate(output);
+ printf("Output:\n");
+ for (int i = 0; i < lr_count; ++i) {
+ printf("%4d: %6d %4d-%4d size %6d\n", i, output[i], input[3*i], input[3*i+1], input[3*i+2]);
+ }
+ uint32_t min_size = allocator.get_min_required_size();
+ double overhead = 100.0 * (result - min_size)/(double)min_size;
+ printf("Total size: %d (%1.1f K), minimum required size: %d, overhead: %1.2f%%\n",
+ result, result/1024.0, min_size, overhead);
+ printf("Search used %ld iterations\n", (long)allocator.get_nr_iterations());
+ return 0;
+}
diff --git a/ethosu/tensor_allocator/tensor_allocatormodule.cpp b/ethosu/tensor_allocator/tensor_allocatormodule.cpp
new file mode 100644
index 00000000..79ee95ad
--- /dev/null
+++ b/ethosu/tensor_allocator/tensor_allocatormodule.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#include <vector>
+#include "search_allocator.h"
+
+
+
+/**
+ * C++ extension wrapper for allocate
+ *
+ * This method is exposed directly in python with the arguments with a
+ * prototype of the form:
+ *
+ * output = tensor_allocator.allocate(input, available_size=0)
+ *
+ * input: [int]
+ * available_size: int
+ * output: [int]
+ */
/**
 * Python-callable wrapper around allocate(): converts the Python int list
 * to a C++ vector, runs the allocator, and returns the addresses as a new
 * Python list.
 */
static PyObject *method_allocate (PyObject *self, PyObject *args)
{
    /* Object to hold the input integer list. */
    PyObject *input_list_object;

    /* Object to hold the available size */
    int available_size = 0;

    /* Arguments to the method are delivered as a tuple, unpack the
     * tuple to get the individual arguments, note the second is
     * optional.
     */
    if (!PyArg_ParseTuple(args, "O|i", &input_list_object, &available_size)) {
        return NULL;
    }

    /* Unpack the length of the input integer list. */
    /* NOTE(review): if the argument has no length, PyObject_Length returns
     * -1 with an exception set; the clamp below treats that as an empty
     * input but does not clear the pending exception — confirm intended. */
    int input_length = PyObject_Length (input_list_object);
    if (input_length < 0) {
        input_length = 0;
    }
    std::vector<uint32_t> input;
    std::vector<uint32_t> output;
    for (int i = 0; i < input_length; ++i) {
        /* NOTE(review): PyList_GetItem/PyLong_AsLong failures (non-list,
         * non-int element) are not checked here — verify against callers. */
        PyObject *obj = PyList_GetItem(input_list_object, i);
        uint32_t value = (uint32_t)PyLong_AsLong(obj);
        input.push_back(value);
    }
    /* Run the C++ allocator; output receives one address per live range */
    allocate(input, available_size, output);
    PyObject *output_list = PyList_New(output.size());
    for (size_t i = 0; i < output.size(); ++i) {
        /* PyList_SetItem steals the reference to the new int */
        PyList_SetItem(output_list, i, PyLong_FromLong(output[i]));
    }
    return output_list;
}
+
/** tensor_allocator method descriptors. */
static PyMethodDef tensor_allocator_methods[] = {
    {"allocate", method_allocate, METH_VARARGS, "Python interface for allocate"},
    {NULL, NULL, 0, NULL}  /* sentinel terminating the method table */
};

/** tensor_allocator module descriptor. */
static struct PyModuleDef tensor_allocatormodule = {
    PyModuleDef_HEAD_INIT,
    "tensor_allocator",
    "Python interface for tensor_allocator",
    -1,  /* module keeps state in global variables */
    tensor_allocator_methods
};

/** Module entry point, invoked on "import tensor_allocator". */
PyMODINIT_FUNC PyInit_tensor_allocator(void) {
    return PyModule_Create(&tensor_allocatormodule);
}
diff --git a/ethosu/tensor_allocator/test/test_tensor_allocator.py b/ethosu/tensor_allocator/test/test_tensor_allocator.py
new file mode 100644
index 00000000..b9b547fc
--- /dev/null
+++ b/ethosu/tensor_allocator/test/test_tensor_allocator.py
@@ -0,0 +1,30 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Description:
+# Unit tests for tensor_allocator.
+from ethosu import tensor_allocator
+
+
def test_allocate():
    """Tests the search allocator"""
    # Create an input that requires a search to produce a perfect allocation.
    # Should result in size 16016
    lrs = [(0, 100, 8000), (0, 1, 8016), (100, 110, 2000), (108, 110, 4000), (109, 110, 6000)]
    # Flat [start, end, size, ...] triples; renamed from "input", which
    # shadowed the Python builtin of the same name.
    lr_data = [x for lr in lrs for x in lr]
    res = tensor_allocator.allocate(lr_data, 0)
    assert len(res) == len(lrs)
    assert max(addr + lr[2] for addr, lr in zip(res, lrs)) == 16016
diff --git a/ethosu/vela/greedy_allocation.py b/ethosu/vela/greedy_allocation.py
index 58d948c2..b0395def 100644
--- a/ethosu/vela/greedy_allocation.py
+++ b/ethosu/vela/greedy_allocation.py
@@ -16,7 +16,6 @@
# Description:
# Allocate tensor addresses using a greedy algorithm.
from . import numeric_util
-from .errors import AllocationError
class GreedyAllocator:
@@ -70,33 +69,8 @@ class GreedyAllocator:
self.alloc(new_lr)
- self.verify_allocation(alignment)
return self.memory_required
- def verify_allocation(self, alignment):
- lrs = list(self.live_ranges.ranges.values())
- for n in lrs:
- for tens in n.tensors:
- if not all(op and op.run_on_npu for op in tens.ops + tens.consumer_list):
- # This is a CPU tensor, verify alignment
- if tens.address % alignment != 0:
- raise AllocationError("Tensor {} not aligned to {} bytes".format(tens.name, alignment))
-
- for m in lrs:
- if n != m and n.overlaps_ranges(m):
- overlap, tens_n, tens_m = n.overlaps_address(m)
- if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address):
- raise AllocationError(
- "Overlapping buffers: {}: {} -> {} and {}: {} -> {}".format(
- n.name,
- tens_n.address,
- tens_n.address + n.size,
- m.name,
- tens_m.address,
- tens_m.address + m.size,
- )
- )
-
def allocate_live_ranges(nng, arch, live_ranges, mem_area, alignment, verbose_allocation=False):
g = GreedyAllocator(nng, arch, live_ranges, mem_area)
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index b2877851..0ae3de9a 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -36,6 +36,7 @@ class PassPlacement(enum.Enum):
class TensorAllocator(enum.Enum):
LinearAlloc = 1
Greedy = 2
+ Search = 3
def __str__(self):
return self.name
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index d1a33728..9736ca22 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -24,11 +24,13 @@ from . import live_range
from . import numeric_util
from .errors import AllocationError
from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
+from .live_range import LiveRangeGraph
from .nn_graph import TensorAllocator
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorPurpose
+from ethosu import tensor_allocator
def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum):
@@ -61,7 +63,29 @@ def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.Allocation
return total_sz
-def verify_alignment(live_ranges, alignment):
def search_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
    """Allocates live ranges using the search-based allocator (implemented in C++).

    Sets the address on every live range and returns the total allocated size
    (the highest end address). Raises AllocationError via verify_allocation if
    the result is inconsistent.
    """
    # Flat [start, end, size, ...] triples passed to the C++ extension;
    # renamed from "input", which shadowed the Python builtin.
    lr_data = []
    lrs = []
    lr_set = set()
    for lr in live_ranges.ranges.values():
        # NOTE(review): sorted() below compares tuples; on (start, end) ties it
        # falls back to comparing the lr objects — confirm they are orderable.
        lr_set.add((lr.start_time, lr.end_time, lr))
    lr_list = sorted(lr_set)
    # Create a single array of ints containing start/end/size of the live ranges
    for start, end, lr in lr_list:
        lr_data += [start, end, numeric_util.round_up(lr.size, alloc_granularity)]
        lrs.append(lr)
    addresses = tensor_allocator.allocate(lr_data, 0)
    # The result is a list containing the allocated addresses
    total_sz = 0
    for lr, address in zip(lrs, addresses):
        total_sz = max(total_sz, address + lr.size)
        lr.set_address(address)
    verify_allocation(live_ranges, alloc_granularity)
    return total_sz
+
+
+def verify_alignment(live_ranges: LiveRangeGraph, alignment: int):
for lr in live_ranges.ranges.values():
for tens in lr.tensors:
if not all(op and op.run_on_npu for op in tens.ops + tens.consumer_list):
@@ -70,6 +94,27 @@ def verify_alignment(live_ranges, alignment):
raise AllocationError("Tensor {} not aligned to {} bytes".format(tens.name, alignment))
def verify_allocation(live_ranges: LiveRangeGraph, alignment: int):
    """Verifies a finished allocation.

    Raises AllocationError if any CPU tensor is misaligned, or if two live
    ranges that overlap in time were given overlapping addresses (unless the
    tensors are equivalent and share the same address).
    """
    lrs = list(live_ranges.ranges.values())
    # Hoisted out of the loop: the original called verify_alignment once per
    # live range, re-checking every range each time (O(n^2) redundant work).
    # A single call raises the same error in the same cases.
    verify_alignment(live_ranges, alignment)
    for n in lrs:
        for m in lrs:
            if n != m and n.overlaps_ranges(m):
                overlap, tens_n, tens_m = n.overlaps_address(m)
                if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address):
                    raise AllocationError(
                        "Overlapping buffers: {}: {} -> {} and {}: {} -> {}".format(
                            n.name,
                            tens_n.address,
                            tens_n.address + n.size,
                            m.name,
                            tens_m.address,
                            tens_m.address + m.size,
                        )
                    )
+
+
def mark_sram_used_for_cascaded_passes(sg, lrs):
end_pos = max(ps.time for ps in sg.cascaded_passes) + 2
mem_usage = np.zeros(end_pos, dtype=np.int64)
@@ -137,8 +182,11 @@ def allocate_tensors(
tens_alloc = tensor_allocator
if tens_alloc == TensorAllocator.Greedy:
total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment, verbose_allocation)
+ verify_allocation(lrs, cpu_tensor_alignment)
elif tens_alloc == TensorAllocator.LinearAlloc:
total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
+ elif tens_alloc == TensorAllocator.Search:
+ total_sz = search_allocate_live_ranges(lrs, cpu_tensor_alignment)
else:
assert 0
alloc_ok = max_size is None or total_sz <= max_size
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index 37de1ed2..d27eef0e 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -282,7 +282,7 @@ def main(args=None):
)
parser.add_argument(
"--tensor-allocator",
- default=TensorAllocator.Greedy,
+ default=TensorAllocator.Search,
type=lambda s: TensorAllocator[s],
choices=list(TensorAllocator),
help="Tensor Allocator algorithm (default: %(default)s)",
diff --git a/setup.py b/setup.py
index 57cc2aee..805062b0 100644
--- a/setup.py
+++ b/setup.py
@@ -44,6 +44,12 @@ mlw_module = Extension(
["ethosu/mlw_codec/mlw_encode.c", "ethosu/mlw_codec/mlw_decode.c", "ethosu/mlw_codec/mlw_codecmodule.c"],
)
+tensor_allocator_module = Extension(
+ "ethosu.tensor_allocator",
+ ["ethosu/tensor_allocator/tensor_allocatormodule.cpp", "ethosu/tensor_allocator/search_allocator.cpp"],
+ depends=["ethosu/tensor_allocator/search_allocator.h"],
+)
+
setup(
name="ethos-u-vela",
use_scm_version=True,
@@ -59,6 +65,7 @@ setup(
"License :: OSI Approved :: Apache Software License",
"Operating System :: POSIX :: Linux",
"Programming Language :: C",
+ "Programming Language :: C++ :: C++11",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
@@ -71,6 +78,6 @@ setup(
python_requires="~=3.6", # We support only 3.6+
install_requires=["flatbuffers==1.11.0", "numpy>=1.16.6", "lxml>=4.5.1"],
entry_points={"console_scripts": ["vela = ethosu.vela.vela:main"]},
- ext_modules=[mlw_module],
+ ext_modules=[mlw_module, tensor_allocator_module],
setup_requires=["setuptools_scm"],
)