about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Georgios Pinitas <georgios.pinitas@arm.com> 2018-04-27 18:49:44 +0100
committer Anthony Barbier <anthony.barbier@arm.com> 2018-11-02 16:50:48 +0000
commit b3b1e0b3e349ae607297bbca3f273d3d0dd19679 (patch)
tree ee2e939ab62b4104e364396d18a1169742eae8a4
parent 57dac8400d56a4b68975d5563a9540c96d49fe5f (diff)
download ComputeLibrary-b3b1e0b3e349ae607297bbca3f273d3d0dd19679.tar.gz
COMPMID-1010: Remove RSH profiler header
Change-Id: I2967ec94c3bead0b92ff1d1581ff6afea21c7f04
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/129405
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
-rw-r--r-- arm_compute/core/NEON/kernels/convolution/common/profiler.hpp 341
-rw-r--r-- arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp 3
-rw-r--r-- src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp 96
3 files changed, 25 insertions, 415 deletions
diff --git a/arm_compute/core/NEON/kernels/convolution/common/profiler.hpp b/arm_compute/core/NEON/kernels/convolution/common/profiler.hpp
deleted file mode 100644
index c6897e3771..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/profiler.hpp
+++ /dev/null
@@ -1,341 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <cstdio>
-#include <cstring>
-#include <chrono>
-#include <unistd.h>
-
-#ifdef CYCLE_PROFILING
-#include <algorithm>
-#include <cmath>
-#include <map>
-#include <mutex>
-#include <thread>
-#include <vector>
-
-#include "perf.h"
-#endif // CYCLE_PROFILING
-
-#ifdef CYCLE_PROFILING
-class EventIDContainer
-{
- public:
- EventIDContainer() : container_lock(), event_ids()
- {
- }
-
- int get_event_id(const char *id)
- {
- std::lock_guard<std::mutex> lock(container_lock);
- if (!event_ids.count(id)) {
- event_ids.emplace(id, event_ids.size());
- }
- return event_ids[id];
- }
-
- unsigned int size() const
- {
- return event_ids.size();
- }
-
- auto begin()
- {
- return event_ids.begin();
- }
-
- auto end()
- {
- return event_ids.end();
- }
-
- private:
- std::mutex container_lock;
- std::map<const char *, int> event_ids;
-};
-
-
-class ThreadEventCounterContainer
-{
- public:
- ThreadEventCounterContainer() : container_lock(), thread_counter_fds()
- {
- }
-
- int get_counter_fd()
- {
- const auto id = std::this_thread::get_id();
- std::lock_guard<std::mutex> lock(container_lock);
- if (!thread_counter_fds.count(id))
- {
- thread_counter_fds.emplace(id, open_cycle_counter());
- }
- return thread_counter_fds[id];
- }
-
- ~ThreadEventCounterContainer()
- {
- // Close all counter file descriptors
- for (auto& fd : thread_counter_fds)
- {
- close(fd.second);
- }
- }
-
- private:
- std::mutex container_lock;
- std::map<std::thread::id, int> thread_counter_fds;
-};
-#endif // CYCLE_PROFILING
-
-
-class profiler {
-private:
-#ifdef CYCLE_PROFILING
- struct ProfileEntry {
- int event_id;
- long int bytes_read, ops, bytes_written;
- long int duration;
- };
-
- static const int maxevents = 10000;
- ProfileEntry events[maxevents];
- int currentevent;
- std::mutex event_lock;
-
- EventIDContainer event_ids;
- ThreadEventCounterContainer thread_counter_fds;
-
- int get_event_id(const char *id)
- {
- return event_ids.get_event_id(id);
- }
-#endif // CYCLE_PROFILING
-
-public:
-#ifdef CYCLE_PROFILING
- profiler() :
- currentevent(0),
- event_lock(),
- event_ids(),
- thread_counter_fds()
- {
- }
-
- ~profiler() {
- std::lock_guard<std::mutex> lock_events(event_lock);
-
- // Compute performance from recorded events
- struct ProfileResult {
- ProfileResult() : total_calls(0),
- total_duration(0),
- total_bytes_read(0),
- total_ops(0),
- total_bytes_written(0) {
- }
-
- void operator+=(const ProfileEntry &rhs) {
- total_calls++;
- total_duration += rhs.duration;
- total_bytes_read += rhs.bytes_read;
- total_ops += rhs.ops;
- total_bytes_written = rhs.bytes_written;
- }
-
- float avg_duration(void) const {
- return static_cast<float>(total_duration) /
- static_cast<float>(total_calls);
- }
-
- float bytes_read_per_cycle(void) const {
- return static_cast<float>(total_bytes_read) /
- static_cast<float>(total_duration);
- }
-
- float ops_per_cycle(void) const {
- return static_cast<float>(total_ops) /
- static_cast<float>(total_duration);
- }
-
- float bytes_written_per_cycle(void) const {
- return static_cast<float>(total_bytes_written) /
- static_cast<float>(total_duration);
- }
-
- long int total_calls,
- total_duration,
- total_bytes_read,
- total_ops,
- total_bytes_written;
- };
-
- std::vector<ProfileResult> totals;
- totals.resize(event_ids.size());
- for (int i = 0; i < currentevent; i++) {
- const auto &event = events[i];
- totals[event.event_id] += event;
- }
-
- // Get the longest label
- int len_label = 0;
- for (const auto &kv : event_ids) {
- len_label = std::max(len_label, static_cast<int>(strlen(kv.first)));
- }
-
- // Get the longest values for every other field
- const auto get_length_of_field =
- [totals] (const char *title, auto f, auto len) -> size_t {
- size_t l = strlen(title);
- for (const auto &v : totals) {
- l = std::max(l, len(f(v)));
- }
- return l;
- };
-
- // Get the strlen for an int
- const auto intlen = [] (long int x) -> size_t {
- size_t len = 0;
- do {
- x /= 10;
- len++;
- } while (x);
- return len;
- };
-
- // Get the strlen for a float
- const auto floatlen = [] (const int precision) {
- return [precision] (float x) {
- size_t len = 0;
-
- if (!std::isfinite(x)) {
- return static_cast<size_t>(3);
- }
-
- do {
- x /= 10.0f;
- len++;
- } while (x > 1.0f);
- return len + 1 + precision;
- };
- };
-
- const int len_calls = get_length_of_field(
- "Calls", [] (const auto &v) {return v.total_calls;},
- intlen
- );
- const int len_duration = get_length_of_field(
- "Duration", [] (const auto &v) {return v.total_duration;},
- intlen
- );
- const int len_average_duration = get_length_of_field(
- "Average", [] (const auto &v) {return v.avg_duration();},
- floatlen(2)
- );
- const int len_reads_per_cycle = get_length_of_field(
- "Reads / cycle",
- [] (const auto &v) {return v.bytes_read_per_cycle();},
- floatlen(6)
- );
- const int len_ops_per_cycle = get_length_of_field(
- "Ops / cycle",
- [] (const auto &v) {return v.ops_per_cycle();},
- floatlen(6)
- );
- const int len_writes_per_cycle = get_length_of_field(
- "Writes / cycle",
- [] (const auto &v) {return v.bytes_written_per_cycle();},
- floatlen(6)
- );
-
- // Print header
- printf(
- "%*s %*s %*s %*s %*s %*s %*s\n",
- len_label, "",
- len_calls, "Calls",
- len_duration, "Duration",
- len_average_duration, "Average",
- len_reads_per_cycle, "Reads / cycle",
- len_ops_per_cycle, "Ops / cycle",
- len_writes_per_cycle, "Writes / cycle"
- );
- for (const auto &kv : event_ids) {
- const auto id = kv.second;
- printf(
- "%*s %*ld %*ld %*.2f %*.6f %*.6f %*.6f\n",
- len_label, kv.first,
- len_calls, totals[id].total_calls,
- len_duration, totals[id].total_duration,
- len_average_duration, totals[id].avg_duration(),
- len_reads_per_cycle, totals[id].bytes_read_per_cycle(),
- len_ops_per_cycle, totals[id].ops_per_cycle(),
- len_writes_per_cycle, totals[id].bytes_written_per_cycle()
- );
- }
- printf("\n");
- }
-#endif // CYCLE_PROFILING
-
- template <typename T>
- double operator() (const char * event,
- T func,
- long int bytes_read = 0,
- long int ops = 0,
- long int bytes_written = 0) {
-#ifdef CYCLE_PROFILING
- if (currentevent==maxevents) {
- const std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
- func();
- const std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
- return std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
- } else {
- const auto countfd = thread_counter_fds.get_counter_fd();
- const std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
- start_counter(countfd);
- func();
- long long cycs = stop_counter(countfd);
- const std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
- return std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
-
- // Store the profiling data
- std::lock_guard<std::mutex> lock_events(event_lock);
- events[currentevent++] = {
- get_event_id(event), bytes_read, ops, bytes_written, cycs
- };
-
- return duration_us;
- }
-#else
- (void) event;
- (void) bytes_read;
- (void) ops;
- (void) bytes_written;
- const std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now();
- func();
- const std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now();
- return std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
-#endif // CYCLE_PROFILING
- }
-};
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp
index f3b2bb10ed..dd67e97035 100644
--- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp
+++ b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp
@@ -27,7 +27,6 @@
#include "arm_compute/core/NEON/kernels/convolution/common/alloc.hpp"
#include "arm_compute/core/NEON/kernels/convolution/common/convolution.hpp"
#include "gemm.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/profiler.hpp"
#include "arm_compute/core/NEON/kernels/convolution/common/shims.hpp"
#include "arm_compute/core/NEON/kernels/convolution/common/tensor.hpp"
#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
@@ -439,8 +438,6 @@ class WinogradGEMM
const int tile_rows; /** Number of rows of tiles. */
const int tile_cols; /** Number of columns of tiles. */
const int M, K, N; /** Sizes of underlying fundamental matrix multiplications. */
-
- profiler prof;
};
};
diff --git a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp b/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
index 8f8cd250bf..a0ecaea4d4 100644
--- a/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/winograd_gemm.cpp
@@ -24,6 +24,8 @@
#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
#include "arm_compute/core/NEON/kernels/convolution/winograd/batched_blocked_gemm.hpp"
+#include <cstring>
+
using namespace winograd;
/** Get the output shape of a convolution. */
@@ -243,8 +245,7 @@ WinogradGEMM<output_tile_rows, output_tile_cols, kernel_rows, kernel_cols>::Conv
tile_cols(iceildiv(output_shape.n_cols, output_tile_cols)),
M(input_shape.n_batches * tile_rows * tile_cols),
K(kernel_shape.n_input_channels),
- N(kernel_shape.n_output_channels),
- prof()
+ N(kernel_shape.n_output_channels)
{
// Create pointers to the kernel matrices
const int kernel_matrix_size_bytes = get_kernel_matrix_size(kernel_shape);
@@ -317,20 +318,12 @@ Convolution<TOut, TIn>::transform_weights(
kernel_hwio = reinterpret_cast<TIn *>(transform_working_space);
// Re-order the weights from OIHW to HWIO
- this->prof(
- "Weight reorder",
- [&kernel, &kernel_hwio, this] () {
- reorder::ofm_ifm_h_w_to_h_w_ifm_ofm(
- kernel, const_cast<TIn *>(kernel_hwio),
- kernel_shape.n_output_channels,
- kernel_shape.n_input_channels,
- kernel_shape.n_rows,
- kernel_shape.n_cols
- );
- },
- kernel_shape.size() * sizeof(TIn),
- 0,
- kernel_shape.size() * sizeof(TIn)
+ reorder::ofm_ifm_h_w_to_h_w_ifm_ofm(
+ kernel, const_cast<TIn *>(kernel_hwio),
+ kernel_shape.n_output_channels,
+ kernel_shape.n_input_channels,
+ kernel_shape.n_rows,
+ kernel_shape.n_cols
);
}
@@ -344,17 +337,7 @@ Convolution<TOut, TIn>::transform_weights(
);
// Transform the weights into the Winograd domain
- auto kernel_prep = [&] ()
- {
- weights_transform.run(0, weights_transform.get_window());
- };
-
- prof(
- "Kernel Prep", kernel_prep,
- WeightsTransformT::bytes_read(kernel_shape),
- WeightsTransformT::ops_performed(kernel_shape),
- WeightsTransformT::bytes_written(kernel_shape)
- );
+ weights_transform.run(0, weights_transform.get_window());
// Free memory if we allocated it
if (allocated_working_space)
@@ -419,18 +402,12 @@ Convolution<TOut, TIn>::execute(
ws_bytes + N_GEMMS*(in_matrix_stride_bytes + out_matrix_stride_bytes)
);
- this->prof(
- "NCHW -> NHWC",
- [input, input_shape, input_nhwc] () {
- reorder::nchw_to_nhwc(
- input, const_cast<TIn *>(input_nhwc),
- input_shape.n_batches,
- input_shape.n_channels,
- input_shape.n_rows,
- input_shape.n_cols
- );
- },
- input_shape.size(), 0, input_shape.size()
+ reorder::nchw_to_nhwc(
+ input, const_cast<TIn *>(input_nhwc),
+ input_shape.n_batches,
+ input_shape.n_channels,
+ input_shape.n_rows,
+ input_shape.n_cols
);
}
@@ -456,15 +433,7 @@ Convolution<TOut, TIn>::execute(
);
// Transform the input into the Winograd domain
- auto input_prep = [&] () {
- input_transform.run(0, input_transform.get_window());
- };
- prof(
- "Input Prep", input_prep,
- InputTransform<TIn>::bytes_read(input_shape),
- InputTransform<TIn>::ops_performed(input_shape),
- InputTransform<TIn>::bytes_written(input_shape)
- );
+ input_transform.run(0, input_transform.get_window());
// Perform the GEMMs
const int kernel_matrix_stride_bytes = get_kernel_matrix_size(kernel_shape);
@@ -482,8 +451,7 @@ Convolution<TOut, TIn>::execute(
);
for (unsigned int i = 0; i < gemms.get_window(); i++)
{
- auto run_gemm = [&] () { gemms.run(i, i+1); };
- prof("GEMM", run_gemm, 0, 0, 0);
+ gemms.run(i, i+1);
}
// If the output tensor needs to be in NCHW form then store the NHWC output
@@ -510,31 +478,17 @@ Convolution<TOut, TIn>::execute(
output_shape.n_cols,
output_shape.n_channels
);
- auto output_prep = [&] () {
- output_transform.run(0, output_transform.get_window());
- };
- prof(
- "Output Comp", output_prep,
- OutputTransform<TOut>::bytes_read(output_shape),
- OutputTransform<TOut>::ops_performed(output_shape),
- OutputTransform<TOut>::bytes_written(output_shape)
- );
+ output_transform.run(0, output_transform.get_window());
// Reorder the output tensor if it is required to be in NCHW form.
if (input_shape.ordering == NCHW)
{
- prof(
- "NHWC -> NCHW",
- [output_nhwc, output_shape, output] () {
- reorder::nhwc_to_nchw(
- output_nhwc, output,
- output_shape.n_batches,
- output_shape.n_rows,
- output_shape.n_cols,
- output_shape.n_channels
- );
- },
- output_shape.size(), 0, output_shape.size()
+ reorder::nhwc_to_nchw(
+ output_nhwc, output,
+ output_shape.n_batches,
+ output_shape.n_rows,
+ output_shape.n_cols,
+ output_shape.n_channels
);
}