From 3d4968ac573cc206ac1c6adcfd6f1d4689a715d1 Mon Sep 17 00:00:00 2001 From: Pablo Tello Date: Mon, 4 Dec 2017 15:03:35 +0000 Subject: COMPMID-687: Winograd refactoring Moved the headers into src/ Added pimpl pattern Change-Id: I227f8b47468d8e14875d710aac8de5eb09463e2a Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111765 Reviewed-by: Anthony Barbier Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com --- .../kernels/winograd/transforms/output_2x2_3x3.hpp | 356 --------------------- 1 file changed, 356 deletions(-) delete mode 100644 arm_compute/core/NEON/kernels/winograd/transforms/output_2x2_3x3.hpp (limited to 'arm_compute/core/NEON/kernels/winograd/transforms/output_2x2_3x3.hpp') diff --git a/arm_compute/core/NEON/kernels/winograd/transforms/output_2x2_3x3.hpp b/arm_compute/core/NEON/kernels/winograd/transforms/output_2x2_3x3.hpp deleted file mode 100644 index 0992c0bb44..0000000000 --- a/arm_compute/core/NEON/kernels/winograd/transforms/output_2x2_3x3.hpp +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (c) 2017 ARM Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#pragma once - -namespace winograd { - /* Transform from the Winograd domain back to the spatial domain. - */ - template - struct Winograd2x2_3x3GemmOutput { - static void execute( - const Tensor4DShape &output_shape, - T* const matrix_base, - const int matrix_stride, - const int matrix_row_stride, - T* const output - ); - - protected: - /* Specialised implementation method. */ - template - static void _execute( - const Tensor4DShape &output_shape, - T *output, - const T *input, - const int matrix_stride, - const int matrix_row_stride - ); - }; - - /* Two-stage implementation of the transformation from the Winograd domain. - * - * First computes Z.F and then computes (Z.F).Z^T. - */ - template - struct Winograd2x2_3x3GemmOutput_TwoStage { - static void execute( - const Tensor4DShape &output_shape, - T* const matrix_base, - const int matrix_stride, - const int matrix_row_stride, - T* const output - ); - - protected: - template - static void compute_zf( - const int n_rows, const int n_channels, - T* const zf, const T* const input[16] - ); - - template - static void compute_zfzT( - const Tensor4DShape &output_shape, - T* const output, const T* const zf - ); - }; -} - -#include "output_2x2_3x3/a64_float.hpp" -// #include "output_2x2_3x3/a64_float_two_stage.hpp" - -/*****************************************************************************/ -/* -template -void winograd::Winograd2x2_3x3GemmOutput::execute( - const Tensor4DShape &output_shape, - const int tile_M, - const int tile_N, - T* const matrix_base, - const int matrix_stride, - const int matrix_row_stride, - T* const output -) { - T* const antipadding = reinterpret_cast(malloc(sizeof(T) * output_shape.n_channels)); - - // Get input pointers - const T* inptrs[16]; - for (int i = 0; i < 16; i++) { - inptrs[i] = matrices[i]; - } - - for (int batch = 0; batch < output_shape.n_batches; batch++) { - for (int tile_i = 0; tile_i < tile_M; tile_i++) { - for (int tile_j = 0; tile_j < tile_N; tile_j++) { - // Get pointers for each of the 4 output cells required for this computation - T* outptrs[4]; - for (int cell_i = 0, c = 0; cell_i < 2; cell_i++) { - for (int cell_j = 0; cell_j < 2; cell_j++, c++) { - const int i = tile_i*2 + cell_i; - const int j = tile_j*2 + cell_j; - - if (i < output_shape.n_rows && j < output_shape.n_cols) { - outptrs[c] = output + ( - (batch*output_shape.n_rows + i) * output_shape.n_cols + - j) * output_shape.n_channels; - } else { - outptrs[c] = antipadding; - } - } // cell_j - } // cell_i - - for (int n = 0; n < output_shape.n_channels; n++) { - // Read 16 values and progress pointers - T v[16]; - for (int i = 0; i < 16; i++) { - v[i] = *(inptrs[i]++); - } - - // Compute output for 4 pixels - *(outptrs[0]++) = v[ 0] + v[ 1] + v[ 2] + - v[ 4] + v[ 5] + v[ 6] + - v[ 8] + v[ 9] + v[10]; - *(outptrs[1]++) = v[ 1] - v[ 2] - v[ 3] + - v[ 5] - v[ 6] - v[ 7] + - v[ 9] - v[10] - v[11]; - *(outptrs[2]++) = v[ 4] + v[ 5] + v[ 6] - - v[ 8] - v[ 9] - v[10] - - v[12] - v[13] - v[14]; - *(outptrs[3]++) = v[ 5] - v[ 6] - v[ 7] - - v[ 9] + v[10] + v[11] - - v[13] + v[14] + v[15]; - } // output_channel - } // tile_j - } // tile_i - } // batch - - free(antipadding); -} -*/ - -/*****************************************************************************/ -/* -template -void winograd::Winograd2x2_3x3GemmOutput_TwoStage::execute( - const Tensor4DShape &output_shape, - T* const matrices[16], T* const output -) { - // Allocate memory for the intermediate matrices - const int tile_M = iceildiv(output_shape.n_rows, 2); - const int tile_N = iceildiv(output_shape.n_cols, 2); - const int n_rows = output_shape.n_batches * tile_M * tile_N; - const int n_channels = output_shape.n_channels; - T* matrices_zf = reinterpret_cast( - calloc(8 * n_rows * n_channels, sizeof(T)) - ); - - // Perform the first stage transform, computing ZF. - // Specializations should dispatch to different methods based on tail size. - compute_zf<0>(n_rows, n_channels, matrices_zf, matrices); - - // Perform the second stage transform, finishing Z F Z^T - variable dispatch - // based on size of the output. Specialisations can also dispatch based on - // the tail-size of the channel. - if (output_shape.n_rows % 2 && output_shape.n_cols % 2) { - compute_zfzT(output_shape, output, matrices_zf); - } else if (output_shape.n_rows % 2) { - compute_zfzT(output_shape, output, matrices_zf); - } else if (output_shape.n_cols % 2) { - compute_zfzT(output_shape, output, matrices_zf); - } else { - compute_zfzT(output_shape, output, matrices_zf); - } - - free(reinterpret_cast(matrices_zf)); -} - -template -template -void winograd::Winograd2x2_3x3GemmOutput_TwoStage::compute_zf( - const int n_rows, const int n_channels, - T* output, const T* const input[16] -) { - // Extract 8 output pointers - T* outptr[8]; - for (int i = 0; i < 8; i++) { - outptr[i] = output + i*n_rows*n_channels; - } - - // Copy the 16 input pointers - const T* inptr[16]; - for (int i = 0; i < 16; i++) { - inptr[i] = input[i]; - } - - // For every row of the matrices - for (int i = 0; i < n_rows; i++) { - // For every channel - for (int j = 0; j < n_channels; j++) { - // Extract values from the input matrices - T val[16]; - for (int n = 0; n < 16; n++) { - val[n] = *(inptr[n]++); - } - - // Compute output values - *(outptr[0]++) = val[0] + val[1] + val[2]; - *(outptr[1]++) = val[1] - val[2] - val[3]; - *(outptr[2]++) = val[4] + val[5] + val[6]; - *(outptr[3]++) = val[5] - val[6] - val[7]; - *(outptr[4]++) = val[8] + val[9] + val[10]; - *(outptr[5]++) = val[9] - val[10] - val[11]; - *(outptr[6]++) = val[12] + val[13] + val[14]; - *(outptr[7]++) = val[13] - val[14] - val[15]; - } - } -} - -template -template -void winograd::Winograd2x2_3x3GemmOutput_TwoStage::compute_zfzT( - const Tensor4DShape &output_shape, - T* const output, const T* const input -) { - // Sizing information - const int tile_M = output_shape.n_rows / 2; - const int tile_N = output_shape.n_cols / 2; - - const int n_rows = (output_shape.n_batches * - (tile_M + (tail_M ? 1 : 0)) * - (tile_N + (tail_N ? 1 : 0))); - const int n_channels = output_shape.n_channels; - - // Extract 8 input pointers - const T* inptr[8]; - for (int i = 0; i < 8; i++) { - inptr[i] = input + i*n_rows*n_channels; - } - - // Extract 4 output pointers - T* outptr00 = output; - T* outptr01 = outptr00 + n_channels; - T* outptr10 = outptr00 + output_shape.n_cols * n_channels; - T* outptr11 = outptr10 + n_channels; - - // Progress over the output tiles, generating output values. - for (int batch = 0; batch < output_shape.n_batches; batch++) { - for (int tile_i = 0; tile_i < tile_M; tile_i++) { - for (int tile_j = 0; tile_j < tile_N; tile_j++) { - for (int channel = 0; channel < n_channels; channel++) { - // Read values from the input pointers - T v[8]; - for (int i = 0; i < 8; i++) { - v[i] = *(inptr[i]++); - } - - // Compute the output values and progress the output pointers. - *(outptr00++) = v[0] + v[2] + v[4]; - *(outptr01++) = v[1] + v[3] + v[5]; - *(outptr10++) = v[2] - v[4] - v[6]; - *(outptr11++) = v[3] - v[5] - v[7]; - } - - // Progress the output pointers to the next column - outptr00 += n_channels; - outptr01 += n_channels; - outptr10 += n_channels; - outptr11 += n_channels; - } - - if (tail_N) { - // Only evaluate the left-most columns of the output - for (int channel = 0; channel < n_channels; channel++) { - // Read values from the input pointers - T v[8]; - for (int i = 0; i < 4; i++) { - v[i * 2] = *inptr[i * 2]; - } - for (int i = 0; i < 8; i++) { - inptr[i]++; - } - - // Compute the output values and progress the output pointers. - *(outptr00++) = v[0] + v[2] + v[4]; - *(outptr10++) = v[2] - v[4] - v[6]; - } - - // Progress the output pointers to the next column - outptr01 += n_channels; // Account for being skipped above - outptr11 += n_channels; // Account for being skipped above - } - - // Progress the output pointers to the next row - outptr00 += output_shape.n_cols * n_channels; - outptr01 += output_shape.n_cols * n_channels; - outptr10 += output_shape.n_cols * n_channels; - outptr11 += output_shape.n_cols * n_channels; - } - - if (tail_M) { - // Only work on the upper row of the output - for (int tile_j = 0; tile_j < tile_N; tile_j++) { - for (int channel = 0; channel < n_channels; channel++) { - // Read values from the input pointers - T v[8]; - for (int i = 0; i < 8; i++) { - v[i] = *(inptr[i]++); - } - - // Compute the output values and progress the output pointers. - *(outptr00++) = v[0] + v[2] + v[4]; - *(outptr01++) = v[1] + v[3] + v[5]; - } - - // Progress the output pointers to the next column - outptr00 += n_channels; - outptr01 += n_channels; - outptr10 += 2 * n_channels; // Account for being skipped above - outptr11 += 2 * n_channels; // Account for being skipped above - } - - if (tail_N) { - // Only evaluate the upper-left cell of the output - for (int channel = 0; channel < n_channels; channel++) { - // Read values from the input pointers - T v[8]; - for (int i = 0; i < 3; i++) { - v[i * 2] = *inptr[i * 2]; - } - for (int i = 0; i < 8; i++) { - inptr[i]++; - } - - // Compute the output values and progress the output pointers. - *(outptr00++) = v[0] + v[2] + v[4]; - } - - // Progress the output pointers to the next column - outptr01 += n_channels; // Account for being skipped above - outptr10 += n_channels; // Account for being skipped above - outptr11 += n_channels; // Account for being skipped above - } - } - } -} -*/ -- cgit v1.2.1