/src/tesseract/src/arch/intsimdmatrix.h
Line | Count | Source |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: intsimdmatrix.h |
3 | | // Description: Base class for 8-bit int SIMD matrix multipliers. |
4 | | // Author: Ray Smith |
5 | | // |
6 | | // (C) Copyright 2017, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | /////////////////////////////////////////////////////////////////////// |
17 | | |
18 | | #ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_ |
19 | | #define TESSERACT_ARCH_INTSIMDMATRIX_H_ |
20 | | |
21 | | #include <tesseract/export.h> |
22 | | |
23 | | #include <cstdint> |
24 | | #include <vector> |
25 | | |
26 | | #include "tesstypes.h" |
27 | | |
28 | | namespace tesseract { |
29 | | |
30 | | template <class T> |
31 | | class GENERIC_2D_ARRAY; |
32 | | |
33 | | // Base class for a SIMD function to multiply a matrix by a vector, with sources |
34 | | // of 8-bit signed integer, and result in a TFloat, after appropriate scaling. |
35 | | // Assumes a specific method of multiplication that can be applied to any size |
36 | | // and number of SIMD registers as follows: |
37 | | // int32_t results are computed with num_outputs_per_register_ in each of |
38 | | // max_output_registers_ result registers, repeatedly until it would make too |
39 | | // many results, then the number of registers is halved, and so on down to a |
40 | | // single result register. The last calculation only outputs the required number |
41 | | // of results instead of writing beyond the bounds. Eg: matrix has 75 outputs, |
42 | | // num_outputs_per_register_ = 4, and max_output_registers_ = 8, |
43 | | // Step 1: 8x4=32 results are computed, |
44 | | // Step 2: 8x4=32 again, total 64, |
45 | | // Step 3: 2x4=8 (since 8x4 is too many, so is 4x4), total 72, |
46 | | // Step 4: 1x3, total 75. |
47 | | // Each step above is computed using a PartialFunc, which runs over the input |
48 | | // vector once. The input is read one registerful of num_inputs_per_register_ |
49 | | // at a time (presumably 4x num_outputs_per_register_ since they are int8_t) |
50 | | // so the inputs MUST BE PADDED to a multiple of num_inputs_per_register_. |
51 | | // Since it is slow (on Intel at least) to horizontally add in a register, |
52 | | // provision is made to process num_inputs_per_group_ inputs at a time, with |
53 | | // the group being replicated num_input_groups_ times and multiplied by a |
54 | | // num_inputs_per_group_ by num_input_groups_ rectangle of the weights matrix. |
55 | | // This is most convenient if num_inputs_per_group_ is 4, and the product |
56 | | // sign-extends and sums 8x8=16 bit results to 32 bits, adding 4 adjacent |
57 | | // results in the process, but it doesn't have to be implemented that way. |
58 | | // The weights are re-ordered by Init() to be used sequentially by the above |
59 | | // algorithm, followed by the biases, so they can be added at the end. |
60 | | // The base class computes the base C++ implementation. |
61 | | // NOTE that, although the subclasses execute on different SIMD hardware, no |
62 | | // virtual methods are needed, as the constructor sets up everything that |
63 | | // is required to allow the base class implementation to do all the work. |
64 | | struct TESS_API IntSimdMatrix { |
65 | | // Computes a reshaped copy of the weight matrix w. |
66 | | void Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t> &shaped_w, |
67 | | int32_t &rounded_num_out) const; |
68 | | |
69 | | // Rounds the size up to a multiple of the input register size (in int8_t). |
70 | 5.43M | int RoundInputs(int size) const { |
71 | 5.43M | return Roundup(size, num_inputs_per_register_); |
72 | 5.43M | } |
73 | | // Rounds the size up to a multiple of the output register size (in int32_t). |
74 | 1.63M | int RoundOutputs(int size) const { |
75 | 1.63M | return Roundup(size, num_outputs_per_register_); |
76 | 1.63M | } |
77 | | |
78 | | // Computes matrix.vector v = Wu. |
79 | | // u is of size W.dim2() - 1 and the output v is of size W.dim1(). |
80 | | // u is imagined to have an extra element at the end with value 1, to |
81 | | // implement the bias, but it doesn't actually have it. |
82 | | // Computes the base C++ implementation. |
83 | | static void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, const std::vector<TFloat> &scales, |
84 | | const int8_t *u, TFloat *v); |
85 | | |
86 | | // Rounds the input up to a multiple of the given factor. |
87 | 1.21G | static int Roundup(int input, int factor) { |
88 | 1.21G | return (input + factor - 1) / factor * factor; |
89 | 1.21G | } |
90 | | |
91 | | // Computes matrix.vector v = Wu. |
92 | | // u is of size W.dim2() - 1 and the output v is of size W.dim1(). |
93 | | // u is imagined to have an extra element at the end with value 1, to |
94 | | // implement the bias, but it doesn't actually have it. |
95 | | // Uses an optimized implementation with partial funcs. |
96 | | // NOTE: The size of the input vector (u) must be padded using |
97 | | // RoundInputs above. |
98 | | // The input will be over-read to the extent of the padding. There are no |
99 | | // alignment requirements. |
100 | | using MatrixDotVectorFunction = void (*)(int, int, const int8_t *, const TFloat *, const int8_t *, |
101 | | TFloat *); |
102 | | MatrixDotVectorFunction matrixDotVectorFunction; |
103 | | |
104 | | // Number of 32 bit outputs held in each register. |
105 | | int num_outputs_per_register_; |
106 | | // Maximum number of registers that we will use to hold outputs. |
107 | | int max_output_registers_; |
108 | | // Number of 8 bit inputs in the inputs register. |
109 | | int num_inputs_per_register_; |
110 | | // Number of inputs in each weight group. |
111 | | int num_inputs_per_group_; |
112 | | // Number of groups of inputs to be broadcast. |
113 | | // num_input_groups_ = num_inputs_per_register_ / num_inputs_per_group_ |
114 | | |
115 | | static const IntSimdMatrix *intSimdMatrix; |
116 | | // Only available with NEON. |
117 | | static const IntSimdMatrix intSimdMatrixNEON; |
118 | | // Only available with RVV. |
119 | | static const IntSimdMatrix intSimdMatrixRVV; |
120 | | // Only available with AVX2 / AVX / FMA / SSE. |
121 | | static const IntSimdMatrix intSimdMatrixAVX2; |
122 | | static const IntSimdMatrix intSimdMatrixSSE; |
123 | | }; |
124 | | |
125 | | } // namespace tesseract |
126 | | |
127 | | #endif // TESSERACT_ARCH_INTSIMDMATRIX_H_ |