/src/tesseract/src/arch/intsimdmatrix.cpp
Line | Count | Source |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: intsimdmatrix.cpp |
3 | | // Description: Base class for 8-bit int SIMD matrix multipliers. |
4 | | // Author: Ray Smith |
5 | | // |
6 | | // (C) Copyright 2017, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | /////////////////////////////////////////////////////////////////////// |
17 | | |
18 | | #include "intsimdmatrix.h" |
19 | | #include "matrix.h" // for GENERIC_2D_ARRAY |
20 | | #include "simddetect.h" // for SIMDDetect |
21 | | |
22 | | namespace tesseract { |
23 | | |
24 | | const IntSimdMatrix *IntSimdMatrix::intSimdMatrix = nullptr; |
25 | | |
26 | | // Computes a reshaped copy of the weight matrix w. |
27 | | void IntSimdMatrix::Init(const GENERIC_2D_ARRAY<int8_t> &w, std::vector<int8_t> &shaped_w, |
28 | 72 | int32_t &rounded_num_out) const { |
29 | 72 | const int num_out = w.dim1(); |
30 | 72 | const int num_in = w.dim2() - 1; |
31 | | // The rounded-up sizes of the reshaped weight matrix, excluding biases. |
32 | 72 | int rounded_num_in = Roundup(num_in, num_inputs_per_group_); |
33 | 72 | rounded_num_out = RoundOutputs(num_out); |
34 | | // Add the bias and compute the required size. |
35 | 72 | shaped_w.resize((rounded_num_in + 1) * rounded_num_out, 0); |
36 | 72 | int shaped_index = 0; |
37 | 72 | int output = 0; |
38 | | // Each number of registers needs a different format! Iterates over the |
39 | | // different numbers of registers (each a power of 2). |
40 | 360 | for (int num_registers = max_output_registers_; num_registers >= 1; num_registers /= 2) { |
41 | | // The number of outputs that we will generate with this many registers. |
42 | 288 | int num_outputs_per_register_set = num_registers * num_outputs_per_register_; |
43 | | // Use the max number of registers until we have to go fewer. |
44 | 512 | while (output + num_outputs_per_register_set <= rounded_num_out) { |
45 | | // Accumulating outputs in registers saves iterating over the inputs, so |
46 | | // we only have to do it once per output register set. |
47 | 24.3k | for (int input = 0; input < num_in; input += num_inputs_per_group_) { |
48 | | // Iterate over the number of outputs in a register set. |
49 | 1.48M | for (int j = 0; j < num_outputs_per_register_set; ++j) { |
50 | | // Inner-most loop corresponds to the number of inputs in an input |
51 | | // group. |
52 | 7.29M | for (int i = 0; i < num_inputs_per_group_; ++i) { |
53 | 5.83M | int8_t weight = 0; |
54 | 5.83M | if (output + j < num_out && input + i < num_in) { |
55 | 5.83M | weight = w(output + j, input + i); |
56 | 5.83M | } |
57 | 5.83M | shaped_w[shaped_index++] = weight; |
58 | 5.83M | } |
59 | 1.45M | } |
60 | 24.1k | } |
61 | | // Append the bias weights for the register set. |
62 | 13.0k | for (int j = 0; j < num_outputs_per_register_set; ++j) { |
63 | 12.8k | int8_t weight = 0; |
64 | 12.8k | if (output + j < num_out) { |
65 | 12.7k | weight = w(output + j, num_in); |
66 | 12.7k | } |
67 | 12.8k | shaped_w[shaped_index++] = weight; |
68 | 12.8k | } |
69 | 224 | output += num_outputs_per_register_set; |
70 | 224 | } |
71 | 288 | } |
72 | 72 | } |
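
As a minimal sketch of the sizing logic in Init() above: the shaped buffer holds rounded_num_out rows of rounded_num_in weights plus one bias weight each. The snippet assumes Roundup simply rounds its argument up to a multiple of the factor; the concrete group size and RoundOutputs() result are invented example values, not the class's real parameters.

// Sketch only, not part of the covered file. Roundup is a stand-in for
// the class helper; the sizes below are made-up example values.
#include <cstdio>

static int Roundup(int value, int factor) {
  return (value + factor - 1) / factor * factor;
}

int main() {
  const int num_in = 50;               // W.dim2() - 1
  const int num_inputs_per_group = 8;  // assumed inputs consumed per step
  const int rounded_num_out = 128;     // assumed result of RoundOutputs(num_out)

  const int rounded_num_in = Roundup(num_in, num_inputs_per_group);
  // The "+ 1" reserves the extra bias column, exactly as in Init().
  const int shaped_size = (rounded_num_in + 1) * rounded_num_out;
  std::printf("rounded_num_in=%d shaped_size=%d\n", rounded_num_in, shaped_size);
  return 0;
}
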
73 | | |
74 | | // Computes the matrix-vector product v = Wu. |
75 | | // u is of size W.dim2() - 1 and the output v is of size W.dim1(). |
76 | | // u is imagined to have an extra element at the end with value 1, to |
77 | | // implement the bias, but it doesn't actually have it. |
78 | | void IntSimdMatrix::MatrixDotVector(const GENERIC_2D_ARRAY<int8_t> &w, |
79 | 0 | const std::vector<TFloat> &scales, const int8_t *u, TFloat *v) { |
80 | 0 | int num_out = w.dim1(); |
81 | 0 | int num_in = w.dim2() - 1; |
82 | | // Base implementation. |
83 | 0 | int i; |
84 | | // Break up into chunks of four to facilitate vectorization |
85 | 0 | for (i = 0; i < (num_out / 4) * 4; i += 4) { |
86 | 0 | const int8_t *wi0 = w[i + 0]; |
87 | 0 | const int8_t *wi1 = w[i + 1]; |
88 | 0 | const int8_t *wi2 = w[i + 2]; |
89 | 0 | const int8_t *wi3 = w[i + 3]; |
90 | 0 | int total0 = 0; |
91 | 0 | int total1 = 0; |
92 | 0 | int total2 = 0; |
93 | 0 | int total3 = 0; |
94 | 0 | for (int j = 0; j < num_in; ++j) { |
95 | 0 | total0 += wi0[j] * u[j]; |
96 | 0 | total1 += wi1[j] * u[j]; |
97 | 0 | total2 += wi2[j] * u[j]; |
98 | 0 | total3 += wi3[j] * u[j]; |
99 | 0 | } |
100 | | // Add in the bias and correct for integer values. |
101 | 0 | v[i + 0] = (total0 + wi0[num_in] * INT8_MAX) * scales[i + 0]; |
102 | 0 | v[i + 1] = (total1 + wi1[num_in] * INT8_MAX) * scales[i + 1]; |
103 | 0 | v[i + 2] = (total2 + wi2[num_in] * INT8_MAX) * scales[i + 2]; |
104 | 0 | v[i + 3] = (total3 + wi3[num_in] * INT8_MAX) * scales[i + 3]; |
105 | 0 | } |
106 | | |
107 | | // Capture the remainder mod four |
108 | 0 | for (; i < num_out; ++i) { |
109 | 0 | const int8_t *wi = w[i]; |
110 | 0 | int total = 0; |
111 | 0 | for (int j = 0; j < num_in; ++j) { |
112 | 0 | total += wi[j] * u[j]; |
113 | 0 | } |
114 | | // Add in the bias and correct for integer values. |
115 | 0 | v[i] = (total + wi[num_in] * INT8_MAX) * scales[i]; |
116 | 0 | } |
117 | 0 | } |
118 | | |
119 | | } // namespace tesseract |
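
For a single output row, the generic MatrixDotVector() above computes v[i] = (sum_j w(i, j) * u[j] + w(i, num_in) * INT8_MAX) * scales[i], with the bias stored in the extra last column of each row. Below is a minimal standalone sketch of that per-row arithmetic, assuming TFloat is double (Tesseract selects float or double at build time) and using a hypothetical DotRowWithBias helper that is not part of the library.

// Sketch only, not part of the covered file: the per-row arithmetic of
// the generic MatrixDotVector() reference implementation.
#include <cstdint>
#include <cstdio>

using TFloat = double;  // assumption; the real TFloat may be float

// Hypothetical helper mirroring the inner loop of MatrixDotVector().
static TFloat DotRowWithBias(const int8_t *wi, const int8_t *u, int num_in,
                             TFloat scale) {
  int total = 0;
  for (int j = 0; j < num_in; ++j) {
    total += wi[j] * u[j];
  }
  // Add in the bias from the extra column; the implicit trailing input of
  // value 1 appears at int8 scale, i.e. as INT8_MAX, as in the base code.
  return (total + wi[num_in] * INT8_MAX) * scale;
}

int main() {
  // Tiny example: one output row with two real inputs plus a trailing bias.
  const int8_t w_row[] = {2, -3, 1};  // weights..., bias
  const int8_t u[] = {10, 5};         // inputs; the implicit trailing 1 is not stored
  std::printf("v[0] = %g\n", DotRowWithBias(w_row, u, 2, 0.01));
  return 0;
}
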