/src/tesseract/src/arch/intsimdmatrixsse.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /////////////////////////////////////////////////////////////////////// |
2 | | // File: intsimdmatrixsse.cpp |
3 | | // Description: SSE implementation of 8-bit int SIMD matrix multiply. |
4 | | // Author: Ray Smith |
5 | | // |
6 | | // (C) Copyright 2017, Google Inc. |
7 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
8 | | // you may not use this file except in compliance with the License. |
9 | | // You may obtain a copy of the License at |
10 | | // http://www.apache.org/licenses/LICENSE-2.0 |
11 | | // Unless required by applicable law or agreed to in writing, software |
12 | | // distributed under the License is distributed on an "AS IS" BASIS, |
13 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
14 | | // See the License for the specific language governing permissions and |
15 | | // limitations under the License. |
16 | | /////////////////////////////////////////////////////////////////////// |
17 | | |
18 | | #if !defined(__SSE4_1__) |
19 | | # if defined(__i686__) || defined(__x86_64__) |
20 | | # error Implementation only for SSE 4.1 capable architectures |
21 | | # endif |
22 | | #else |
23 | | |
24 | | # include "intsimdmatrix.h" |
25 | | |
26 | | # include <emmintrin.h> |
27 | | # include <smmintrin.h> |
28 | | # include <cstdint> |
29 | | |
30 | | namespace tesseract { |
31 | | |
32 | | // Computes and returns the dot product of the n-vectors u and v. |
33 | | // Uses Intel SSE intrinsics to access the SIMD instruction set. |
34 | 0 | static int32_t IntDotProductSSE(const int8_t *u, const int8_t *v, int n) { |
35 | 0 | int max_offset = n - 8; |
36 | 0 | int offset = 0; |
37 | | // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit |
38 | | // values, extending to 16 bit, multiplying to make 32 bit results. |
39 | 0 | int32_t result = 0; |
40 | 0 | if (offset <= max_offset) { |
41 | 0 | offset = 8; |
42 | 0 | __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(u)); |
43 | 0 | __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(v)); |
44 | 0 | __m128i sum = _mm_cvtepi8_epi16(packed1); |
45 | 0 | packed2 = _mm_cvtepi8_epi16(packed2); |
46 | | // The magic _mm_add_epi16 is perfect here. It multiplies 8 pairs of 16 bit |
47 | | // ints to make 32 bit results, which are then horizontally added in pairs |
48 | | // to make 4 32 bit results that still fit in a 128 bit register. |
49 | 0 | sum = _mm_madd_epi16(sum, packed2); |
50 | 0 | while (offset <= max_offset) { |
51 | 0 | packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(u + offset)); |
52 | 0 | packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(v + offset)); |
53 | 0 | offset += 8; |
54 | 0 | packed1 = _mm_cvtepi8_epi16(packed1); |
55 | 0 | packed2 = _mm_cvtepi8_epi16(packed2); |
56 | 0 | packed1 = _mm_madd_epi16(packed1, packed2); |
57 | 0 | sum = _mm_add_epi32(sum, packed1); |
58 | 0 | } |
59 | | // Sum the 4 packed 32 bit sums and extract the low result. |
60 | 0 | sum = _mm_hadd_epi32(sum, sum); |
61 | 0 | sum = _mm_hadd_epi32(sum, sum); |
62 | 0 | result = _mm_cvtsi128_si32(sum); |
63 | 0 | } |
64 | 0 | while (offset < n) { |
65 | 0 | result += u[offset] * v[offset]; |
66 | 0 | ++offset; |
67 | 0 | } |
68 | 0 | return result; |
69 | 0 | } |
70 | | |
71 | | // Computes part of matrix.vector v = Wu. Computes 1 result. |
72 | | static void PartialMatrixDotVector1(const int8_t *wi, const TFloat *scales, const int8_t *u, |
73 | 0 | int num_in, TFloat *v) { |
74 | 0 | TFloat total = IntDotProductSSE(u, wi, num_in); |
75 | | // Add in the bias and correct for integer values. |
76 | 0 | *v = (total + wi[num_in] * INT8_MAX) * *scales; |
77 | 0 | } |
78 | | |
79 | | static void matrixDotVector(int dim1, int dim2, const int8_t *wi, const TFloat *scales, |
80 | 0 | const int8_t *u, TFloat *v) { |
81 | 0 | const int num_out = dim1; |
82 | 0 | const int num_in = dim2 - 1; |
83 | 0 | int output = 0; |
84 | |
|
85 | 0 | for (; output < num_out; output++) { |
86 | 0 | PartialMatrixDotVector1(wi, scales, u, num_in, v); |
87 | 0 | wi += dim2; |
88 | 0 | scales++; |
89 | 0 | v++; |
90 | 0 | } |
91 | 0 | } |
92 | | |
93 | | const IntSimdMatrix IntSimdMatrix::intSimdMatrixSSE = { |
94 | | matrixDotVector, |
95 | | // Number of 32 bit outputs held in each register. |
96 | | 1, |
97 | | // Maximum number of registers that we will use to hold outputs. |
98 | | 1, |
99 | | // Number of 8 bit inputs in the inputs register. |
100 | | 1, |
101 | | // Number of inputs in each weight group. |
102 | | 1 |
103 | | }; |
104 | | |
105 | | } // namespace tesseract. |
106 | | |
107 | | #endif |