/src/wireshark/wsutil/ws_mempbrk_sse42.c
Line | Count | Source |
1 | | /* strcspn with SSE4.2 intrinsics |
2 | | Copyright (C) 2009-2014 Free Software Foundation, Inc. |
3 | | Contributed by Intel Corporation. |
4 | | This file is part of the GNU C Library. |
5 | | |
6 | | SPDX-License-Identifier: LGPL-2.1-or-later |
7 | | */ |
8 | | |
9 | | |
10 | | #include "config.h" |
11 | | |
12 | | #ifdef HAVE_SSE4_2 |
13 | | |
14 | | #include <glib.h> |
15 | | #include "ws_cpuid.h" |
16 | | |
17 | | #ifdef _WIN32 |
18 | | #include <tmmintrin.h> |
19 | | #endif |
20 | | |
21 | | #include <nmmintrin.h> |
22 | | #include <string.h> |
23 | | #include "ws_mempbrk.h" |
24 | | #include "ws_mempbrk_int.h" |
25 | | |
26 | | /* __has_feature(address_sanitizer) is used later for Clang, this is for |
27 | | * compatibility with other compilers (such as GCC and MSVC) */ |
28 | | #ifndef __has_feature |
29 | | # define __has_feature(x) 0 |
30 | | #endif |
31 | | |
/* Reinterpret an arbitrary object pointer as a pointer to const __m128i.
   The macro performs no alignment check: callers either pass a pointer they
   have already rounded down to a 16-byte boundary, or hand the result to an
   unaligned-load intrinsic.  The intermediate (const void *) step is
   presumably there to keep cast-alignment warnings quiet. */
#define cast_128aligned__m128i(ptr) ((const __m128i *) (const void *) (ptr))
33 | | |
34 | | /* Helper for variable shifts of SSE registers. |
35 | | Copyright (C) 2010 Free Software Foundation, Inc. |
36 | | */ |
37 | | |
/* Shuffle-control table for byte-wise right shifts of an SSE register.
   Any 16-byte window starting at index N (0 <= N <= 15) contains the lane
   indices N..15 followed by N entries of -1. */
static const int8_t ___m128i_shift_right[31] =
  {
    /* lane indices 0..15 */
    0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
    0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf,
    /* fifteen filler entries */
    -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1
  };
43 | | |
44 | | static inline __m128i |
45 | | __m128i_shift_right (__m128i value, unsigned long int offset) |
46 | 28.6k | { |
47 | | /* _mm_loadu_si128() works with unaligned data, cast safe */ |
48 | 28.6k | return _mm_shuffle_epi8 (value, |
49 | 28.6k | _mm_loadu_si128 (cast_128aligned__m128i(___m128i_shift_right + offset))); |
50 | 28.6k | } |
51 | | |
52 | | |
53 | | void |
54 | | ws_mempbrk_sse42_compile(ws_mempbrk_pattern* pattern, const char *needles) |
55 | 287 | { |
56 | 287 | size_t length = strlen(needles); |
57 | | |
58 | 287 | pattern->use_sse42 = ws_cpuid_sse42() && (length <= 16); |
59 | | |
60 | 287 | if (pattern->use_sse42) { |
61 | 273 | pattern->mask = _mm_setzero_si128(); |
62 | 273 | memcpy(&(pattern->mask), needles, length); |
63 | 273 | } |
64 | 287 | } |
65 | | |
66 | | /* We use 0x2: |
67 | | _SIDD_SBYTE_OPS |
68 | | | _SIDD_CMP_EQUAL_ANY |
69 | | | _SIDD_POSITIVE_POLARITY |
70 | | | _SIDD_LEAST_SIGNIFICANT |
71 | | on pcmpistri to compare xmm/mem128 |
72 | | |
73 | | 0 1 2 3 4 5 6 7 8 9 A B C D E F |
74 | | X X X X X X X X X X X X X X X X |
75 | | |
76 | | against xmm |
77 | | |
78 | | 0 1 2 3 4 5 6 7 8 9 A B C D E F |
79 | | A A A A A A A A A A A A A A A A |
80 | | |
81 | | to find out if the first 16byte data element has any byte A and |
82 | | the offset of the first byte. There are 3 cases: |
83 | | |
84 | | 1. The first 16byte data element has the byte A at the offset X. |
85 | | 2. The first 16byte data element has EOS and doesn't have the byte A. |
86 | | 3. The first 16byte data element is valid and doesn't have the byte A. |
87 | | |
88 | | Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases: |
89 | | |
90 | | 1 X 1 0/1 0 |
91 | | 2 16 0 1 0 |
92 | | 3 16 0 0 0 |
93 | | |
94 | | We exit from the loop for cases 1 and 2 with jbe which branches |
95 | | when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset |
96 | | X for case 1. */ |
97 | | |
98 | | const char * |
99 | | ws_mempbrk_sse42_exec(const char *haystack, size_t haystacklen, const ws_mempbrk_pattern* pattern, unsigned char *found_needle) |
100 | 31.6k | { |
101 | 31.6k | const char *aligned; |
102 | 31.6k | int offset; |
103 | | |
104 | 31.6k | offset = (int) ((size_t) haystack & 15); |
105 | 31.6k | aligned = (const char *) ((size_t) haystack & -16L); |
106 | 31.6k | if (offset != 0) |
107 | 28.6k | { |
108 | | /* Check partial string. cast safe it's 16B aligned */ |
109 | 28.6k | __m128i value = __m128i_shift_right (_mm_load_si128 (cast_128aligned__m128i(aligned)), offset); |
110 | | |
111 | 28.6k | int length = _mm_cmpistri (pattern->mask, value, 0x2); |
112 | | /* No need to check ZFlag since ZFlag is always 1. */ |
113 | 28.6k | int cflag = _mm_cmpistrc (pattern->mask, value, 0x2); |
114 | | /* XXX: why does this compare value with value? */ |
115 | 28.6k | int idx = _mm_cmpistri (value, value, 0x3a); |
116 | | |
117 | 28.6k | if (cflag) { |
118 | 15.7k | if (found_needle) |
119 | 15.5k | *found_needle = *(haystack + length); |
120 | 15.7k | return haystack + length; |
121 | 15.7k | } |
122 | | |
123 | | /* Find where the NULL terminator is. */ |
124 | 12.9k | if (idx < 16 - offset) |
125 | 6.19k | { |
126 | | /* found NUL @ 'idx', need to switch to slower mempbrk */ |
127 | 6.19k | return ws_mempbrk_portable_exec(haystack + idx + 1, haystacklen - idx - 1, pattern, found_needle); /* haystacklen is bigger than 16 & idx < 16 so no underflow here */ |
128 | 6.19k | } |
129 | 6.75k | aligned += 16; |
130 | 6.75k | haystacklen -= (16 - offset); |
131 | 6.75k | } |
132 | 2.92k | else |
133 | 2.92k | aligned = haystack; |
134 | | |
135 | 14.9k | while (haystacklen >= 16) |
136 | 13.9k | { |
137 | 13.9k | __m128i value = _mm_load_si128 (cast_128aligned__m128i(aligned)); |
138 | 13.9k | int idx = _mm_cmpistri (pattern->mask, value, 0x2); |
139 | 13.9k | int cflag = _mm_cmpistrc (pattern->mask, value, 0x2); |
140 | 13.9k | int zflag = _mm_cmpistrz (pattern->mask, value, 0x2); |
141 | | |
142 | 13.9k | if (cflag) { |
143 | 3.21k | if (found_needle) |
144 | 3.15k | *found_needle = *(aligned + idx); |
145 | 3.21k | return aligned + idx; |
146 | 3.21k | } |
147 | | |
148 | 10.6k | if (zflag) |
149 | 5.44k | { |
150 | | /* found NUL, need to switch to slower mempbrk */ |
151 | 5.44k | return ws_mempbrk_portable_exec(aligned, haystacklen, pattern, found_needle); |
152 | 5.44k | } |
153 | 5.25k | aligned += 16; |
154 | 5.25k | haystacklen -= 16; |
155 | 5.25k | } |
156 | | |
157 | | /* XXX, use mempbrk_slow here? */ |
158 | 1.02k | return ws_mempbrk_portable_exec(aligned, haystacklen, pattern, found_needle); |
159 | 9.68k | } |
160 | | |
161 | | #endif /* HAVE_SSE4_2 */ |
162 | | /* |
163 | | * Editor modelines |
164 | | * |
165 | | * Local Variables: |
166 | | * c-basic-offset: 2 |
167 | | * tab-width: 8 |
168 | | * indent-tabs-mode: nil |
169 | | * End: |
170 | | * |
171 | | * ex: set shiftwidth=2 tabstop=8 expandtab: |
172 | | * :indentSize=2:tabSize=8:noTabs=true: |
173 | | */ |