/src/boost/boost/json/detail/sse2.hpp
Line | Count | Source (jump to first uncovered line) |
1 | | // |
2 | | // Copyright (c) 2019 Peter Dimov (pdimov at gmail dot com), |
3 | | // Vinnie Falco (vinnie.falco@gmail.com) |
4 | | // Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com) |
5 | | // |
6 | | // Distributed under the Boost Software License, Version 1.0. (See accompanying |
7 | | // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
8 | | // |
9 | | // Official repository: https://github.com/boostorg/json |
10 | | // |
11 | | |
12 | | #ifndef BOOST_JSON_DETAIL_SSE2_HPP |
13 | | #define BOOST_JSON_DETAIL_SSE2_HPP |
14 | | |
15 | | #include <boost/json/detail/config.hpp> |
16 | | #include <boost/json/detail/utf8.hpp> |
17 | | #include <cstddef> |
18 | | #include <cstring> |
19 | | #ifdef BOOST_JSON_USE_SSE2 |
20 | | # include <emmintrin.h> |
21 | | # include <xmmintrin.h> |
22 | | # ifdef _MSC_VER |
23 | | # include <intrin.h> |
24 | | # endif |
25 | | #endif |
26 | | |
27 | | namespace boost { |
28 | | namespace json { |
29 | | namespace detail { |
30 | | |
31 | | #ifdef BOOST_JSON_USE_SSE2 |
32 | | |
// Scans [p, end) for the first byte that ends a run of plain string
// characters: a double quote, a backslash, or a control character (< 0x20).
// Bytes >= 0x80 are stepped over without any UTF-8 validation.
// Returns a pointer to that byte, or `end` if no such byte exists.
template<bool AllowBadUTF8>
inline
const char*
count_valid(
    char const* p,
    const char* end) noexcept
{
    __m128i const quote     = _mm_set1_epi8( '\x22' ); // '"'
    __m128i const backslash = _mm_set1_epi8( '\\' );
    __m128i const ctrl_max  = _mm_set1_epi8( 0x1F );

    // Vector path: examine 16 bytes per iteration while a full block remains.
    for( ; end - p >= 16; p += 16 )
    {
        __m128i const chunk = _mm_loadu_si128(
            reinterpret_cast<__m128i const*>( p ) );

        __m128i const is_escape = _mm_or_si128(
            _mm_cmpeq_epi8( chunk, quote ),
            _mm_cmpeq_epi8( chunk, backslash ) );

        // min(x, 0x1F) == x holds exactly for unsigned bytes <= 0x1F
        __m128i const is_control = _mm_cmpeq_epi8(
            _mm_min_epu8( chunk, ctrl_max ), chunk );

        int const mask = _mm_movemask_epi8(
            _mm_or_si128( is_escape, is_control ) );

        if( mask == 0 )
            continue;

        // The lowest set bit marks the first stopping byte in the block.
#if defined(__GNUC__) || defined(__clang__)
        return p + ( __builtin_ffs( mask ) - 1 );
#else
        unsigned long index;
        _BitScanForward( &index, mask );
        return p + static_cast<int>( index );
#endif
    }

    // Scalar path for the remaining (< 16) bytes.
    for( ; p != end; ++p )
    {
        unsigned char const byte = *p;
        if( byte == '\x22' || byte == '\\' || byte < 0x20 )
            break;
    }

    return p;
}
82 | | |
83 | | template<> |
84 | | inline |
85 | | const char* |
86 | | count_valid<false>( |
87 | | char const* p, |
88 | | const char* end) noexcept |
89 | 1.94M | { |
90 | 1.94M | __m128i const q1 = _mm_set1_epi8( '\x22' ); // '"' |
91 | 1.94M | __m128i const q2 = _mm_set1_epi8( '\\' ); |
92 | 1.94M | __m128i const q3 = _mm_set1_epi8( 0x20 ); |
93 | | |
94 | 3.74M | while(end - p >= 16) |
95 | 3.73M | { |
96 | 3.73M | __m128i v1 = _mm_loadu_si128( (__m128i const*)p ); |
97 | | |
98 | 3.73M | __m128i v2 = _mm_cmpeq_epi8( v1, q1 ); |
99 | 3.73M | __m128i v3 = _mm_cmpeq_epi8( v1, q2 ); |
100 | 3.73M | __m128i v4 = _mm_cmplt_epi8( v1, q3 ); |
101 | | |
102 | 3.73M | __m128i v5 = _mm_or_si128( v2, v3 ); |
103 | 3.73M | __m128i v6 = _mm_or_si128( v5, v4 ); |
104 | | |
105 | 3.73M | int w = _mm_movemask_epi8( v6 ); |
106 | | |
107 | 3.73M | if( w != 0 ) |
108 | 1.93M | { |
109 | 1.93M | int m; |
110 | 1.93M | #if defined(__GNUC__) || defined(__clang__) |
111 | 1.93M | m = __builtin_ffs( w ) - 1; |
112 | | #else |
113 | | unsigned long index; |
114 | | _BitScanForward( &index, w ); |
115 | | m = index; |
116 | | #endif |
117 | 1.93M | p += m; |
118 | 1.93M | break; |
119 | 1.93M | } |
120 | | |
121 | 1.80M | p += 16; |
122 | 1.80M | } |
123 | | |
124 | 47.8M | while(p != end) |
125 | 47.8M | { |
126 | 47.8M | const unsigned char c = *p; |
127 | 47.8M | if(c == '\x22' || c == '\\' || c < 0x20) |
128 | 1.94M | break; |
129 | 45.9M | if(c < 0x80) |
130 | 45.7M | { |
131 | 45.7M | ++p; |
132 | 45.7M | continue; |
133 | 45.7M | } |
134 | | // validate utf-8 |
135 | 181k | uint16_t first = classify_utf8(c); |
136 | 181k | uint8_t len = first & 0xFF; |
137 | 181k | if(BOOST_JSON_UNLIKELY(end - p < len)) |
138 | 76 | break; |
139 | 181k | if(BOOST_JSON_UNLIKELY(! is_valid_utf8(p, first))) |
140 | 979 | break; |
141 | 180k | p += len; |
142 | 180k | } |
143 | | |
144 | 1.94M | return p; |
145 | 1.94M | } |
146 | | |
147 | | #else |
148 | | |
// Portable version: walks [p, end) one byte at a time and returns a pointer
// to the first double quote, backslash, or control character (< 0x20), or
// `end` if none is present. Bytes >= 0x80 are not validated.
template<bool AllowBadUTF8>
char const*
count_valid(
    char const* p,
    char const* end) noexcept
{
    for( ; p != end; ++p )
    {
        unsigned char const byte = *p;
        bool const plain =
            byte >= 0x20 && byte != '\x22' && byte != '\\';
        if( ! plain )
            break;
    }

    return p;
}
165 | | |
166 | | template<> |
167 | | inline |
168 | | char const* |
169 | | count_valid<false>( |
170 | | char const* p, |
171 | | char const* end) noexcept |
172 | | { |
173 | | while(p != end) |
174 | | { |
175 | | const unsigned char c = *p; |
176 | | if(c == '\x22' || c == '\\' || c < 0x20) |
177 | | break; |
178 | | if(c < 0x80) |
179 | | { |
180 | | ++p; |
181 | | continue; |
182 | | } |
183 | | // validate utf-8 |
184 | | uint16_t first = classify_utf8(c); |
185 | | uint8_t len = first & 0xFF; |
186 | | if(BOOST_JSON_UNLIKELY(end - p < len)) |
187 | | break; |
188 | | if(BOOST_JSON_UNLIKELY(! is_valid_utf8(p, first))) |
189 | | break; |
190 | | p += len; |
191 | | } |
192 | | |
193 | | return p; |
194 | | } |
195 | | |
196 | | #endif |
197 | | |
198 | | // KRYSTIAN NOTE: does not stop to validate |
199 | | // count_unescaped |
200 | | |
201 | | #ifdef BOOST_JSON_USE_SSE2 |
202 | | |
// Counts leading characters of [s, s + n) that need no special handling,
// i.e. that are not '"', '\\' or a control character (<= 0x1F).
//
// Only whole 16-byte blocks are inspected: scanning stops at the first
// special byte or as soon as fewer than 16 bytes remain. The result is
// therefore a lower bound on the unescaped run (always a multiple of 16
// unless a special byte was found) and is 0 whenever n < 16.
// No UTF-8 validation is performed.
inline
std::size_t
count_unescaped(
    char const* s,
    std::size_t n) noexcept
{
    __m128i const q1 = _mm_set1_epi8( '\x22' ); // '"'
    __m128i const q2 = _mm_set1_epi8( '\\' );   // '\\'
    __m128i const q3 = _mm_set1_epi8( 0x1F );

    char const* const s0 = s;

    while( n >= 16 )
    {
        __m128i const v1 = _mm_loadu_si128(
            reinterpret_cast<__m128i const*>( s ) );
        __m128i const v2 = _mm_cmpeq_epi8( v1, q1 ); // quote
        __m128i const v3 = _mm_cmpeq_epi8( v1, q2 ); // backslash
        __m128i const v4 = _mm_or_si128( v2, v3 );   // combine quotes and backslash
        // min(x, 0x1F) == x holds exactly for unsigned bytes <= 0x1F
        __m128i const v5 = _mm_min_epu8( v1, q3 );
        __m128i const v6 = _mm_cmpeq_epi8( v5, v1 ); // controls
        __m128i const v7 = _mm_or_si128( v4, v6 );   // combine with control

        int const w = _mm_movemask_epi8( v7 );

        if( w != 0 )
        {
            // The lowest set bit is the offset of the first special byte.
            int m;
#if defined(__GNUC__) || defined(__clang__)
            m = __builtin_ffs( w ) - 1;
#else
            unsigned long index;
            _BitScanForward( &index, w );
            m = static_cast<int>( index );
#endif

            s += m;
            break;
        }

        s += 16;
        n -= 16;
    }

    // s never moves backwards, so the difference is non-negative.
    return static_cast<std::size_t>( s - s0 );
}
249 | | |
250 | | #else |
251 | | |
// Variant compiled when BOOST_JSON_USE_SSE2 is not defined: it inspects
// nothing and always reports zero characters, whatever the arguments.
inline
std::size_t
count_unescaped(
    char const* /*s*/,
    std::size_t /*n*/) noexcept
{
    return std::size_t( 0 );
}
260 | | |
261 | | #endif |
262 | | |
263 | | // count_digits |
264 | | |
265 | | #ifdef BOOST_JSON_USE_SSE2 |
266 | | |
// Returns how many of the 16 bytes starting at p are decimal digits before
// the first non-digit (16 if all of them are digits).
// assumes p..p+15 are valid
inline int count_digits( char const* p ) noexcept
{
    __m128i const chunk = _mm_loadu_si128(
        reinterpret_cast<__m128i const*>( p ) );

    // Adding 70 moves '0'..'9' (48..57) onto 118..127, the top of the signed
    // byte range; every other byte ends up below 118 as a signed value.
    __m128i const shifted   = _mm_add_epi8( chunk, _mm_set1_epi8( 70 ) );
    __m128i const non_digit = _mm_cmplt_epi8( shifted, _mm_set1_epi8( 118 ) );

    int const mask = _mm_movemask_epi8( non_digit );

    if( mask == 0 )
        return 16;

    // The lowest set bit is the index of the first non-digit.
#if defined(__GNUC__) || defined(__clang__)
    return __builtin_ffs( mask ) - 1;
#else
    unsigned long index;
    _BitScanForward( &index, mask );
    return static_cast<int>( index );
#endif
}
295 | | |
296 | | #else |
297 | | |
// Returns how many of the 16 bytes starting at p are decimal digits before
// the first non-digit (16 if all of them are digits).
// assumes p..p+15 are valid
inline int count_digits( char const* p ) noexcept
{
    int n = 0;

    while( n < 16 )
    {
        // After subtracting '0', anything that is not a digit wraps to a
        // value above 9 once reduced to unsigned char.
        unsigned char const d = static_cast<unsigned char>( p[n] - '0' );
        if( d > 9 )
            break;
        ++n;
    }

    return n;
}
311 | | |
312 | | #endif |
313 | | |
314 | | // parse_unsigned |
315 | | |
316 | | inline uint64_t parse_unsigned( uint64_t r, char const * p, std::size_t n ) noexcept |
317 | 12.2M | { |
318 | 13.1M | while( n >= 4 ) |
319 | 893k | { |
320 | | // faster on on clang for x86, |
321 | | // slower on gcc |
322 | 893k | #ifdef __clang__ |
323 | 893k | r = r * 10 + p[0] - '0'; |
324 | 893k | r = r * 10 + p[1] - '0'; |
325 | 893k | r = r * 10 + p[2] - '0'; |
326 | 893k | r = r * 10 + p[3] - '0'; |
327 | | #else |
328 | | uint32_t v; |
329 | | std::memcpy( &v, p, 4 ); |
330 | | endian::native_to_little_inplace(v); |
331 | | |
332 | | v -= 0x30303030; |
333 | | |
334 | | unsigned w0 = v & 0xFF; |
335 | | unsigned w1 = (v >> 8) & 0xFF; |
336 | | unsigned w2 = (v >> 16) & 0xFF; |
337 | | unsigned w3 = (v >> 24); |
338 | | |
339 | | r = (((r * 10 + w0) * 10 + w1) * 10 + w2) * 10 + w3; |
340 | | #endif |
341 | 893k | p += 4; |
342 | 893k | n -= 4; |
343 | 893k | } |
344 | | |
345 | 12.2M | switch( n ) |
346 | 12.2M | { |
347 | 212k | case 0: |
348 | 212k | break; |
349 | 11.7M | case 1: |
350 | 11.7M | r = r * 10 + p[0] - '0'; |
351 | 11.7M | break; |
352 | 167k | case 2: |
353 | 167k | r = r * 10 + p[0] - '0'; |
354 | 167k | r = r * 10 + p[1] - '0'; |
355 | 167k | break; |
356 | 92.5k | case 3: |
357 | 92.5k | r = r * 10 + p[0] - '0'; |
358 | 92.5k | r = r * 10 + p[1] - '0'; |
359 | 92.5k | r = r * 10 + p[2] - '0'; |
360 | 92.5k | break; |
361 | 12.2M | } |
362 | 12.2M | return r; |
363 | 12.2M | } |
364 | | |
365 | | // KRYSTIAN: this function is unused |
366 | | // count_leading |
367 | | |
368 | | /* |
369 | | #ifdef BOOST_JSON_USE_SSE2 |
370 | | |
371 | | // assumes p..p+15 |
372 | | inline std::size_t count_leading( char const * p, char ch ) noexcept |
373 | | { |
374 | | __m128i const q1 = _mm_set1_epi8( ch ); |
375 | | |
376 | | __m128i v = _mm_loadu_si128( (__m128i const*)p ); |
377 | | |
378 | | __m128i w = _mm_cmpeq_epi8( v, q1 ); |
379 | | |
380 | | int m = _mm_movemask_epi8( w ) ^ 0xFFFF; |
381 | | |
382 | | std::size_t n; |
383 | | |
384 | | if( m == 0 ) |
385 | | { |
386 | | n = 16; |
387 | | } |
388 | | else |
389 | | { |
390 | | #if defined(__GNUC__) || defined(__clang__) |
391 | | n = __builtin_ffs( m ) - 1; |
392 | | #else |
393 | | unsigned long index; |
394 | | _BitScanForward( &index, m ); |
395 | | n = index; |
396 | | #endif |
397 | | } |
398 | | |
399 | | return n; |
400 | | } |
401 | | |
402 | | #else |
403 | | |
404 | | // assumes p..p+15 |
405 | | inline std::size_t count_leading( char const * p, char ch ) noexcept |
406 | | { |
407 | | std::size_t n = 0; |
408 | | |
409 | | for( ; n < 16 && *p == ch; ++p, ++n ); |
410 | | |
411 | | return n; |
412 | | } |
413 | | |
414 | | #endif |
415 | | */ |
416 | | |
417 | | // count_whitespace |
418 | | |
419 | | #ifdef BOOST_JSON_USE_SSE2 |
420 | | |
// Skips JSON whitespace (' ', '\t', '\r', '\n') at the front of [p, end) and
// returns a pointer to the first byte that is not whitespace, or `end` if the
// whole range is whitespace.
inline const char* count_whitespace( char const* p, const char* end ) noexcept
{
    // Quick exits: empty range, or a first byte above 0x20, which cannot be
    // any of the four whitespace characters.
    if( p == end || static_cast<unsigned char>( *p ) > 0x20 )
        return p;

    __m128i const space    = _mm_set1_epi8( ' ' );
    __m128i const newline  = _mm_set1_epi8( '\n' );
    __m128i const fold_bit = _mm_set1_epi8( 4 ); // '\t' | 4 == '\r'
    __m128i const carriage = _mm_set1_epi8( '\r' );

    // Vector path: classify 16 bytes per iteration.
    for( ; end - p >= 16; p += 16 )
    {
        __m128i const chunk = _mm_loadu_si128(
            reinterpret_cast<__m128i const*>( p ) );

        __m128i const space_or_newline = _mm_or_si128(
            _mm_cmpeq_epi8( chunk, space ),
            _mm_cmpeq_epi8( chunk, newline ) );

        // OR-ing in bit 2 folds '\t' onto '\r', so one compare covers both.
        __m128i const tab_or_carriage = _mm_cmpeq_epi8(
            _mm_or_si128( chunk, fold_bit ), carriage );

        // Invert the 16-bit mask so set bits mark non-whitespace bytes.
        int const not_ws = _mm_movemask_epi8(
            _mm_or_si128( space_or_newline, tab_or_carriage ) ) ^ 0xFFFF;

        if( not_ws == 0 )
            continue;

#if defined(__GNUC__) || defined(__clang__)
        std::size_t const offset = __builtin_ffs( not_ws ) - 1;
#else
        unsigned long index;
        _BitScanForward( &index, not_ws );
        std::size_t const offset = index;
#endif
        return p + offset;
    }

    // Scalar path for the remaining (< 16) bytes.
    for( ; p != end; ++p )
    {
        char const c = *p;
        if( c != ' ' && c != '\t' && c != '\r' && c != '\n' )
            break;
    }

    return p;
}
480 | | |
481 | | /* |
482 | | |
483 | | // slightly faster on msvc-14.2, slightly slower on clang-win |
484 | | |
485 | | inline std::size_t count_whitespace( char const * p, std::size_t n ) noexcept |
486 | | { |
487 | | char const * p0 = p; |
488 | | |
489 | | while( n > 0 ) |
490 | | { |
491 | | char ch = *p; |
492 | | |
493 | | if( ch == '\n' || ch == '\r' ) |
494 | | { |
495 | | ++p; |
496 | | --n; |
497 | | continue; |
498 | | } |
499 | | |
500 | | if( ch != ' ' && ch != '\t' ) |
501 | | { |
502 | | break; |
503 | | } |
504 | | |
505 | | ++p; |
506 | | --n; |
507 | | |
508 | | while( n >= 16 ) |
509 | | { |
510 | | std::size_t n2 = count_leading( p, ch ); |
511 | | |
512 | | p += n2; |
513 | | n -= n2; |
514 | | |
515 | | if( n2 < 16 ) |
516 | | { |
517 | | break; |
518 | | } |
519 | | } |
520 | | } |
521 | | |
522 | | return p - p0; |
523 | | } |
524 | | */ |
525 | | |
526 | | #else |
527 | | |
// Skips JSON whitespace (' ', '\t', '\r', '\n') at the front of [p, end) and
// returns a pointer to the first byte that is not whitespace, or `end` if the
// whole range is whitespace.
inline const char* count_whitespace( char const* p, const char* end ) noexcept
{
    while( p != end )
    {
        switch( *p )
        {
        case ' ':
        case '\n':
        case '\r':
        case '\t':
            ++p;
            continue; // next iteration of the enclosing while loop
        default:
            return p;
        }
    }

    return p;
}
539 | | |
540 | | #endif |
541 | | |
542 | | } // detail |
543 | | } // namespace json |
544 | | } // namespace boost |
545 | | |
546 | | #endif |