/src/zlib-ng/arch/x86/compare256_sse2.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* compare256_sse2.c -- SSE2 version of compare256 |
2 | | * Copyright Adam Stylinski <kungfujesus06@gmail.com> |
3 | | * For conditions of distribution and use, see copyright notice in zlib.h |
4 | | */ |
5 | | |
6 | | #include "zbuild.h" |
7 | | #include "zmemory.h" |
8 | | #include "deflate.h" |
9 | | #include "fallback_builtins.h" |
10 | | |
11 | | #if defined(X86_SSE2) && defined(HAVE_BUILTIN_CTZ) |
12 | | |
13 | | #include <emmintrin.h> |
14 | | |
/* Compare the first 256 bytes of src0 and src1 and return the length of the
 * initial matching run (0..256). src0 and src1 may have any alignment.
 *
 * Strategy: one unaligned 16-byte compare up front, then advance both
 * pointers so that src0 is 16-byte aligned and loop over whole aligned
 * 16-byte chunks. When src0 was misaligned, fewer than 16 bytes remain after
 * the loop; they are covered by one final unaligned compare of the last 16
 * bytes (overlapping already-verified bytes, which is harmless since those
 * are known equal). */
static inline uint32_t compare256_sse2_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;
    int align_offset = ((uintptr_t)src0) & 15;
    const uint8_t *end0 = src0 + 256;
    const uint8_t *end1 = src1 + 256;
    __m128i xmm_src0, xmm_src1, xmm_cmp;

    /* Do the first load unaligned, then all subsequent ones have at least
     * one aligned load. Sadly aligning both loads is probably unrealistic */
    xmm_src0 = _mm_loadu_si128((const __m128i *)src0);
    xmm_src1 = _mm_loadu_si128((const __m128i *)src1);
    xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

    /* Each bit of the mask corresponds to one byte lane: 1 = equal. */
    unsigned mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

    /* Compiler _may_ turn this branch into a ptest + movemask,
     * since a lot of those uops are shared and fused */
    if (mask != 0xFFFF) {
        /* First zero bit in the mask is the first mismatching byte. */
        uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
        return len + match_byte;
    }

    /* Advance past the unaligned head so src0 becomes 16-byte aligned
     * (align_adv is in 1..16). */
    int align_adv = 16 - align_offset;
    len += align_adv;
    src0 += align_adv;
    src1 += align_adv;

    /* Do a flooring division (should just be a shift right) */
    int num_iter = (256 - len) / 16;

    for (int i = 0; i < num_iter; ++i) {
        xmm_src0 = _mm_load_si128((const __m128i *)src0);
        xmm_src1 = _mm_loadu_si128((const __m128i *)src1);
        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

        /* Compiler _may_ turn this branch into a ptest + movemask,
         * since a lot of those uops are shared and fused */
        if (mask != 0xFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
            return len + match_byte;
        }

        len += 16, src0 += 16, src1 += 16;
    }

    /* Misaligned src0 leaves a sub-16-byte tail: re-check the final 16
     * bytes with unaligned loads. The overlap with already-compared bytes
     * is safe because those bytes are known to match. */
    if (align_offset) {
        src0 = end0 - 16;
        src1 = end1 - 16;
        len = 256 - 16;

        xmm_src0 = _mm_loadu_si128((const __m128i *)src0);
        xmm_src1 = _mm_loadu_si128((const __m128i *)src1);
        xmm_cmp = _mm_cmpeq_epi8(xmm_src0, xmm_src1);

        mask = (unsigned)_mm_movemask_epi8(xmm_cmp);

        if (mask != 0xFFFF) {
            uint32_t match_byte = (uint32_t)__builtin_ctz(~mask);
            return len + match_byte;
        }
    }

    return 256;
}
81 | | |
82 | 0 | Z_INTERNAL uint32_t compare256_sse2(const uint8_t *src0, const uint8_t *src1) { |
83 | 0 | return compare256_sse2_static(src0, src1); |
84 | 0 | } |
85 | | |
86 | | #define LONGEST_MATCH longest_match_sse2 |
87 | 0 | #define COMPARE256 compare256_sse2_static |
88 | | |
89 | | #include "match_tpl.h" |
90 | | |
91 | | #define LONGEST_MATCH_SLOW |
92 | | #define LONGEST_MATCH longest_match_slow_sse2 |
93 | 0 | #define COMPARE256 compare256_sse2_static |
94 | | |
95 | | #include "match_tpl.h" |
96 | | |
97 | | #endif |