/src/zlib-ng/arch/x86/compare256_avx2.c
Line | Count | Source |
1 | | /* compare256_avx2.c -- AVX2 version of compare256 |
2 | | * Copyright Mika T. Lindqvist <postmaster@raasu.org> |
3 | | * For conditions of distribution and use, see copyright notice in zlib.h |
4 | | */ |
5 | | |
6 | | #include "zbuild.h" |
7 | | #include "zmemory.h" |
8 | | #include "deflate.h" |
9 | | #include "fallback_builtins.h" |
10 | | |
11 | | #if defined(X86_AVX2) && defined(HAVE_BUILTIN_CTZ) |
12 | | |
13 | | #include <immintrin.h> |
14 | | #ifdef _MSC_VER |
15 | | # include <nmmintrin.h> |
16 | | #endif |
17 | | |
18 | 57.2M | static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) { |
19 | 57.2M | uint32_t len = 0; |
20 | | |
21 | 68.9M | do { |
22 | 68.9M | __m256i ymm_src0, ymm_src1, ymm_cmp; |
23 | 68.9M | ymm_src0 = _mm256_loadu_si256((__m256i*)src0); |
24 | 68.9M | ymm_src1 = _mm256_loadu_si256((__m256i*)src1); |
25 | 68.9M | ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */ |
26 | 68.9M | unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp); |
27 | 68.9M | if (mask != 0xFFFFFFFF) { |
28 | 50.5M | uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); /* Invert bits so identical = 0 */ |
29 | 50.5M | return len + match_byte; |
30 | 50.5M | } |
31 | | |
32 | 18.3M | src0 += 32, src1 += 32, len += 32; |
33 | | |
34 | 18.3M | ymm_src0 = _mm256_loadu_si256((__m256i*)src0); |
35 | 18.3M | ymm_src1 = _mm256_loadu_si256((__m256i*)src1); |
36 | 18.3M | ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); |
37 | 18.3M | mask = (unsigned)_mm256_movemask_epi8(ymm_cmp); |
38 | 18.3M | if (mask != 0xFFFFFFFF) { |
39 | 3.88M | uint32_t match_byte = (uint32_t)__builtin_ctz(~mask); |
40 | 3.88M | return len + match_byte; |
41 | 3.88M | } |
42 | | |
43 | 14.4M | src0 += 32, src1 += 32, len += 32; |
44 | 14.4M | } while (len < 256); |
45 | | |
46 | 2.74M | return 256; |
47 | 57.2M | } |
48 | | |
49 | 6.29M | Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) { |
50 | 6.29M | return compare256_avx2_static(src0, src1); |
51 | 6.29M | } |
52 | | |
53 | | #define LONGEST_MATCH longest_match_avx2 |
54 | 40.3M | #define COMPARE256 compare256_avx2_static |
55 | | |
56 | | #include "match_tpl.h" |
57 | | |
58 | | #define LONGEST_MATCH_SLOW |
59 | | #define LONGEST_MATCH longest_match_slow_avx2 |
60 | 10.5M | #define COMPARE256 compare256_avx2_static |
61 | | |
62 | | #include "match_tpl.h" |
63 | | |
64 | | #endif |