/src/zlib-ng/arch/x86/compare256_avx2.c
Line | Count | Source |
1 | | /* compare256_avx2.c -- AVX2 version of compare256 |
2 | | * Copyright Mika T. Lindqvist <postmaster@raasu.org> |
3 | | * For conditions of distribution and use, see copyright notice in zlib.h |
4 | | */ |
5 | | |
6 | | #include "zbuild.h" |
7 | | #include "zendian.h" |
8 | | #include "zmemory.h" |
9 | | #include "deflate.h" |
10 | | #include "fallback_builtins.h" |
11 | | |
12 | | #ifdef X86_AVX2 |
13 | | |
14 | | #include <immintrin.h> |
15 | | #ifdef _MSC_VER |
16 | | # include <nmmintrin.h> |
17 | | #endif |
18 | | |
/* Compare src0 and src1 in 32-byte AVX2 chunks, covering at most 256 bytes.
 *
 * src0, src1: buffers to compare. They are read with unaligned loads
 *             (_mm256_loadu_si256), so no alignment is required. Whole 32-byte
 *             chunks are always loaded, so both buffers must be readable through
 *             the end of the chunk containing the first mismatch (all 256 bytes
 *             when there is none).
 *
 * Returns 256 when every chunk compares equal. Otherwise returns the byte offset
 * of the mismatching chunk plus zng_ctz32() of the inverted equality mask
 * (presumably a count-trailing-zeros helper, which makes the result the index of
 * the first differing byte -- confirm against its definition).
 */
static inline uint32_t compare256_avx2_static(const uint8_t *src0, const uint8_t *src1) {
    uint32_t len = 0;

    /* Two chunks are handled per pass, so len advances by 64 per iteration. */
    do {
        __m256i ymm_src0, ymm_src1, ymm_cmp;
        /* Keep the const qualifier in the casts: the loads only read memory. */
        ymm_src0 = _mm256_loadu_si256((const __m256i *)src0);
        ymm_src1 = _mm256_loadu_si256((const __m256i *)src1);
        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1); /* non-identical bytes = 00, identical bytes = FF */
        unsigned mask = (unsigned)_mm256_movemask_epi8(ymm_cmp); /* one bit per byte lane */
        if (mask != 0xFFFFFFFF)
            return len + zng_ctz32(~mask); /* Invert bits so identical = 0 */

        src0 += 32, src1 += 32, len += 32;

        /* Second chunk of this pass: same test as above. */
        ymm_src0 = _mm256_loadu_si256((const __m256i *)src0);
        ymm_src1 = _mm256_loadu_si256((const __m256i *)src1);
        ymm_cmp = _mm256_cmpeq_epi8(ymm_src0, ymm_src1);
        mask = (unsigned)_mm256_movemask_epi8(ymm_cmp);
        if (mask != 0xFFFFFFFF)
            return len + zng_ctz32(~mask);

        src0 += 32, src1 += 32, len += 32;
    } while (len < 256);

    /* No mismatch found in any chunk. */
    return 256;
}
45 | | |
46 | 4.39M | Z_INTERNAL uint32_t compare256_avx2(const uint8_t *src0, const uint8_t *src1) { |
47 | 4.39M | return compare256_avx2_static(src0, src1); |
48 | 4.39M | } |
49 | | |
/* First inclusion of match_tpl.h: LONGEST_MATCH is set to longest_match_avx2 and
 * COMPARE256 to the static inline comparer defined in this file.
 * NOTE(review): presumably match_tpl.h expands to a function named by
 * LONGEST_MATCH that calls COMPARE256 -- confirm in match_tpl.h.
 */
50 | | #define LONGEST_MATCH longest_match_avx2 |
51 | 20.4M | #define COMPARE256 compare256_avx2_static |
52 | | |
53 | | #include "match_tpl.h" |
54 | | |
/* Second inclusion of the same template, this time with LONGEST_MATCH_SLOW
 * defined and LONGEST_MATCH set to longest_match_slow_avx2.
 * NOTE(review): LONGEST_MATCH and COMPARE256 are defined again here without an
 * #undef in this file, so match_tpl.h presumably #undefs them at its end --
 * verify before reordering or adding further inclusions.
 */
55 | | #define LONGEST_MATCH_SLOW |
56 | | #define LONGEST_MATCH longest_match_slow_avx2 |
57 | 0 | #define COMPARE256 compare256_avx2_static |
58 | | |
59 | | #include "match_tpl.h" |
60 | | |
61 | | #endif |