Line | Count | Source |
1 | | /* adler32_p.h -- Private inline functions and macros shared by the |
2 | | * different implementations of the Adler-32 checksum |
3 | | * of a data stream. |
4 | | * Copyright (C) 1995-2011, 2016 Mark Adler |
5 | | * For conditions of distribution and use, see copyright notice in zlib.h |
6 | | */ |
7 | | |
8 | | #ifndef ADLER32_P_H |
9 | | #define ADLER32_P_H |
10 | | |
11 | | #include "zendian.h" |
12 | | |
#define BASE 65521U     /* largest prime smaller than 65536 */
#define NMAX 5552
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
#define NMAX_ALIGNED32 (NMAX & ~31)
/* NMAX rounded down to a multiple of 32 is 5536 */

/* ADLER_DOn(sum1, sum2, buf, i) folds n consecutive bytes, starting at
 * buf[i], into the two running sums: each byte is added to sum1 and the
 * updated sum1 is then added to sum2. ADLER_DO16 always starts at buf[0].
 *
 * Every argument is parenthesized in the expansion so that pointer
 * expressions (e.g. "p + 8") and index expressions (e.g. "1 << 2") bind as
 * the caller intended. The arguments are evaluated more than once, so they
 * must not have side effects. The brace-block form is kept (rather than
 * do { } while (0)) so that existing uses with or without a trailing
 * semicolon keep compiling. */
#define ADLER_DO1(sum1, sum2, buf, i)  {(sum1) += (buf)[(i)]; (sum2) += (sum1);}
#define ADLER_DO2(sum1, sum2, buf, i)  {ADLER_DO1(sum1, sum2, buf, i); ADLER_DO1(sum1, sum2, buf, (i)+1);}
#define ADLER_DO4(sum1, sum2, buf, i)  {ADLER_DO2(sum1, sum2, buf, i); ADLER_DO2(sum1, sum2, buf, (i)+2);}
#define ADLER_DO8(sum1, sum2, buf, i)  {ADLER_DO4(sum1, sum2, buf, i); ADLER_DO4(sum1, sum2, buf, (i)+4);}
#define ADLER_DO16(sum1, sum2, buf)    {ADLER_DO8(sum1, sum2, buf, 0); ADLER_DO8(sum1, sum2, buf, 8);}
24 | | |
25 | | Z_FORCEINLINE static void adler32_copy_align(uint32_t *Z_RESTRICT adler, uint8_t *dst, const uint8_t *buf, size_t len, |
26 | 0 | uint32_t *Z_RESTRICT sum2, const int MAX_LEN, const int COPY) { |
27 | 0 | Z_UNUSED(MAX_LEN); |
28 | 0 | if (len & 1) { |
29 | 0 | if (COPY) { |
30 | 0 | *dst = *buf; |
31 | 0 | dst += 1; |
32 | 0 | } |
33 | 0 | ADLER_DO1(*adler, *sum2, buf, 0); |
34 | 0 | buf += 1; |
35 | 0 | } |
36 | 0 | if (len & 2) { |
37 | 0 | if (COPY) { |
38 | 0 | memcpy(dst, buf, 2); |
39 | 0 | dst += 2; |
40 | 0 | } |
41 | 0 | ADLER_DO2(*adler, *sum2, buf, 0); |
42 | 0 | buf += 2; |
43 | 0 | } |
44 | 0 | while (len >= 4) { |
45 | 0 | if (COPY) { |
46 | 0 | memcpy(dst, buf, 4); |
47 | 0 | dst += 4; |
48 | 0 | } |
49 | 0 | len -= 4; |
50 | 0 | ADLER_DO4(*adler, *sum2, buf, 0); |
51 | 0 | buf += 4; |
52 | 0 | } |
53 | 0 | } Unexecuted instantiation: adler32_ssse3.c:adler32_copy_align Unexecuted instantiation: adler32_sse42.c:adler32_copy_align Unexecuted instantiation: adler32_avx2.c:adler32_copy_align Unexecuted instantiation: adler32_avx512.c:adler32_copy_align Unexecuted instantiation: adler32_avx512_vnni.c:adler32_copy_align Unexecuted instantiation: adler32_c.c:adler32_copy_align Unexecuted instantiation: adler32.c:adler32_copy_align |
54 | | |
55 | | /* SIMD Within A Register (SWAR) scalar adler32. Splits bytes into |
56 | | * even/odd lanes packed as 4x16-bit in uint64_t, with prefix sums for s2. |
57 | | * Reduction uses multiply-and-shift with positional weight constants. |
58 | | * |
59 | | * Technique pioneered by Michael Niedermayer <michaelni@gmx.at>. |
60 | | * Max chunk: 23 iterations * 8 bytes = 184 (255*23 = 5865 < 65535). */ |
61 | | #define ADLER32_SWAR_MAX_BYTES (23 * 8) |
62 | 330k | #define ADLER32_SWAR_EVEN_MASK 0x00FF00FF00FF00FFULL |
63 | 330k | #define ADLER32_SWAR_HSUM 0x1000100010001ULL |
64 | | |
65 | | Z_FORCEINLINE static void adler32_swar(uint32_t *adler, uint8_t *dst, const uint8_t *buf, size_t len, |
66 | 165k | uint32_t *sum2, const int COPY) { |
67 | 165k | uint64_t sum_even = 0, sum_odd = 0, prefix_even = 0, prefix_odd = 0; |
68 | | |
69 | 165k | *sum2 += *adler * (uint32_t)len; |
70 | | |
71 | 165k | const uint64_t *src64 = (const uint64_t *)buf; |
72 | | |
73 | 165k | while (len >= 16) { |
74 | 0 | uint64_t v0 = src64[0]; |
75 | 0 | uint64_t v1 = src64[1]; |
76 | 0 | if (COPY) { |
77 | 0 | memcpy(dst, &v0, sizeof(v0)); |
78 | 0 | memcpy(dst + 8, &v1, sizeof(v1)); |
79 | 0 | dst += 16; |
80 | 0 | } |
81 | |
|
82 | 0 | prefix_even += sum_even; |
83 | 0 | prefix_odd += sum_odd; |
84 | 0 | sum_even += v0 & ADLER32_SWAR_EVEN_MASK; |
85 | 0 | sum_odd += (v0 >> 8) & ADLER32_SWAR_EVEN_MASK; |
86 | |
|
87 | 0 | prefix_even += sum_even; |
88 | 0 | prefix_odd += sum_odd; |
89 | 0 | sum_even += v1 & ADLER32_SWAR_EVEN_MASK; |
90 | 0 | sum_odd += (v1 >> 8) & ADLER32_SWAR_EVEN_MASK; |
91 | |
|
92 | 0 | src64 += 2; |
93 | 0 | len -= 16; |
94 | 0 | } |
95 | | |
96 | | /* Handle remaining 8 bytes if present */ |
97 | 165k | if (len >= 8) { |
98 | 165k | uint64_t v = *src64; |
99 | 165k | if (COPY) |
100 | 18.9k | memcpy(dst, &v, sizeof(v)); |
101 | | |
102 | 165k | prefix_even += sum_even; |
103 | 165k | prefix_odd += sum_odd; |
104 | 165k | sum_even += v & ADLER32_SWAR_EVEN_MASK; |
105 | 165k | sum_odd += (v >> 8) & ADLER32_SWAR_EVEN_MASK; |
106 | 165k | } |
107 | | |
108 | | /* Horizontal sum of 4x16-bit lanes for s1 */ |
109 | 165k | *adler += (uint32_t)(((sum_even + sum_odd) * ADLER32_SWAR_HSUM) >> 48); |
110 | | |
111 | | /* Widen prefix sums to 32-bit pairs and horizontal sum for s2 */ |
112 | 165k | uint64_t pe_lo = prefix_even & 0xFFFF0000FFFFULL; |
113 | 165k | uint64_t pe_hi = (prefix_even >> 16) & 0xFFFF0000FFFFULL; |
114 | 165k | uint64_t po_lo = prefix_odd & 0xFFFF0000FFFFULL; |
115 | 165k | uint64_t po_hi = (prefix_odd >> 16) & 0xFFFF0000FFFFULL; |
116 | | |
117 | 165k | *sum2 += (uint32_t)(((pe_lo + po_lo + pe_hi + po_hi) * 0x800000008ULL) >> 32); |
118 | | |
119 | | /* Positional weights [8,7,6,5,4,3,2,1] per 8-byte group for s2. |
120 | | * On big-endian the even mask captures odd-index memory bytes (b1,b3,b5,b7) |
121 | | * so HSUM (+1 per odd-index byte) must be applied to sum_even, not sum_odd. */ |
122 | 165k | #if BYTE_ORDER == LITTLE_ENDIAN |
123 | 165k | *sum2 += 2 * (uint32_t)((sum_even * 0x4000300020001ULL) >> 48) |
124 | 165k | + (uint32_t)((sum_odd * ADLER32_SWAR_HSUM) >> 48) |
125 | 165k | + 2 * (uint32_t)((sum_odd * 0x3000200010000ULL) >> 48); |
126 | | #else |
127 | | *sum2 += 2 * (uint32_t)((sum_even * 0x0000100020003ULL) >> 48) |
128 | | + (uint32_t)((sum_even * ADLER32_SWAR_HSUM) >> 48) |
129 | | + 2 * (uint32_t)((sum_odd * 0x1000200030004ULL) >> 48); |
130 | | #endif |
131 | 165k | } adler32_ssse3.c:adler32_swar Line | Count | Source | 66 | 118k | uint32_t *sum2, const int COPY) { | 67 | 118k | uint64_t sum_even = 0, sum_odd = 0, prefix_even = 0, prefix_odd = 0; | 68 | | | 69 | 118k | *sum2 += *adler * (uint32_t)len; | 70 | | | 71 | 118k | const uint64_t *src64 = (const uint64_t *)buf; | 72 | | | 73 | 118k | while (len >= 16) { | 74 | 0 | uint64_t v0 = src64[0]; | 75 | 0 | uint64_t v1 = src64[1]; | 76 | 0 | if (COPY) { | 77 | 0 | memcpy(dst, &v0, sizeof(v0)); | 78 | 0 | memcpy(dst + 8, &v1, sizeof(v1)); | 79 | 0 | dst += 16; | 80 | 0 | } | 81 | |
| 82 | 0 | prefix_even += sum_even; | 83 | 0 | prefix_odd += sum_odd; | 84 | 0 | sum_even += v0 & ADLER32_SWAR_EVEN_MASK; | 85 | 0 | sum_odd += (v0 >> 8) & ADLER32_SWAR_EVEN_MASK; | 86 | |
| 87 | 0 | prefix_even += sum_even; | 88 | 0 | prefix_odd += sum_odd; | 89 | 0 | sum_even += v1 & ADLER32_SWAR_EVEN_MASK; | 90 | 0 | sum_odd += (v1 >> 8) & ADLER32_SWAR_EVEN_MASK; | 91 | |
| 92 | 0 | src64 += 2; | 93 | 0 | len -= 16; | 94 | 0 | } | 95 | | | 96 | | /* Handle remaining 8 bytes if present */ | 97 | 118k | if (len >= 8) { | 98 | 118k | uint64_t v = *src64; | 99 | 118k | if (COPY) | 100 | 0 | memcpy(dst, &v, sizeof(v)); | 101 | | | 102 | 118k | prefix_even += sum_even; | 103 | 118k | prefix_odd += sum_odd; | 104 | 118k | sum_even += v & ADLER32_SWAR_EVEN_MASK; | 105 | 118k | sum_odd += (v >> 8) & ADLER32_SWAR_EVEN_MASK; | 106 | 118k | } | 107 | | | 108 | | /* Horizontal sum of 4x16-bit lanes for s1 */ | 109 | 118k | *adler += (uint32_t)(((sum_even + sum_odd) * ADLER32_SWAR_HSUM) >> 48); | 110 | | | 111 | | /* Widen prefix sums to 32-bit pairs and horizontal sum for s2 */ | 112 | 118k | uint64_t pe_lo = prefix_even & 0xFFFF0000FFFFULL; | 113 | 118k | uint64_t pe_hi = (prefix_even >> 16) & 0xFFFF0000FFFFULL; | 114 | 118k | uint64_t po_lo = prefix_odd & 0xFFFF0000FFFFULL; | 115 | 118k | uint64_t po_hi = (prefix_odd >> 16) & 0xFFFF0000FFFFULL; | 116 | | | 117 | 118k | *sum2 += (uint32_t)(((pe_lo + po_lo + pe_hi + po_hi) * 0x800000008ULL) >> 32); | 118 | | | 119 | | /* Positional weights [8,7,6,5,4,3,2,1] per 8-byte group for s2. | 120 | | * On big-endian the even mask captures odd-index memory bytes (b1,b3,b5,b7) | 121 | | * so HSUM (+1 per odd-index byte) must be applied to sum_even, not sum_odd. */ | 122 | 118k | #if BYTE_ORDER == LITTLE_ENDIAN | 123 | 118k | *sum2 += 2 * (uint32_t)((sum_even * 0x4000300020001ULL) >> 48) | 124 | 118k | + (uint32_t)((sum_odd * ADLER32_SWAR_HSUM) >> 48) | 125 | 118k | + 2 * (uint32_t)((sum_odd * 0x3000200010000ULL) >> 48); | 126 | | #else | 127 | | *sum2 += 2 * (uint32_t)((sum_even * 0x0000100020003ULL) >> 48) | 128 | | + (uint32_t)((sum_even * ADLER32_SWAR_HSUM) >> 48) | 129 | | + 2 * (uint32_t)((sum_odd * 0x1000200030004ULL) >> 48); | 130 | | #endif | 131 | 118k | } |
adler32_sse42.c:adler32_swar Line | Count | Source | 66 | 9.11k | uint32_t *sum2, const int COPY) { | 67 | 9.11k | uint64_t sum_even = 0, sum_odd = 0, prefix_even = 0, prefix_odd = 0; | 68 | | | 69 | 9.11k | *sum2 += *adler * (uint32_t)len; | 70 | | | 71 | 9.11k | const uint64_t *src64 = (const uint64_t *)buf; | 72 | | | 73 | 9.11k | while (len >= 16) { | 74 | 0 | uint64_t v0 = src64[0]; | 75 | 0 | uint64_t v1 = src64[1]; | 76 | 0 | if (COPY) { | 77 | 0 | memcpy(dst, &v0, sizeof(v0)); | 78 | 0 | memcpy(dst + 8, &v1, sizeof(v1)); | 79 | 0 | dst += 16; | 80 | 0 | } | 81 | |
| 82 | 0 | prefix_even += sum_even; | 83 | 0 | prefix_odd += sum_odd; | 84 | 0 | sum_even += v0 & ADLER32_SWAR_EVEN_MASK; | 85 | 0 | sum_odd += (v0 >> 8) & ADLER32_SWAR_EVEN_MASK; | 86 | |
| 87 | 0 | prefix_even += sum_even; | 88 | 0 | prefix_odd += sum_odd; | 89 | 0 | sum_even += v1 & ADLER32_SWAR_EVEN_MASK; | 90 | 0 | sum_odd += (v1 >> 8) & ADLER32_SWAR_EVEN_MASK; | 91 | |
| 92 | 0 | src64 += 2; | 93 | 0 | len -= 16; | 94 | 0 | } | 95 | | | 96 | | /* Handle remaining 8 bytes if present */ | 97 | 9.11k | if (len >= 8) { | 98 | 9.11k | uint64_t v = *src64; | 99 | 9.11k | if (COPY) | 100 | 9.11k | memcpy(dst, &v, sizeof(v)); | 101 | | | 102 | 9.11k | prefix_even += sum_even; | 103 | 9.11k | prefix_odd += sum_odd; | 104 | 9.11k | sum_even += v & ADLER32_SWAR_EVEN_MASK; | 105 | 9.11k | sum_odd += (v >> 8) & ADLER32_SWAR_EVEN_MASK; | 106 | 9.11k | } | 107 | | | 108 | | /* Horizontal sum of 4x16-bit lanes for s1 */ | 109 | 9.11k | *adler += (uint32_t)(((sum_even + sum_odd) * ADLER32_SWAR_HSUM) >> 48); | 110 | | | 111 | | /* Widen prefix sums to 32-bit pairs and horizontal sum for s2 */ | 112 | 9.11k | uint64_t pe_lo = prefix_even & 0xFFFF0000FFFFULL; | 113 | 9.11k | uint64_t pe_hi = (prefix_even >> 16) & 0xFFFF0000FFFFULL; | 114 | 9.11k | uint64_t po_lo = prefix_odd & 0xFFFF0000FFFFULL; | 115 | 9.11k | uint64_t po_hi = (prefix_odd >> 16) & 0xFFFF0000FFFFULL; | 116 | | | 117 | 9.11k | *sum2 += (uint32_t)(((pe_lo + po_lo + pe_hi + po_hi) * 0x800000008ULL) >> 32); | 118 | | | 119 | | /* Positional weights [8,7,6,5,4,3,2,1] per 8-byte group for s2. | 120 | | * On big-endian the even mask captures odd-index memory bytes (b1,b3,b5,b7) | 121 | | * so HSUM (+1 per odd-index byte) must be applied to sum_even, not sum_odd. */ | 122 | 9.11k | #if BYTE_ORDER == LITTLE_ENDIAN | 123 | 9.11k | *sum2 += 2 * (uint32_t)((sum_even * 0x4000300020001ULL) >> 48) | 124 | 9.11k | + (uint32_t)((sum_odd * ADLER32_SWAR_HSUM) >> 48) | 125 | 9.11k | + 2 * (uint32_t)((sum_odd * 0x3000200010000ULL) >> 48); | 126 | | #else | 127 | | *sum2 += 2 * (uint32_t)((sum_even * 0x0000100020003ULL) >> 48) | 128 | | + (uint32_t)((sum_even * ADLER32_SWAR_HSUM) >> 48) | 129 | | + 2 * (uint32_t)((sum_odd * 0x1000200030004ULL) >> 48); | 130 | | #endif | 131 | 9.11k | } |
adler32_avx2.c:adler32_swar Line | Count | Source | 66 | 37.6k | uint32_t *sum2, const int COPY) { | 67 | 37.6k | uint64_t sum_even = 0, sum_odd = 0, prefix_even = 0, prefix_odd = 0; | 68 | | | 69 | 37.6k | *sum2 += *adler * (uint32_t)len; | 70 | | | 71 | 37.6k | const uint64_t *src64 = (const uint64_t *)buf; | 72 | | | 73 | 37.6k | while (len >= 16) { | 74 | 0 | uint64_t v0 = src64[0]; | 75 | 0 | uint64_t v1 = src64[1]; | 76 | 0 | if (COPY) { | 77 | 0 | memcpy(dst, &v0, sizeof(v0)); | 78 | 0 | memcpy(dst + 8, &v1, sizeof(v1)); | 79 | 0 | dst += 16; | 80 | 0 | } | 81 | |
| 82 | 0 | prefix_even += sum_even; | 83 | 0 | prefix_odd += sum_odd; | 84 | 0 | sum_even += v0 & ADLER32_SWAR_EVEN_MASK; | 85 | 0 | sum_odd += (v0 >> 8) & ADLER32_SWAR_EVEN_MASK; | 86 | |
| 87 | 0 | prefix_even += sum_even; | 88 | 0 | prefix_odd += sum_odd; | 89 | 0 | sum_even += v1 & ADLER32_SWAR_EVEN_MASK; | 90 | 0 | sum_odd += (v1 >> 8) & ADLER32_SWAR_EVEN_MASK; | 91 | |
| 92 | 0 | src64 += 2; | 93 | 0 | len -= 16; | 94 | 0 | } | 95 | | | 96 | | /* Handle remaining 8 bytes if present */ | 97 | 37.6k | if (len >= 8) { | 98 | 37.6k | uint64_t v = *src64; | 99 | 37.6k | if (COPY) | 100 | 9.87k | memcpy(dst, &v, sizeof(v)); | 101 | | | 102 | 37.6k | prefix_even += sum_even; | 103 | 37.6k | prefix_odd += sum_odd; | 104 | 37.6k | sum_even += v & ADLER32_SWAR_EVEN_MASK; | 105 | 37.6k | sum_odd += (v >> 8) & ADLER32_SWAR_EVEN_MASK; | 106 | 37.6k | } | 107 | | | 108 | | /* Horizontal sum of 4x16-bit lanes for s1 */ | 109 | 37.6k | *adler += (uint32_t)(((sum_even + sum_odd) * ADLER32_SWAR_HSUM) >> 48); | 110 | | | 111 | | /* Widen prefix sums to 32-bit pairs and horizontal sum for s2 */ | 112 | 37.6k | uint64_t pe_lo = prefix_even & 0xFFFF0000FFFFULL; | 113 | 37.6k | uint64_t pe_hi = (prefix_even >> 16) & 0xFFFF0000FFFFULL; | 114 | 37.6k | uint64_t po_lo = prefix_odd & 0xFFFF0000FFFFULL; | 115 | 37.6k | uint64_t po_hi = (prefix_odd >> 16) & 0xFFFF0000FFFFULL; | 116 | | | 117 | 37.6k | *sum2 += (uint32_t)(((pe_lo + po_lo + pe_hi + po_hi) * 0x800000008ULL) >> 32); | 118 | | | 119 | | /* Positional weights [8,7,6,5,4,3,2,1] per 8-byte group for s2. | 120 | | * On big-endian the even mask captures odd-index memory bytes (b1,b3,b5,b7) | 121 | | * so HSUM (+1 per odd-index byte) must be applied to sum_even, not sum_odd. */ | 122 | 37.6k | #if BYTE_ORDER == LITTLE_ENDIAN | 123 | 37.6k | *sum2 += 2 * (uint32_t)((sum_even * 0x4000300020001ULL) >> 48) | 124 | 37.6k | + (uint32_t)((sum_odd * ADLER32_SWAR_HSUM) >> 48) | 125 | 37.6k | + 2 * (uint32_t)((sum_odd * 0x3000200010000ULL) >> 48); | 126 | | #else | 127 | | *sum2 += 2 * (uint32_t)((sum_even * 0x0000100020003ULL) >> 48) | 128 | | + (uint32_t)((sum_even * ADLER32_SWAR_HSUM) >> 48) | 129 | | + 2 * (uint32_t)((sum_odd * 0x1000200030004ULL) >> 48); | 130 | | #endif | 131 | 37.6k | } |
Unexecuted instantiation: adler32_avx512.c:adler32_swar Unexecuted instantiation: adler32_avx512_vnni.c:adler32_swar Unexecuted instantiation: adler32_c.c:adler32_swar Unexecuted instantiation: adler32.c:adler32_swar |
132 | | |
133 | | Z_FORCEINLINE static uint32_t adler32_copy_tail(uint32_t adler, uint8_t *dst, const uint8_t *buf, size_t len, |
134 | 301M | uint32_t sum2, const int REBASE, const int MAX_LEN, const int COPY) { |
135 | 301M | if (len) { |
136 | 300M | Z_UNUSED(MAX_LEN); |
137 | | /* Process using packed 64-bit arithmetic when source is aligned */ |
138 | 301M | while (len >= 8 && ((uintptr_t)buf & 7) == 0) { |
139 | 165k | size_t chunk = MIN(ALIGN_DOWN(len, (size_t)8), (size_t)ADLER32_SWAR_MAX_BYTES); |
140 | 165k | adler32_swar(&adler, dst, buf, chunk, &sum2, COPY); |
141 | 165k | buf += chunk; |
142 | 165k | if (COPY) |
143 | 18.9k | dst += chunk; |
144 | 165k | len -= chunk; |
145 | 165k | } |
146 | | /* DO4 loop avoids GCC x86 register pressure from hoisted DO8/DO16 loads. */ |
147 | 302M | while (len >= 4) { |
148 | 1.22M | if (COPY) { |
149 | 28.1k | memcpy(dst, buf, 4); |
150 | 28.1k | dst += 4; |
151 | 28.1k | } |
152 | 1.22M | len -= 4; |
153 | 1.22M | ADLER_DO4(adler, sum2, buf, 0); |
154 | 1.22M | buf += 4; |
155 | 1.22M | } |
156 | 300M | if (len & 2) { |
157 | 1.52M | if (COPY) { |
158 | 28.5k | memcpy(dst, buf, 2); |
159 | 28.5k | dst += 2; |
160 | 28.5k | } |
161 | 1.52M | ADLER_DO2(adler, sum2, buf, 0); |
162 | 1.52M | buf += 2; |
163 | 1.52M | } |
164 | 300M | if (len & 1) { |
165 | 299M | if (COPY) |
166 | 297M | *dst = *buf; |
167 | 299M | ADLER_DO1(adler, sum2, buf, 0); |
168 | 299M | } |
169 | 300M | } |
170 | 301M | if (REBASE) { |
171 | 300M | adler %= BASE; |
172 | 300M | sum2 %= BASE; |
173 | 300M | } |
174 | | /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ |
175 | 301M | return adler | (sum2 << 16); |
176 | 301M | } adler32_ssse3.c:adler32_copy_tail Line | Count | Source | 134 | 564k | uint32_t sum2, const int REBASE, const int MAX_LEN, const int COPY) { | 135 | 564k | if (len) { | 136 | 472k | Z_UNUSED(MAX_LEN); | 137 | | /* Process using packed 64-bit arithmetic when source is aligned */ | 138 | 591k | while (len >= 8 && ((uintptr_t)buf & 7) == 0) { | 139 | 118k | size_t chunk = MIN(ALIGN_DOWN(len, (size_t)8), (size_t)ADLER32_SWAR_MAX_BYTES); | 140 | 118k | adler32_swar(&adler, dst, buf, chunk, &sum2, COPY); | 141 | 118k | buf += chunk; | 142 | 118k | if (COPY) | 143 | 0 | dst += chunk; | 144 | 118k | len -= chunk; | 145 | 118k | } | 146 | | /* DO4 loop avoids GCC x86 register pressure from hoisted DO8/DO16 loads. */ | 147 | 1.55M | while (len >= 4) { | 148 | 1.08M | if (COPY) { | 149 | 0 | memcpy(dst, buf, 4); | 150 | 0 | dst += 4; | 151 | 0 | } | 152 | 1.08M | len -= 4; | 153 | 1.08M | ADLER_DO4(adler, sum2, buf, 0); | 154 | 1.08M | buf += 4; | 155 | 1.08M | } | 156 | 472k | if (len & 2) { | 157 | 384k | if (COPY) { | 158 | 0 | memcpy(dst, buf, 2); | 159 | 0 | dst += 2; | 160 | 0 | } | 161 | 384k | ADLER_DO2(adler, sum2, buf, 0); | 162 | 384k | buf += 2; | 163 | 384k | } | 164 | 472k | if (len & 1) { | 165 | 144k | if (COPY) | 166 | 0 | *dst = *buf; | 167 | 144k | ADLER_DO1(adler, sum2, buf, 0); | 168 | 144k | } | 169 | 472k | } | 170 | 564k | if (REBASE) { | 171 | 472k | adler %= BASE; | 172 | 472k | sum2 %= BASE; | 173 | 472k | } | 174 | | /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ | 175 | 564k | return adler | (sum2 << 16); | 176 | 564k | } |
adler32_sse42.c:adler32_copy_tail Line | Count | Source | 134 | 19.4k | uint32_t sum2, const int REBASE, const int MAX_LEN, const int COPY) { | 135 | 19.4k | if (len) { | 136 | 19.4k | Z_UNUSED(MAX_LEN); | 137 | | /* Process using packed 64-bit arithmetic when source is aligned */ | 138 | 28.5k | while (len >= 8 && ((uintptr_t)buf & 7) == 0) { | 139 | 9.11k | size_t chunk = MIN(ALIGN_DOWN(len, (size_t)8), (size_t)ADLER32_SWAR_MAX_BYTES); | 140 | 9.11k | adler32_swar(&adler, dst, buf, chunk, &sum2, COPY); | 141 | 9.11k | buf += chunk; | 142 | 9.11k | if (COPY) | 143 | 9.11k | dst += chunk; | 144 | 9.11k | len -= chunk; | 145 | 9.11k | } | 146 | | /* DO4 loop avoids GCC x86 register pressure from hoisted DO8/DO16 loads. */ | 147 | 32.4k | while (len >= 4) { | 148 | 12.9k | if (COPY) { | 149 | 12.9k | memcpy(dst, buf, 4); | 150 | 12.9k | dst += 4; | 151 | 12.9k | } | 152 | 12.9k | len -= 4; | 153 | 12.9k | ADLER_DO4(adler, sum2, buf, 0); | 154 | 12.9k | buf += 4; | 155 | 12.9k | } | 156 | 19.4k | if (len & 2) { | 157 | 11.1k | if (COPY) { | 158 | 11.1k | memcpy(dst, buf, 2); | 159 | 11.1k | dst += 2; | 160 | 11.1k | } | 161 | 11.1k | ADLER_DO2(adler, sum2, buf, 0); | 162 | 11.1k | buf += 2; | 163 | 11.1k | } | 164 | 19.4k | if (len & 1) { | 165 | 9.88k | if (COPY) | 166 | 9.88k | *dst = *buf; | 167 | 9.88k | ADLER_DO1(adler, sum2, buf, 0); | 168 | 9.88k | } | 169 | 19.4k | } | 170 | 19.4k | if (REBASE) { | 171 | 19.4k | adler %= BASE; | 172 | 19.4k | sum2 %= BASE; | 173 | 19.4k | } | 174 | | /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ | 175 | 19.4k | return adler | (sum2 << 16); | 176 | 19.4k | } |
adler32_avx2.c:adler32_copy_tail Line | Count | Source | 134 | 300M | uint32_t sum2, const int REBASE, const int MAX_LEN, const int COPY) { | 135 | 300M | if (len) { | 136 | 300M | Z_UNUSED(MAX_LEN); | 137 | | /* Process using packed 64-bit arithmetic when source is aligned */ | 138 | 300M | while (len >= 8 && ((uintptr_t)buf & 7) == 0) { | 139 | 37.6k | size_t chunk = MIN(ALIGN_DOWN(len, (size_t)8), (size_t)ADLER32_SWAR_MAX_BYTES); | 140 | 37.6k | adler32_swar(&adler, dst, buf, chunk, &sum2, COPY); | 141 | 37.6k | buf += chunk; | 142 | 37.6k | if (COPY) | 143 | 9.87k | dst += chunk; | 144 | 37.6k | len -= chunk; | 145 | 37.6k | } | 146 | | /* DO4 loop avoids GCC x86 register pressure from hoisted DO8/DO16 loads. */ | 147 | 300M | while (len >= 4) { | 148 | 129k | if (COPY) { | 149 | 15.1k | memcpy(dst, buf, 4); | 150 | 15.1k | dst += 4; | 151 | 15.1k | } | 152 | 129k | len -= 4; | 153 | 129k | ADLER_DO4(adler, sum2, buf, 0); | 154 | 129k | buf += 4; | 155 | 129k | } | 156 | 300M | if (len & 2) { | 157 | 1.12M | if (COPY) { | 158 | 17.3k | memcpy(dst, buf, 2); | 159 | 17.3k | dst += 2; | 160 | 17.3k | } | 161 | 1.12M | ADLER_DO2(adler, sum2, buf, 0); | 162 | 1.12M | buf += 2; | 163 | 1.12M | } | 164 | 300M | if (len & 1) { | 165 | 299M | if (COPY) | 166 | 297M | *dst = *buf; | 167 | 299M | ADLER_DO1(adler, sum2, buf, 0); | 168 | 299M | } | 169 | 300M | } | 170 | 300M | if (REBASE) { | 171 | 300M | adler %= BASE; | 172 | 300M | sum2 %= BASE; | 173 | 300M | } | 174 | | /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ | 175 | 300M | return adler | (sum2 << 16); | 176 | 300M | } |
Unexecuted instantiation: adler32_avx512.c:adler32_copy_tail Unexecuted instantiation: adler32_avx512_vnni.c:adler32_copy_tail Unexecuted instantiation: adler32_c.c:adler32_copy_tail Unexecuted instantiation: adler32.c:adler32_copy_tail |
177 | | |
178 | | #endif /* ADLER32_P_H */ |