/src/zlib-ng/arch/generic/crc32_braid_c.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* crc32_braid.c -- compute the CRC-32 of a data stream |
2 | | * Copyright (C) 1995-2022 Mark Adler |
3 | | * For conditions of distribution and use, see copyright notice in zlib.h |
4 | | * |
5 | | * This interleaved implementation of a CRC makes use of pipelined multiple |
6 | | * arithmetic-logic units, commonly found in modern CPU cores. It is due to |
7 | | * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution. |
8 | | */ |
9 | | |
10 | | #include "zbuild.h" |
11 | | #include "crc32_braid_p.h" |
12 | | #include "crc32_braid_tbl.h" |
13 | | |
14 | | /* |
15 | | A CRC of a message is computed on BRAID_N braids of words in the message, where |
16 | | each word consists of BRAID_W bytes (4 or 8). If BRAID_N is 3, for example, then |
17 | | three running sparse CRCs are calculated respectively on each braid, at these |
18 | | indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ... |
19 | | This is done starting at a word boundary, and continues until as many blocks of |
20 | | BRAID_N * BRAID_W bytes as are available have been processed. The results are |
21 | | combined into a single CRC at the end. For this code, BRAID_N must be in the |
22 | | range 1..6 and BRAID_W must be 4 or 8. The upper limit on BRAID_N can be increased |
23 | | if desired by adding more #if blocks, extending the patterns apparent in the code. |
24 | | In addition, crc32 tables would need to be regenerated, if the maximum BRAID_N |
25 | | value is increased. |
26 | | |
27 | | BRAID_N and BRAID_W are chosen empirically by benchmarking the execution time |
28 | | on a given processor. The choices for BRAID_N and BRAID_W below were based on |
29 | | testing on Intel Kaby Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC |
30 | | POWER9, and MIPS64 Octeon II processors. |
31 | | The Intel, AMD, and ARM processors were all fastest with BRAID_N=5, BRAID_W=8. |
32 | | The Sparc, PowerPC, and MIPS64 were all fastest at BRAID_N=5, BRAID_W=4. |
33 | | They were all tested with either gcc or clang, all using the -O3 optimization |
34 | | level. Your mileage may vary. |
35 | | */ |
36 | | |
37 | | /* ========================================================================= */ |
38 | | #ifdef BRAID_W |
39 | | /* |
40 | | Return the CRC of the BRAID_W bytes in the word_t data, taking the |
41 | | least-significant byte of the word as the first byte of data, without any pre |
42 | | or post conditioning. This is used to combine the CRCs of each braid. |
43 | | */ |
44 | | # if BYTE_ORDER == LITTLE_ENDIAN |
45 | 0 | static uint32_t crc_word(z_word_t data) { |
46 | 0 | int k; |
47 | 0 | for (k = 0; k < BRAID_W; k++) |
48 | 0 | data = (data >> 8) ^ crc_table[data & 0xff]; |
49 | 0 | return (uint32_t)data; |
50 | 0 | } |
51 | | # elif BYTE_ORDER == BIG_ENDIAN |
52 | | static z_word_t crc_word(z_word_t data) { |
53 | | int k; |
54 | | for (k = 0; k < BRAID_W; k++) |
55 | | data = (data << 8) ^ |
56 | | crc_big_table[(data >> ((BRAID_W - 1) << 3)) & 0xff]; |
57 | | return data; |
58 | | } |
59 | | # endif /* BYTE_ORDER */ |
60 | | #endif /* BRAID_W */ |
61 | | |
62 | | /* ========================================================================= */ |
63 | 0 | Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len) { |
64 | |
|
65 | 0 | #ifdef BRAID_W |
66 | | /* If provided enough bytes, do a braided CRC calculation. */ |
67 | 0 | if (len >= BRAID_N * BRAID_W + BRAID_W - 1) { |
68 | 0 | size_t blks; |
69 | 0 | z_word_t const *words; |
70 | 0 | int k; |
71 | | |
72 | | /* Compute the CRC up to a z_word_t boundary. */ |
73 | 0 | while (len && ((uintptr_t)buf & (BRAID_W - 1)) != 0) { |
74 | 0 | len--; |
75 | 0 | CRC_DO1; |
76 | 0 | } |
77 | | |
78 | | /* Compute the CRC on as many BRAID_N z_word_t blocks as are available. */ |
79 | 0 | blks = len / (BRAID_N * BRAID_W); |
80 | 0 | len -= blks * BRAID_N * BRAID_W; |
81 | 0 | words = (z_word_t const *)buf; |
82 | |
|
83 | 0 | z_word_t crc0, word0, comb; |
84 | 0 | #if BRAID_N > 1 |
85 | 0 | z_word_t crc1, word1; |
86 | 0 | #if BRAID_N > 2 |
87 | 0 | z_word_t crc2, word2; |
88 | 0 | #if BRAID_N > 3 |
89 | 0 | z_word_t crc3, word3; |
90 | 0 | #if BRAID_N > 4 |
91 | 0 | z_word_t crc4, word4; |
92 | | #if BRAID_N > 5 |
93 | | z_word_t crc5, word5; |
94 | | #endif |
95 | 0 | #endif |
96 | 0 | #endif |
97 | 0 | #endif |
98 | 0 | #endif |
99 | | /* Initialize the CRC for each braid. */ |
100 | 0 | crc0 = ZSWAPWORD(c); |
101 | 0 | #if BRAID_N > 1 |
102 | 0 | crc1 = 0; |
103 | 0 | #if BRAID_N > 2 |
104 | 0 | crc2 = 0; |
105 | 0 | #if BRAID_N > 3 |
106 | 0 | crc3 = 0; |
107 | 0 | #if BRAID_N > 4 |
108 | 0 | crc4 = 0; |
109 | | #if BRAID_N > 5 |
110 | | crc5 = 0; |
111 | | #endif |
112 | 0 | #endif |
113 | 0 | #endif |
114 | 0 | #endif |
115 | 0 | #endif |
116 | | /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */ |
117 | 0 | while (--blks) { |
118 | | /* Load the word for each braid into registers. */ |
119 | 0 | word0 = crc0 ^ words[0]; |
120 | 0 | #if BRAID_N > 1 |
121 | 0 | word1 = crc1 ^ words[1]; |
122 | 0 | #if BRAID_N > 2 |
123 | 0 | word2 = crc2 ^ words[2]; |
124 | 0 | #if BRAID_N > 3 |
125 | 0 | word3 = crc3 ^ words[3]; |
126 | 0 | #if BRAID_N > 4 |
127 | 0 | word4 = crc4 ^ words[4]; |
128 | | #if BRAID_N > 5 |
129 | | word5 = crc5 ^ words[5]; |
130 | | #endif |
131 | 0 | #endif |
132 | 0 | #endif |
133 | 0 | #endif |
134 | 0 | #endif |
135 | 0 | words += BRAID_N; |
136 | | |
137 | | /* Compute and update the CRC for each word. The loop should get unrolled. */ |
138 | 0 | crc0 = BRAID_TABLE[0][word0 & 0xff]; |
139 | 0 | #if BRAID_N > 1 |
140 | 0 | crc1 = BRAID_TABLE[0][word1 & 0xff]; |
141 | 0 | #if BRAID_N > 2 |
142 | 0 | crc2 = BRAID_TABLE[0][word2 & 0xff]; |
143 | 0 | #if BRAID_N > 3 |
144 | 0 | crc3 = BRAID_TABLE[0][word3 & 0xff]; |
145 | 0 | #if BRAID_N > 4 |
146 | 0 | crc4 = BRAID_TABLE[0][word4 & 0xff]; |
147 | | #if BRAID_N > 5 |
148 | | crc5 = BRAID_TABLE[0][word5 & 0xff]; |
149 | | #endif |
150 | 0 | #endif |
151 | 0 | #endif |
152 | 0 | #endif |
153 | 0 | #endif |
154 | 0 | for (k = 1; k < BRAID_W; k++) { |
155 | 0 | crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff]; |
156 | 0 | #if BRAID_N > 1 |
157 | 0 | crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff]; |
158 | 0 | #if BRAID_N > 2 |
159 | 0 | crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff]; |
160 | 0 | #if BRAID_N > 3 |
161 | 0 | crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff]; |
162 | 0 | #if BRAID_N > 4 |
163 | 0 | crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff]; |
164 | | #if BRAID_N > 5 |
165 | | crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff]; |
166 | | #endif |
167 | 0 | #endif |
168 | 0 | #endif |
169 | 0 | #endif |
170 | 0 | #endif |
171 | 0 | } |
172 | 0 | } |
173 | | |
174 | | /* Process the last block, combining the CRCs of the BRAID_N braids at the same time. */ |
175 | 0 | comb = crc_word(crc0 ^ words[0]); |
176 | 0 | #if BRAID_N > 1 |
177 | 0 | comb = crc_word(crc1 ^ words[1] ^ comb); |
178 | 0 | #if BRAID_N > 2 |
179 | 0 | comb = crc_word(crc2 ^ words[2] ^ comb); |
180 | 0 | #if BRAID_N > 3 |
181 | 0 | comb = crc_word(crc3 ^ words[3] ^ comb); |
182 | 0 | #if BRAID_N > 4 |
183 | 0 | comb = crc_word(crc4 ^ words[4] ^ comb); |
184 | | #if BRAID_N > 5 |
185 | | comb = crc_word(crc5 ^ words[5] ^ comb); |
186 | | #endif |
187 | 0 | #endif |
188 | 0 | #endif |
189 | 0 | #endif |
190 | 0 | #endif |
191 | 0 | words += BRAID_N; |
192 | 0 | Assert(comb <= UINT32_MAX, "comb should fit in uint32_t"); |
193 | 0 | c = (uint32_t)ZSWAPWORD(comb); |
194 | | |
195 | | /* Update the pointer to the remaining bytes to process. */ |
196 | 0 | buf = (const unsigned char *)words; |
197 | 0 | } |
198 | |
|
199 | 0 | #endif /* BRAID_W */ |
200 | | |
201 | | /* Complete the computation of the CRC on any remaining bytes. */ |
202 | 0 | while (len >= 8) { |
203 | 0 | len -= 8; |
204 | 0 | CRC_DO8; |
205 | 0 | } |
206 | 0 | while (len) { |
207 | 0 | len--; |
208 | 0 | CRC_DO1; |
209 | 0 | } |
210 | | |
211 | | /* Return the CRC, post-conditioned. */ |
212 | 0 | return c; |
213 | 0 | } |
214 | | |
215 | 0 | Z_INTERNAL uint32_t crc32_braid(uint32_t c, const uint8_t *buf, size_t len) { |
216 | 0 | c = (~c) & 0xffffffff; |
217 | |
|
218 | 0 | c = crc32_braid_internal(c, buf, len); |
219 | | |
220 | | /* Return the CRC, post-conditioned. */ |
221 | 0 | return c ^ 0xffffffff; |
222 | 0 | } |