/src/zlib-ng/arch/generic/crc32_braid_c.c
Line  | Count  | Source  | 
1  |  | /* crc32_braid.c -- compute the CRC-32 of a data stream  | 
2  |  |  * Copyright (C) 1995-2022 Mark Adler  | 
3  |  |  * For conditions of distribution and use, see copyright notice in zlib.h  | 
4  |  |  *  | 
5  |  |  * This interleaved implementation of a CRC makes use of pipelined multiple  | 
6  |  |  * arithmetic-logic units, commonly found in modern CPU cores. It is due to  | 
7  |  |  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.  | 
8  |  |  */  | 
9  |  |  | 
10  |  | #include "zbuild.h"  | 
11  |  | #include "crc32_braid_p.h"  | 
12  |  | #include "crc32_braid_tbl.h"  | 
13  |  |  | 
14  |  | /*  | 
15  |  |   A CRC of a message is computed on BRAID_N braids of words in the message, where  | 
16  |  |   each word consists of BRAID_W bytes (4 or 8). If BRAID_N is 3, for example, then  | 
17  |  |   three running sparse CRCs are calculated respectively on each braid, at these  | 
18  |  |   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...  | 
19  |  |   This is done starting at a word boundary, and continues until as many blocks of  | 
20  |  |   BRAID_N * BRAID_W bytes as are available have been processed. The results are  | 
21  |  |   combined into a single CRC at the end. For this code, BRAID_N must be in the  | 
22  |  |   range 1..6 and BRAID_W must be 4 or 8. The upper limit on BRAID_N can be increased  | 
23  |  |   if desired by adding more #if blocks, extending the patterns apparent in the code.  | 
24  |  |   In addition, crc32 tables would need to be regenerated, if the maximum BRAID_N  | 
25  |  |   value is increased.  | 
26  |  |  | 
27  |  |   BRAID_N and BRAID_W are chosen empirically by benchmarking the execution time  | 
28  |  |   on a given processor. The choices for BRAID_N and BRAID_W below were based on  | 
29  |  |   testing on Intel Kaby Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC  | 
30  |  |   POWER9, and MIPS64 Octeon II processors.  | 
31  |  |   The Intel, AMD, and ARM processors were all fastest with BRAID_N=5, BRAID_W=8.  | 
32  |  |   The Sparc, PowerPC, and MIPS64 were all fastest at BRAID_N=5, BRAID_W=4.  | 
33  |  |   They were all tested with either gcc or clang, all using the -O3 optimization  | 
34  |  |   level. Your mileage may vary.  | 
35  |  | */  | 
36  |  |  | 
37  |  | /* ========================================================================= */  | 
38  |  | #ifdef BRAID_W  | 
39  |  | /*  | 
40  |  |   Return the CRC of the BRAID_W bytes in the word_t data, taking the  | 
41  |  |   least-significant byte of the word as the first byte of data, without any pre  | 
42  |  |   or post conditioning. This is used to combine the CRCs of each braid.  | 
43  |  |  */  | 
44  |  | #  if BYTE_ORDER == LITTLE_ENDIAN  | 
45  | 0  | static uint32_t crc_word(z_word_t data) { | 
46  | 0  |     int k;  | 
47  | 0  |     for (k = 0; k < BRAID_W; k++)  | 
48  | 0  |         data = (data >> 8) ^ crc_table[data & 0xff];  | 
49  | 0  |     return (uint32_t)data;  | 
50  | 0  | }  | 
51  |  | #  elif BYTE_ORDER == BIG_ENDIAN  | 
52  |  | static z_word_t crc_word(z_word_t data) { | 
53  |  |     int k;  | 
54  |  |     for (k = 0; k < BRAID_W; k++)  | 
55  |  |         data = (data << 8) ^  | 
56  |  |             crc_big_table[(data >> ((BRAID_W - 1) << 3)) & 0xff];  | 
57  |  |     return data;  | 
58  |  | }  | 
59  |  | #  endif /* BYTE_ORDER */  | 
60  |  | #endif /* BRAID_W */  | 
61  |  |  | 
62  |  | /* ========================================================================= */  | 
63  | 0  | Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len) { | 
64  |  | 
  | 
65  | 0  | #ifdef BRAID_W  | 
66  |  |     /* If provided enough bytes, do a braided CRC calculation. */  | 
67  | 0  |     if (len >= BRAID_N * BRAID_W + BRAID_W - 1) { | 
68  | 0  |         size_t blks;  | 
69  | 0  |         z_word_t const *words;  | 
70  | 0  |         int k;  | 
71  |  |  | 
72  |  |         /* Compute the CRC up to a z_word_t boundary. */  | 
73  | 0  |         while (len && ((uintptr_t)buf & (BRAID_W - 1)) != 0) { | 
74  | 0  |             len--;  | 
75  | 0  |             CRC_DO1;  | 
76  | 0  |         }  | 
77  |  |  | 
78  |  |         /* Compute the CRC on as many BRAID_N z_word_t blocks as are available. */  | 
79  | 0  |         blks = len / (BRAID_N * BRAID_W);  | 
80  | 0  |         len -= blks * BRAID_N * BRAID_W;  | 
81  | 0  |         words = (z_word_t const *)buf;  | 
82  |  | 
  | 
83  | 0  |         z_word_t crc0, word0, comb;  | 
84  | 0  | #if BRAID_N > 1  | 
85  | 0  |         z_word_t crc1, word1;  | 
86  | 0  | #if BRAID_N > 2  | 
87  | 0  |         z_word_t crc2, word2;  | 
88  | 0  | #if BRAID_N > 3  | 
89  | 0  |         z_word_t crc3, word3;  | 
90  | 0  | #if BRAID_N > 4  | 
91  | 0  |         z_word_t crc4, word4;  | 
92  |  | #if BRAID_N > 5  | 
93  |  |         z_word_t crc5, word5;  | 
94  |  | #endif  | 
95  | 0  | #endif  | 
96  | 0  | #endif  | 
97  | 0  | #endif  | 
98  | 0  | #endif  | 
99  |  |         /* Initialize the CRC for each braid. */  | 
100  | 0  |         crc0 = ZSWAPWORD(c);  | 
101  | 0  | #if BRAID_N > 1  | 
102  | 0  |         crc1 = 0;  | 
103  | 0  | #if BRAID_N > 2  | 
104  | 0  |         crc2 = 0;  | 
105  | 0  | #if BRAID_N > 3  | 
106  | 0  |         crc3 = 0;  | 
107  | 0  | #if BRAID_N > 4  | 
108  | 0  |         crc4 = 0;  | 
109  |  | #if BRAID_N > 5  | 
110  |  |         crc5 = 0;  | 
111  |  | #endif  | 
112  | 0  | #endif  | 
113  | 0  | #endif  | 
114  | 0  | #endif  | 
115  | 0  | #endif  | 
116  |  |         /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */  | 
117  | 0  |         while (--blks) { | 
118  |  |             /* Load the word for each braid into registers. */  | 
119  | 0  |             word0 = crc0 ^ words[0];  | 
120  | 0  | #if BRAID_N > 1  | 
121  | 0  |             word1 = crc1 ^ words[1];  | 
122  | 0  | #if BRAID_N > 2  | 
123  | 0  |             word2 = crc2 ^ words[2];  | 
124  | 0  | #if BRAID_N > 3  | 
125  | 0  |             word3 = crc3 ^ words[3];  | 
126  | 0  | #if BRAID_N > 4  | 
127  | 0  |             word4 = crc4 ^ words[4];  | 
128  |  | #if BRAID_N > 5  | 
129  |  |             word5 = crc5 ^ words[5];  | 
130  |  | #endif  | 
131  | 0  | #endif  | 
132  | 0  | #endif  | 
133  | 0  | #endif  | 
134  | 0  | #endif  | 
135  | 0  |             words += BRAID_N;  | 
136  |  |  | 
137  |  |             /* Compute and update the CRC for each word. The loop should get unrolled. */  | 
138  | 0  |             crc0 = BRAID_TABLE[0][word0 & 0xff];  | 
139  | 0  | #if BRAID_N > 1  | 
140  | 0  |             crc1 = BRAID_TABLE[0][word1 & 0xff];  | 
141  | 0  | #if BRAID_N > 2  | 
142  | 0  |             crc2 = BRAID_TABLE[0][word2 & 0xff];  | 
143  | 0  | #if BRAID_N > 3  | 
144  | 0  |             crc3 = BRAID_TABLE[0][word3 & 0xff];  | 
145  | 0  | #if BRAID_N > 4  | 
146  | 0  |             crc4 = BRAID_TABLE[0][word4 & 0xff];  | 
147  |  | #if BRAID_N > 5  | 
148  |  |             crc5 = BRAID_TABLE[0][word5 & 0xff];  | 
149  |  | #endif  | 
150  | 0  | #endif  | 
151  | 0  | #endif  | 
152  | 0  | #endif  | 
153  | 0  | #endif  | 
154  | 0  |             for (k = 1; k < BRAID_W; k++) { | 
155  | 0  |                 crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];  | 
156  | 0  | #if BRAID_N > 1  | 
157  | 0  |                 crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];  | 
158  | 0  | #if BRAID_N > 2  | 
159  | 0  |                 crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];  | 
160  | 0  | #if BRAID_N > 3  | 
161  | 0  |                 crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];  | 
162  | 0  | #if BRAID_N > 4  | 
163  | 0  |                 crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];  | 
164  |  | #if BRAID_N > 5  | 
165  |  |                 crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];  | 
166  |  | #endif  | 
167  | 0  | #endif  | 
168  | 0  | #endif  | 
169  | 0  | #endif  | 
170  | 0  | #endif  | 
171  | 0  |             }  | 
172  | 0  |         }  | 
173  |  |  | 
174  |  |         /* Process the last block, combining the CRCs of the BRAID_N braids at the same time. */  | 
175  | 0  |         comb = crc_word(crc0 ^ words[0]);  | 
176  | 0  | #if BRAID_N > 1  | 
177  | 0  |         comb = crc_word(crc1 ^ words[1] ^ comb);  | 
178  | 0  | #if BRAID_N > 2  | 
179  | 0  |         comb = crc_word(crc2 ^ words[2] ^ comb);  | 
180  | 0  | #if BRAID_N > 3  | 
181  | 0  |         comb = crc_word(crc3 ^ words[3] ^ comb);  | 
182  | 0  | #if BRAID_N > 4  | 
183  | 0  |         comb = crc_word(crc4 ^ words[4] ^ comb);  | 
184  |  | #if BRAID_N > 5  | 
185  |  |         comb = crc_word(crc5 ^ words[5] ^ comb);  | 
186  |  | #endif  | 
187  | 0  | #endif  | 
188  | 0  | #endif  | 
189  | 0  | #endif  | 
190  | 0  | #endif  | 
191  | 0  |         words += BRAID_N;  | 
192  | 0  |         Assert(comb <= UINT32_MAX, "comb should fit in uint32_t");  | 
193  | 0  |         c = (uint32_t)ZSWAPWORD(comb);  | 
194  |  |  | 
195  |  |         /* Update the pointer to the remaining bytes to process. */  | 
196  | 0  |         buf = (const unsigned char *)words;  | 
197  | 0  |     }  | 
198  |  | 
  | 
199  | 0  | #endif /* BRAID_W */  | 
200  |  |  | 
201  |  |     /* Complete the computation of the CRC on any remaining bytes. */  | 
202  | 0  |     while (len >= 8) { | 
203  | 0  |         len -= 8;  | 
204  | 0  |         CRC_DO8;  | 
205  | 0  |     }  | 
206  | 0  |     while (len) { | 
207  | 0  |         len--;  | 
208  | 0  |         CRC_DO1;  | 
209  | 0  |     }  | 
210  |  |  | 
211  |  |     /* Return the CRC, post-conditioned. */  | 
212  | 0  |     return c;  | 
213  | 0  | }  | 
214  |  |  | 
215  | 0  | Z_INTERNAL uint32_t crc32_braid(uint32_t c, const uint8_t *buf, size_t len) { | 
216  | 0  |     c = (~c) & 0xffffffff;  | 
217  |  | 
  | 
218  | 0  |     c = crc32_braid_internal(c, buf, len);  | 
219  |  |  | 
220  |  |     /* Return the CRC, post-conditioned. */  | 
221  | 0  |     return c ^ 0xffffffff;  | 
222  | 0  | }  |