/src/abseil-cpp/absl/crc/internal/crc.cc
Line | Count | Source |
1 | | // Copyright 2022 The Abseil Authors. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | // Implementation of CRCs (aka Rabin Fingerprints). |
16 | | // Treats the input as a polynomial with coefficients in Z(2), |
17 | | // and finds the remainder when divided by an irreducible polynomial |
18 | | // of the appropriate length. |
19 | | // It handles all CRC sizes from 8 to 128 bits. |
20 | | // It's somewhat complicated by having separate implementations optimized for |
21 | | // CRC's <=32 bits, <= 64 bits, and <= 128 bits. |
22 | | // The input string is prefixed with a "1" bit, and has "degree" "0" bits |
23 | | // appended to it before the remainder is found. This ensures that |
24 | | // short strings are scrambled somewhat and that strings consisting |
25 | | // of all nulls have a non-zero CRC. |
26 | | // |
27 | | // Uses the "interleaved word-by-word" method from |
28 | | // "Everything we know about CRC but afraid to forget" by Andrew Kadatch |
29 | | // and Bob Jenkins, |
30 | | // http://crcutil.googlecode.com/files/crc-doc.1.0.pdf |
31 | | // |
32 | | // The idea is to compute kStride CRCs simultaneously, allowing the |
33 | | // processor to more effectively use multiple execution units. Each of |
34 | | // the CRCs is calculated on one word of data followed by kStride - 1 |
35 | | // words of zeroes; the CRC starting points are staggered by one word. |
36 | | // Assuming a stride of 4 with data words "ABCDABCDABCD", the first |
37 | | // CRC is over A000A000A, the second over 0B000B000B, and so on. |
38 | | // The CRC of the whole data is then calculated by properly aligning the |
39 | | // CRCs by appending zeroes until the data lengths agree then XORing |
40 | | // the CRCs. |
41 | | |
42 | | #include "absl/crc/internal/crc.h" |
43 | | |
44 | | #include <cstdint> |
45 | | |
46 | | #include "absl/base/internal/endian.h" |
47 | | #include "absl/base/internal/raw_logging.h" |
48 | | #include "absl/base/prefetch.h" |
49 | | #include "absl/crc/internal/crc_internal.h" |
50 | | |
51 | | namespace absl { |
52 | | ABSL_NAMESPACE_BEGIN |
53 | | namespace crc_internal { |
54 | | |
55 | | namespace { |
56 | | |
57 | | // Constants |
58 | | #if defined(__i386__) || defined(__x86_64__) |
59 | | constexpr bool kNeedAlignedLoads = false; |
60 | | #else |
61 | | constexpr bool kNeedAlignedLoads = true; |
62 | | #endif |
63 | | |
64 | | // We express the number of zeroes as a number in base ZEROES_BASE. By |
65 | | // pre-computing the zero extensions for all possible components of such an |
66 | | // expression (numbers in a form a*ZEROES_BASE**b), we can calculate the |
67 | | // resulting extension by multiplying the extensions for individual components |
68 | | // using log_{ZEROES_BASE}(num_zeroes) polynomial multiplications. The tables of |
69 | | // zero extensions contain (ZEROES_BASE - 1) * (log_{ZEROES_BASE}(64)) entries. |
70 | | constexpr int ZEROES_BASE_LG = 4; // log_2(ZEROES_BASE) |
71 | | constexpr int ZEROES_BASE = (1 << ZEROES_BASE_LG); // must be a power of 2 |
72 | | |
73 | | constexpr uint32_t kCrc32cPoly = 0x82f63b78; |
74 | | |
75 | 0 | uint32_t ReverseBits(uint32_t bits) { |
76 | 0 | bits = (bits & 0xaaaaaaaau) >> 1 | (bits & 0x55555555u) << 1; |
77 | 0 | bits = (bits & 0xccccccccu) >> 2 | (bits & 0x33333333u) << 2; |
78 | 0 | bits = (bits & 0xf0f0f0f0u) >> 4 | (bits & 0x0f0f0f0fu) << 4; |
79 | 0 | return absl::gbswap_32(bits); |
80 | 0 | } |
81 | | |
82 | | // Polynomial long multiplication mod the polynomial of degree 32. |
83 | 0 | void PolyMultiply(uint32_t* val, uint32_t m, uint32_t poly) { |
84 | 0 | uint32_t l = *val; |
85 | 0 | uint32_t result = 0; |
86 | 0 | auto onebit = uint32_t{0x80000000u}; |
87 | 0 | for (uint32_t one = onebit; one != 0; one >>= 1) { |
88 | 0 | if ((l & one) != 0) { |
89 | 0 | result ^= m; |
90 | 0 | } |
91 | 0 | if (m & 1) { |
92 | 0 | m = (m >> 1) ^ poly; |
93 | 0 | } else { |
94 | 0 | m >>= 1; |
95 | 0 | } |
96 | 0 | } |
97 | 0 | *val = result; |
98 | 0 | } |
99 | | } // namespace |
100 | | |
101 | | void CRCImpl::FillWordTable(uint32_t poly, uint32_t last, int word_size, |
102 | 0 | Uint32By256* t) { |
103 | 0 | for (int j = 0; j != word_size; j++) { // for each byte of extension.... |
104 | 0 | t[j][0] = 0; // a zero has no effect |
105 | 0 | for (int i = 128; i != 0; i >>= 1) { // fill in entries for powers of 2 |
106 | 0 | if (j == 0 && i == 128) { |
107 | 0 | t[j][i] = last; // top bit in last byte is given |
108 | 0 | } else { |
109 | | // each successive power of two is derived from the previous |
110 | | // one, either in this table, or the last table |
111 | 0 | uint32_t pred; |
112 | 0 | if (i == 128) { |
113 | 0 | pred = t[j - 1][1]; |
114 | 0 | } else { |
115 | 0 | pred = t[j][i << 1]; |
116 | 0 | } |
117 | | // Advance the CRC by one bit (multiply by X, and take remainder |
118 | | // through one step of polynomial long division) |
119 | 0 | if (pred & 1) { |
120 | 0 | t[j][i] = (pred >> 1) ^ poly; |
121 | 0 | } else { |
122 | 0 | t[j][i] = pred >> 1; |
123 | 0 | } |
124 | 0 | } |
125 | 0 | } |
126 | | // CRCs have the property that CRC(a xor b) == CRC(a) xor CRC(b) |
127 | | // so we can make all the tables for non-powers of two by |
128 | | // xoring previously created entries. |
129 | 0 | for (int i = 2; i != 256; i <<= 1) { |
130 | 0 | for (int k = i + 1; k != (i << 1); k++) { |
131 | 0 | t[j][k] = t[j][i] ^ t[j][k - i]; |
132 | 0 | } |
133 | 0 | } |
134 | 0 | } |
135 | 0 | } |
136 | | |
137 | 0 | int CRCImpl::FillZeroesTable(uint32_t poly, Uint32By256* t) { |
138 | 0 | uint32_t inc = 1; |
139 | 0 | inc <<= 31; |
140 | | |
141 | | // Extend by one zero bit. We know degree > 1 so (inc & 1) == 0. |
142 | 0 | inc >>= 1; |
143 | | |
144 | | // Now extend by 2, 4, and 8 bits, so now `inc` is extended by one zero byte. |
145 | 0 | for (int i = 0; i < 3; ++i) { |
146 | 0 | PolyMultiply(&inc, inc, poly); |
147 | 0 | } |
148 | |
|
149 | 0 | int j = 0; |
150 | 0 | for (uint64_t inc_len = 1; inc_len != 0; inc_len <<= ZEROES_BASE_LG) { |
151 | | // Every entry in the table adds an additional inc_len zeroes. |
152 | 0 | uint32_t v = inc; |
153 | 0 | for (int a = 1; a != ZEROES_BASE; a++) { |
154 | 0 | t[0][j] = v; |
155 | 0 | PolyMultiply(&v, inc, poly); |
156 | 0 | j++; |
157 | 0 | } |
158 | 0 | inc = v; |
159 | 0 | } |
160 | 0 | ABSL_RAW_CHECK(j <= 256, ""); |
161 | 0 | return j; |
162 | 0 | } |
163 | | |
164 | | // Internal version of the "constructor". |
165 | 0 | CRCImpl* CRCImpl::NewInternal() { |
166 | | // Find an accelearated implementation first. |
167 | 0 | CRCImpl* result = TryNewCRC32AcceleratedX86ARMCombined(); |
168 | | |
169 | | // Fall back to generic implementions if no acceleration is available. |
170 | 0 | if (result == nullptr) { |
171 | 0 | result = new CRC32(); |
172 | 0 | } |
173 | |
|
174 | 0 | result->InitTables(); |
175 | |
|
176 | 0 | return result; |
177 | 0 | } |
178 | | |
179 | | // The 32-bit implementation |
180 | | |
181 | 0 | void CRC32::InitTables() { |
182 | | // Compute the table for extending a CRC by one byte. |
183 | 0 | Uint32By256* t = new Uint32By256[4]; |
184 | 0 | FillWordTable(kCrc32cPoly, kCrc32cPoly, 1, t); |
185 | 0 | for (int i = 0; i != 256; i++) { |
186 | 0 | this->table0_[i] = t[0][i]; |
187 | 0 | } |
188 | | |
189 | | // Construct a table for updating the CRC by 4 bytes data followed by |
190 | | // 12 bytes of zeroes. |
191 | | // |
192 | | // Note: the data word size could be larger than the CRC size; it might |
193 | | // be slightly faster to use a 64-bit data word, but doing so doubles the |
194 | | // table size. |
195 | 0 | uint32_t last = kCrc32cPoly; |
196 | 0 | const size_t size = 12; |
197 | 0 | for (size_t i = 0; i < size; ++i) { |
198 | 0 | last = (last >> 8) ^ this->table0_[last & 0xff]; |
199 | 0 | } |
200 | 0 | FillWordTable(kCrc32cPoly, last, 4, t); |
201 | 0 | for (size_t b = 0; b < 4; ++b) { |
202 | 0 | for (int i = 0; i < 256; ++i) { |
203 | 0 | this->table_[b][i] = t[b][i]; |
204 | 0 | } |
205 | 0 | } |
206 | |
|
207 | 0 | int j = FillZeroesTable(kCrc32cPoly, t); |
208 | 0 | ABSL_RAW_CHECK(j <= static_cast<int>(ABSL_ARRAYSIZE(this->zeroes_)), ""); |
209 | 0 | for (int i = 0; i < j; i++) { |
210 | 0 | this->zeroes_[i] = t[0][i]; |
211 | 0 | } |
212 | |
|
213 | 0 | delete[] t; |
214 | | |
215 | | // Build up tables for _reversing_ the operation of doing CRC operations on |
216 | | // zero bytes. |
217 | | |
218 | | // In C++, extending `crc` by a single zero bit is done by the following: |
219 | | // (A) bool low_bit_set = (crc & 1); |
220 | | // crc >>= 1; |
221 | | // if (low_bit_set) crc ^= kCrc32cPoly; |
222 | | // |
223 | | // In particular note that the high bit of `crc` after this operation will be |
224 | | // set if and only if the low bit of `crc` was set before it. This means that |
225 | | // no information is lost, and the operation can be reversed, as follows: |
226 | | // (B) bool high_bit_set = (crc & 0x80000000u); |
227 | | // if (high_bit_set) crc ^= kCrc32cPoly; |
228 | | // crc <<= 1; |
229 | | // if (high_bit_set) crc ^= 1; |
230 | | // |
231 | | // Or, equivalently: |
232 | | // (C) bool high_bit_set = (crc & 0x80000000u); |
233 | | // crc <<= 1; |
234 | | // if (high_bit_set) crc ^= ((kCrc32cPoly << 1) ^ 1); |
235 | | // |
236 | | // The last observation is, if we store our checksums in variable `rcrc`, |
237 | | // with order of the bits reversed, the inverse operation becomes: |
238 | | // (D) bool low_bit_set = (rcrc & 1); |
239 | | // rcrc >>= 1; |
240 | | // if (low_bit_set) rcrc ^= ReverseBits((kCrc32cPoly << 1) ^ 1) |
241 | | // |
242 | | // This is the same algorithm (A) that we started with, only with a different |
243 | | // polynomial bit pattern. This means that by building up our tables with |
244 | | // this alternate polynomial, we can apply the CRC algorithms to a |
245 | | // bit-reversed CRC checksum to perform inverse zero-extension. |
246 | |
|
247 | 0 | const uint32_t kCrc32cUnextendPoly = |
248 | 0 | ReverseBits(static_cast<uint32_t>((kCrc32cPoly << 1) ^ 1)); |
249 | 0 | FillWordTable(kCrc32cUnextendPoly, kCrc32cUnextendPoly, 1, &reverse_table0_); |
250 | |
|
251 | 0 | j = FillZeroesTable(kCrc32cUnextendPoly, &reverse_zeroes_); |
252 | 0 | ABSL_RAW_CHECK(j <= static_cast<int>(ABSL_ARRAYSIZE(this->reverse_zeroes_)), |
253 | 0 | ""); |
254 | 0 | } |
255 | | |
256 | 0 | void CRC32::Extend(uint32_t* crc, const void* bytes, size_t length) const { |
257 | 0 | const uint8_t* p = static_cast<const uint8_t*>(bytes); |
258 | 0 | const uint8_t* e = p + length; |
259 | 0 | uint32_t l = *crc; |
260 | |
|
261 | 0 | auto step_one_byte = [this, &p, &l]() { |
262 | 0 | int c = (l & 0xff) ^ *p++; |
263 | 0 | l = this->table0_[c] ^ (l >> 8); |
264 | 0 | }; |
265 | |
|
266 | 0 | if (kNeedAlignedLoads) { |
267 | | // point x at first 4-byte aligned byte in string. this might be past the |
268 | | // end of the string. |
269 | 0 | const uint8_t* x = RoundUp<4>(p); |
270 | 0 | if (x <= e) { |
271 | | // Process bytes until finished or p is 4-byte aligned |
272 | 0 | while (p != x) { |
273 | 0 | step_one_byte(); |
274 | 0 | } |
275 | 0 | } |
276 | 0 | } |
277 | |
|
278 | 0 | const size_t kSwathSize = 16; |
279 | 0 | if (static_cast<size_t>(e - p) >= kSwathSize) { |
280 | | // Load one swath of data into the operating buffers. |
281 | 0 | uint32_t buf0 = absl::little_endian::Load32(p) ^ l; |
282 | 0 | uint32_t buf1 = absl::little_endian::Load32(p + 4); |
283 | 0 | uint32_t buf2 = absl::little_endian::Load32(p + 8); |
284 | 0 | uint32_t buf3 = absl::little_endian::Load32(p + 12); |
285 | 0 | p += kSwathSize; |
286 | | |
287 | | // Increment a CRC value by a "swath"; this combines the four bytes |
288 | | // starting at `ptr` and twelve zero bytes, so that four CRCs can be |
289 | | // built incrementally and combined at the end. |
290 | 0 | const auto step_swath = [this](uint32_t crc_in, const std::uint8_t* ptr) { |
291 | 0 | return absl::little_endian::Load32(ptr) ^ |
292 | 0 | this->table_[3][crc_in & 0xff] ^ |
293 | 0 | this->table_[2][(crc_in >> 8) & 0xff] ^ |
294 | 0 | this->table_[1][(crc_in >> 16) & 0xff] ^ |
295 | 0 | this->table_[0][crc_in >> 24]; |
296 | 0 | }; |
297 | | |
298 | | // Run one CRC calculation step over all swaths in one 16-byte stride |
299 | 0 | const auto step_stride = [&]() { |
300 | 0 | buf0 = step_swath(buf0, p); |
301 | 0 | buf1 = step_swath(buf1, p + 4); |
302 | 0 | buf2 = step_swath(buf2, p + 8); |
303 | 0 | buf3 = step_swath(buf3, p + 12); |
304 | 0 | p += 16; |
305 | 0 | }; |
306 | | |
307 | | // Process kStride interleaved swaths through the data in parallel. |
308 | 0 | while ((e - p) > kPrefetchHorizon) { |
309 | 0 | PrefetchToLocalCacheNta( |
310 | 0 | reinterpret_cast<const void*>(p + kPrefetchHorizon)); |
311 | | // Process 64 bytes at a time |
312 | 0 | step_stride(); |
313 | 0 | step_stride(); |
314 | 0 | step_stride(); |
315 | 0 | step_stride(); |
316 | 0 | } |
317 | 0 | while (static_cast<size_t>(e - p) >= kSwathSize) { |
318 | 0 | step_stride(); |
319 | 0 | } |
320 | | |
321 | | // Now advance one word at a time as far as possible. This isn't worth |
322 | | // doing if we have word-advance tables. |
323 | 0 | while (static_cast<size_t>(e - p) >= 4) { |
324 | 0 | buf0 = step_swath(buf0, p); |
325 | 0 | uint32_t tmp = buf0; |
326 | 0 | buf0 = buf1; |
327 | 0 | buf1 = buf2; |
328 | 0 | buf2 = buf3; |
329 | 0 | buf3 = tmp; |
330 | 0 | p += 4; |
331 | 0 | } |
332 | | |
333 | | // Combine the results from the different swaths. This is just a CRC |
334 | | // on the data values in the bufX words. |
335 | 0 | auto combine_one_word = [this](uint32_t crc_in, uint32_t w) { |
336 | 0 | w ^= crc_in; |
337 | 0 | for (size_t i = 0; i < 4; ++i) { |
338 | 0 | w = (w >> 8) ^ this->table0_[w & 0xff]; |
339 | 0 | } |
340 | 0 | return w; |
341 | 0 | }; |
342 | |
|
343 | 0 | l = combine_one_word(0, buf0); |
344 | 0 | l = combine_one_word(l, buf1); |
345 | 0 | l = combine_one_word(l, buf2); |
346 | 0 | l = combine_one_word(l, buf3); |
347 | 0 | } |
348 | | |
349 | | // Process the last few bytes |
350 | 0 | while (p != e) { |
351 | 0 | step_one_byte(); |
352 | 0 | } |
353 | |
|
354 | 0 | *crc = l; |
355 | 0 | } |
356 | | |
357 | | void CRC32::ExtendByZeroesImpl(uint32_t* crc, size_t length, |
358 | | const uint32_t zeroes_table[256], |
359 | 0 | const uint32_t poly_table[256]) { |
360 | 0 | if (length != 0) { |
361 | 0 | uint32_t l = *crc; |
362 | | // For each ZEROES_BASE_LG bits in length |
363 | | // (after the low-order bits have been removed) |
364 | | // we lookup the appropriate polynomial in the zeroes_ array |
365 | | // and do a polynomial long multiplication (mod the CRC polynomial) |
366 | | // to extend the CRC by the appropriate number of bits. |
367 | 0 | for (int i = 0; length != 0; |
368 | 0 | i += ZEROES_BASE - 1, length >>= ZEROES_BASE_LG) { |
369 | 0 | int c = length & (ZEROES_BASE - 1); // pick next ZEROES_BASE_LG bits |
370 | 0 | if (c != 0) { // if they are not zero, |
371 | | // multiply by entry in table |
372 | | // Build a table to aid in multiplying 2 bits at a time. |
373 | | // It takes too long to build tables for more bits. |
374 | 0 | uint64_t m = zeroes_table[c + i - 1]; |
375 | 0 | m <<= 1; |
376 | 0 | uint64_t m2 = m << 1; |
377 | 0 | uint64_t mtab[4] = {0, m, m2, m2 ^ m}; |
378 | | |
379 | | // Do the multiply one byte at a time. |
380 | 0 | uint64_t result = 0; |
381 | 0 | for (int x = 0; x < 32; x += 8) { |
382 | | // The carry-less multiply. |
383 | 0 | result ^= mtab[l & 3] ^ (mtab[(l >> 2) & 3] << 2) ^ |
384 | 0 | (mtab[(l >> 4) & 3] << 4) ^ (mtab[(l >> 6) & 3] << 6); |
385 | 0 | l >>= 8; |
386 | | |
387 | | // Reduce modulo the polynomial |
388 | 0 | result = (result >> 8) ^ poly_table[result & 0xff]; |
389 | 0 | } |
390 | 0 | l = static_cast<uint32_t>(result); |
391 | 0 | } |
392 | 0 | } |
393 | 0 | *crc = l; |
394 | 0 | } |
395 | 0 | } |
396 | | |
397 | 0 | void CRC32::ExtendByZeroes(uint32_t* crc, size_t length) const { |
398 | 0 | return CRC32::ExtendByZeroesImpl(crc, length, zeroes_, table0_); |
399 | 0 | } |
400 | | |
401 | 0 | void CRC32::UnextendByZeroes(uint32_t* crc, size_t length) const { |
402 | | // See the comment in CRC32::InitTables() for an explanation of the algorithm |
403 | | // below. |
404 | 0 | *crc = ReverseBits(*crc); |
405 | 0 | ExtendByZeroesImpl(crc, length, reverse_zeroes_, reverse_table0_); |
406 | 0 | *crc = ReverseBits(*crc); |
407 | 0 | } |
408 | | |
409 | 0 | void CRC32::Scramble(uint32_t* crc) const { |
410 | | // Rotate by near half the word size plus 1. See the scramble comment in |
411 | | // crc_internal.h for an explanation. |
412 | 0 | constexpr int scramble_rotate = (32 / 2) + 1; |
413 | 0 | *crc = RotateRight<uint32_t>(static_cast<unsigned int>(*crc + kScrambleLo), |
414 | 0 | 32, scramble_rotate) & |
415 | 0 | MaskOfLength<uint32_t>(32); |
416 | 0 | } |
417 | | |
418 | 0 | void CRC32::Unscramble(uint32_t* crc) const { |
419 | 0 | constexpr int scramble_rotate = (32 / 2) + 1; |
420 | 0 | uint64_t rotated = RotateRight<uint32_t>(static_cast<unsigned int>(*crc), 32, |
421 | 0 | 32 - scramble_rotate); |
422 | 0 | *crc = (rotated - kScrambleLo) & MaskOfLength<uint32_t>(32); |
423 | 0 | } |
424 | | |
425 | | // Constructor and destructor for base class CRC. |
426 | 0 | CRC::~CRC() {} |
427 | 0 | CRC::CRC() {} |
428 | | |
429 | | // The "constructor" for a CRC32C with a standard polynomial. |
430 | 0 | CRC* CRC::Crc32c() { |
431 | 0 | static CRC* singleton = CRCImpl::NewInternal(); |
432 | 0 | return singleton; |
433 | 0 | } |
434 | | |
435 | | } // namespace crc_internal |
436 | | ABSL_NAMESPACE_END |
437 | | } // namespace absl |