Coverage Report

Created: 2024-05-20 07:14

/src/skia/third_party/externals/zlib/crc32.c
Line
Count
Source (jump to first uncovered line)
1
/* crc32.c -- compute the CRC-32 of a data stream
2
 * Copyright (C) 1995-2022 Mark Adler
3
 * For conditions of distribution and use, see copyright notice in zlib.h
4
 *
5
 * This interleaved implementation of a CRC makes use of pipelined multiple
6
 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7
 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8
 */
9
10
/* @(#) $Id$ */
11
12
/*
13
  Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
14
  protection on the static variables used to control the first-use generation
15
  of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
16
  first call get_crc_table() to initialize the tables before allowing more than
17
  one thread to use crc32().
18
19
  MAKECRCH can be #defined to write out crc32.h. A main() routine is also
20
  produced, so that this one source file can be compiled to an executable.
21
 */
22
23
#ifdef MAKECRCH
24
#  include <stdio.h>
25
#  ifndef DYNAMIC_CRC_TABLE
26
#    define DYNAMIC_CRC_TABLE
27
#  endif /* !DYNAMIC_CRC_TABLE */
28
#endif /* MAKECRCH */
29
30
#include "deflate.h"
31
#include "cpu_features.h"
32
#include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
33
34
#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
35
#include "crc32_simd.h"
36
#endif
37
38
 /*
39
  A CRC of a message is computed on N braids of words in the message, where
40
  each word consists of W bytes (4 or 8). If N is 3, for example, then three
41
  running sparse CRCs are calculated respectively on each braid, at these
42
  indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
43
  This is done starting at a word boundary, and continues until as many blocks
44
  of N * W bytes as are available have been processed. The results are combined
45
  into a single CRC at the end. For this code, N must be in the range 1..6 and
46
  W must be 4 or 8. The upper limit on N can be increased if desired by adding
47
  more #if blocks, extending the patterns apparent in the code. In addition,
48
  crc32.h would need to be regenerated, if the maximum N value is increased.
49
50
  N and W are chosen empirically by benchmarking the execution time on a given
51
  processor. The choices for N and W below were based on testing on Intel Kaby
52
  Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
53
  Octeon II processors. The Intel, AMD, and ARM processors were all fastest
54
  with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
55
  They were all tested with either gcc or clang, all using the -O3 optimization
56
  level. Your mileage may vary.
57
 */
58
59
/* Define N */
60
#ifdef Z_TESTN
61
#  define N Z_TESTN
62
#else
63
655k
#  define N 5
64
#endif
65
#if N < 1 || N > 6
66
#  error N must be in 1..6
67
#endif
68
69
/*
70
  z_crc_t must be at least 32 bits. z_word_t must be at least as long as
71
  z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
72
  that bytes are eight bits.
73
 */
74
75
/*
76
  Define W and the associated z_word_t type. If W is not defined, then a
77
  braided calculation is not used, and the associated tables and code are not
78
  compiled.
79
 */
80
#ifdef Z_TESTW
81
#  if Z_TESTW-1 != -1
82
#    define W Z_TESTW
83
#  endif
84
#else
85
#  ifdef MAKECRCH
86
#    define W 8         /* required for MAKECRCH */
87
#  else
88
#    if defined(__x86_64__) || defined(__aarch64__)
89
1.40M
#      define W 8
90
#    else
91
#      define W 4
92
#    endif
93
#  endif
94
#endif
95
#ifdef W
96
#  if W == 8 && defined(Z_U8)
97
     typedef Z_U8 z_word_t;
98
#  elif defined(Z_U4)
99
#    undef W
100
#    define W 4
101
     typedef Z_U4 z_word_t;
102
#  else
103
#    undef W
104
#  endif
105
#endif
106
107
/* If available, use the ARM processor CRC32 instruction. */
108
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8 \
109
    && defined(USE_CANONICAL_ARMV8_CRC32)
110
#  define ARMCRC32_CANONICAL_ZLIB
111
#endif
112
113
#if defined(W) && (!defined(ARMCRC32_CANONICAL_ZLIB) || defined(DYNAMIC_CRC_TABLE))
114
/*
115
  Swap the bytes in a z_word_t to convert between little and big endian. Any
116
  self-respecting compiler will optimize this to a single machine byte-swap
117
  instruction, if one is available. This assumes that word_t is either 32 bits
118
  or 64 bits.
119
 */
120
0
local z_word_t byte_swap(z_word_t word) {
121
0
#  if W == 8
122
0
    return
123
0
        (word & 0xff00000000000000) >> 56 |
124
0
        (word & 0xff000000000000) >> 40 |
125
0
        (word & 0xff0000000000) >> 24 |
126
0
        (word & 0xff00000000) >> 8 |
127
0
        (word & 0xff000000) << 8 |
128
0
        (word & 0xff0000) << 24 |
129
0
        (word & 0xff00) << 40 |
130
0
        (word & 0xff) << 56;
131
#  else   /* W == 4 */
132
    return
133
        (word & 0xff000000) >> 24 |
134
        (word & 0xff0000) >> 8 |
135
        (word & 0xff00) << 8 |
136
        (word & 0xff) << 24;
137
#  endif
138
0
}
139
#endif
140
141
#ifdef DYNAMIC_CRC_TABLE
142
/* =========================================================================
143
 * Table of powers of x for combining CRC-32s, filled in by make_crc_table()
144
 * below.
145
 */
146
   local z_crc_t FAR x2n_table[32];
147
#else
148
/* =========================================================================
149
 * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
150
 * of x for combining CRC-32s, all made by make_crc_table().
151
 */
152
#  include "crc32.h"
153
#endif
154
155
/* CRC polynomial. */
156
0
#define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
157
158
/*
159
  Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
160
  reflected. For speed, this requires that a not be zero.
161
 */
162
0
local z_crc_t multmodp(z_crc_t a, z_crc_t b) {
163
0
    z_crc_t m, p;
164
165
0
    m = (z_crc_t)1 << 31;
166
0
    p = 0;
167
0
    for (;;) {
168
0
        if (a & m) {
169
0
            p ^= b;
170
0
            if ((a & (m - 1)) == 0)
171
0
                break;
172
0
        }
173
0
        m >>= 1;
174
0
        b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
175
0
    }
176
0
    return p;
177
0
}
178
179
/*
180
  Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
181
  initialized.
182
 */
183
0
local z_crc_t x2nmodp(z_off64_t n, unsigned k) {
184
0
    z_crc_t p;
185
186
0
    p = (z_crc_t)1 << 31;           /* x^0 == 1 */
187
0
    while (n) {
188
0
        if (n & 1)
189
0
            p = multmodp(x2n_table[k & 31], p);
190
0
        n >>= 1;
191
0
        k++;
192
0
    }
193
0
    return p;
194
0
}
195
196
#ifdef DYNAMIC_CRC_TABLE
197
/* =========================================================================
198
 * Build the tables for byte-wise and braided CRC-32 calculations, and a table
199
 * of powers of x for combining CRC-32s.
200
 */
201
local z_crc_t FAR crc_table[256];
202
#ifdef W
203
   local z_word_t FAR crc_big_table[256];
204
   local z_crc_t FAR crc_braid_table[W][256];
205
   local z_word_t FAR crc_braid_big_table[W][256];
206
   local void braid(z_crc_t [][256], z_word_t [][256], int, int);
207
#endif
208
#ifdef MAKECRCH
209
   local void write_table(FILE *, const z_crc_t FAR *, int);
210
   local void write_table32hi(FILE *, const z_word_t FAR *, int);
211
   local void write_table64(FILE *, const z_word_t FAR *, int);
212
#endif /* MAKECRCH */
213
214
/*
215
  Define a once() function depending on the availability of atomics. If this is
216
  compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
217
  multiple threads, and if atomics are not available, then get_crc_table() must
218
  be called to initialize the tables and must return before any threads are
219
  allowed to compute or combine CRCs.
220
 */
221
222
/* Definition of once functionality. */
223
typedef struct once_s once_t;
224
225
/* Check for the availability of atomics. */
226
#if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
227
    !defined(__STDC_NO_ATOMICS__)
228
229
#include <stdatomic.h>
230
231
/* Structure for once(), which must be initialized with ONCE_INIT. */
232
struct once_s {
233
    atomic_flag begun;
234
    atomic_int done;
235
};
236
#define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
237
238
/*
239
  Run the provided init() function exactly once, even if multiple threads
240
  invoke once() at the same time. The state must be a once_t initialized with
241
  ONCE_INIT.
242
 */
243
local void once(once_t *state, void (*init)(void)) {
244
    if (!atomic_load(&state->done)) {
245
        if (atomic_flag_test_and_set(&state->begun))
246
            while (!atomic_load(&state->done))
247
                ;
248
        else {
249
            init();
250
            atomic_store(&state->done, 1);
251
        }
252
    }
253
}
254
255
#else   /* no atomics */
256
257
/* Structure for once(), which must be initialized with ONCE_INIT. */
258
struct once_s {
259
    volatile int begun;
260
    volatile int done;
261
};
262
#define ONCE_INIT {0, 0}
263
264
/* Test and set. Alas, not atomic, but tries to minimize the period of
265
   vulnerability. */
266
local int test_and_set(int volatile *flag) {
267
    int was;
268
269
    was = *flag;
270
    *flag = 1;
271
    return was;
272
}
273
274
/* Run the provided init() function once. This is not thread-safe. */
275
local void once(once_t *state, void (*init)(void)) {
276
    if (!state->done) {
277
        if (test_and_set(&state->begun))
278
            while (!state->done)
279
                ;
280
        else {
281
            init();
282
            state->done = 1;
283
        }
284
    }
285
}
286
287
#endif
288
289
/* State for once(). */
290
local once_t made = ONCE_INIT;
291
292
/*
293
  Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
294
  x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
295
296
  Polynomials over GF(2) are represented in binary, one bit per coefficient,
297
  with the lowest powers in the most significant bit. Then adding polynomials
298
  is just exclusive-or, and multiplying a polynomial by x is a right shift by
299
  one. If we call the above polynomial p, and represent a byte as the
300
  polynomial q, also with the lowest power in the most significant bit (so the
301
  byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
302
  where a mod b means the remainder after dividing a by b.
303
304
  This calculation is done using the shift-register method of multiplying and
305
  taking the remainder. The register is initialized to zero, and for each
306
  incoming bit, x^32 is added mod p to the register if the bit is a one (where
307
  x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
308
  (which is shifting right by one and adding x^32 mod p if the bit shifted out
309
  is a one). We start with the highest power (least significant bit) of q and
310
  repeat for all eight bits of q.
311
312
  The table is simply the CRC of all possible eight bit values. This is all the
313
  information needed to generate CRCs on data a byte at a time for all
314
  combinations of CRC register values and incoming bytes.
315
 */
316
local void make_crc_table(void)
317
{
318
    unsigned i, j, n;
319
    z_crc_t p;
320
321
    /* initialize the CRC of bytes tables */
322
    for (i = 0; i < 256; i++) {
323
        p = i;
324
        for (j = 0; j < 8; j++)
325
            p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
326
        crc_table[i] = p;
327
#ifdef W
328
        crc_big_table[i] = byte_swap(p);
329
#endif
330
    }
331
332
    /* initialize the x^2^n mod p(x) table */
333
    p = (z_crc_t)1 << 30;         /* x^1 */
334
    x2n_table[0] = p;
335
    for (n = 1; n < 32; n++)
336
        x2n_table[n] = p = multmodp(p, p);
337
338
#ifdef W
339
    /* initialize the braiding tables -- needs x2n_table[] */
340
    braid(crc_braid_table, crc_braid_big_table, N, W);
341
#endif
342
343
#ifdef MAKECRCH
344
    {
345
        /*
346
          The crc32.h header file contains tables for both 32-bit and 64-bit
347
          z_word_t's, and so requires a 64-bit type be available. In that case,
348
          z_word_t must be defined to be 64-bits. This code then also generates
349
          and writes out the tables for the case that z_word_t is 32 bits.
350
         */
351
#if !defined(W) || W != 8
352
#  error Need a 64-bit integer type in order to generate crc32.h.
353
#endif
354
        FILE *out;
355
        int k, n;
356
        z_crc_t ltl[8][256];
357
        z_word_t big[8][256];
358
359
        out = fopen("crc32.h", "w");
360
        if (out == NULL) return;
361
362
        /* write out little-endian CRC table to crc32.h */
363
        fprintf(out,
364
            "/* crc32.h -- tables for rapid CRC calculation\n"
365
            " * Generated automatically by crc32.c\n */\n"
366
            "\n"
367
            "local const z_crc_t FAR crc_table[] = {\n"
368
            "    ");
369
        write_table(out, crc_table, 256);
370
        fprintf(out,
371
            "};\n");
372
373
        /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
374
        fprintf(out,
375
            "\n"
376
            "#ifdef W\n"
377
            "\n"
378
            "#if W == 8\n"
379
            "\n"
380
            "local const z_word_t FAR crc_big_table[] = {\n"
381
            "    ");
382
        write_table64(out, crc_big_table, 256);
383
        fprintf(out,
384
            "};\n");
385
386
        /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
387
        fprintf(out,
388
            "\n"
389
            "#else /* W == 4 */\n"
390
            "\n"
391
            "local const z_word_t FAR crc_big_table[] = {\n"
392
            "    ");
393
        write_table32hi(out, crc_big_table, 256);
394
        fprintf(out,
395
            "};\n"
396
            "\n"
397
            "#endif\n");
398
399
        /* write out braid tables for each value of N */
400
        for (n = 1; n <= 6; n++) {
401
            fprintf(out,
402
            "\n"
403
            "#if N == %d\n", n);
404
405
            /* compute braid tables for this N and 64-bit word_t */
406
            braid(ltl, big, n, 8);
407
408
            /* write out braid tables for 64-bit z_word_t to crc32.h */
409
            fprintf(out,
410
            "\n"
411
            "#if W == 8\n"
412
            "\n"
413
            "local const z_crc_t FAR crc_braid_table[][256] = {\n");
414
            for (k = 0; k < 8; k++) {
415
                fprintf(out, "   {");
416
                write_table(out, ltl[k], 256);
417
                fprintf(out, "}%s", k < 7 ? ",\n" : "");
418
            }
419
            fprintf(out,
420
            "};\n"
421
            "\n"
422
            "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
423
            for (k = 0; k < 8; k++) {
424
                fprintf(out, "   {");
425
                write_table64(out, big[k], 256);
426
                fprintf(out, "}%s", k < 7 ? ",\n" : "");
427
            }
428
            fprintf(out,
429
            "};\n");
430
431
            /* compute braid tables for this N and 32-bit word_t */
432
            braid(ltl, big, n, 4);
433
434
            /* write out braid tables for 32-bit z_word_t to crc32.h */
435
            fprintf(out,
436
            "\n"
437
            "#else /* W == 4 */\n"
438
            "\n"
439
            "local const z_crc_t FAR crc_braid_table[][256] = {\n");
440
            for (k = 0; k < 4; k++) {
441
                fprintf(out, "   {");
442
                write_table(out, ltl[k], 256);
443
                fprintf(out, "}%s", k < 3 ? ",\n" : "");
444
            }
445
            fprintf(out,
446
            "};\n"
447
            "\n"
448
            "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
449
            for (k = 0; k < 4; k++) {
450
                fprintf(out, "   {");
451
                write_table32hi(out, big[k], 256);
452
                fprintf(out, "}%s", k < 3 ? ",\n" : "");
453
            }
454
            fprintf(out,
455
            "};\n"
456
            "\n"
457
            "#endif\n"
458
            "\n"
459
            "#endif\n");
460
        }
461
        fprintf(out,
462
            "\n"
463
            "#endif\n");
464
465
        /* write out zeros operator table to crc32.h */
466
        fprintf(out,
467
            "\n"
468
            "local const z_crc_t FAR x2n_table[] = {\n"
469
            "    ");
470
        write_table(out, x2n_table, 32);
471
        fprintf(out,
472
            "};\n");
473
        fclose(out);
474
    }
475
#endif /* MAKECRCH */
476
}
477
478
#ifdef MAKECRCH
479
480
/*
481
   Write the 32-bit values in table[0..k-1] to out, five per line in
482
   hexadecimal separated by commas.
483
 */
484
local void write_table(FILE *out, const z_crc_t FAR *table, int k) {
485
    int n;
486
487
    for (n = 0; n < k; n++)
488
        fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
489
                (unsigned long)(table[n]),
490
                n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
491
}
492
493
/*
494
   Write the high 32-bits of each value in table[0..k-1] to out, five per line
495
   in hexadecimal separated by commas.
496
 */
497
local void write_table32hi(FILE *out, const z_word_t FAR *table, int k) {
498
    int n;
499
500
    for (n = 0; n < k; n++)
501
        fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
502
                (unsigned long)(table[n] >> 32),
503
                n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
504
}
505
506
/*
507
  Write the 64-bit values in table[0..k-1] to out, three per line in
508
  hexadecimal separated by commas. This assumes that if there is a 64-bit
509
  type, then there is also a long long integer type, and it is at least 64
510
  bits. If not, then the type cast and format string can be adjusted
511
  accordingly.
512
 */
513
local void write_table64(FILE *out, const z_word_t FAR *table, int k) {
514
    int n;
515
516
    for (n = 0; n < k; n++)
517
        fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
518
                (unsigned long long)(table[n]),
519
                n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
520
}
521
522
/* Actually do the deed. */
523
int main(void) {
524
    make_crc_table();
525
    return 0;
526
}
527
528
#endif /* MAKECRCH */
529
530
#ifdef W
531
/*
532
  Generate the little and big-endian braid tables for the given n and z_word_t
533
  size w. Each array must have room for w blocks of 256 elements.
534
 */
535
local void braid(z_crc_t ltl[][256], z_word_t big[][256], int n, int w) {
536
    int k;
537
    z_crc_t i, p, q;
538
    for (k = 0; k < w; k++) {
539
        p = x2nmodp((n * w + 3 - k) << 3, 0);
540
        ltl[k][0] = 0;
541
        big[w - 1 - k][0] = 0;
542
        for (i = 1; i < 256; i++) {
543
            ltl[k][i] = q = multmodp(i << 24, p);
544
            big[w - 1 - k][i] = byte_swap(q);
545
        }
546
    }
547
}
548
#endif
549
550
#endif /* DYNAMIC_CRC_TABLE */
551
552
/* =========================================================================
553
 * This function can be used by asm versions of crc32(), and to force the
554
 * generation of the CRC tables in a threaded application.
555
 */
556
0
const z_crc_t FAR * ZEXPORT get_crc_table(void) {
557
#ifdef DYNAMIC_CRC_TABLE
558
    once(&made, make_crc_table);
559
#endif /* DYNAMIC_CRC_TABLE */
560
0
    return (const z_crc_t FAR *)crc_table;
561
0
}
562
563
/* =========================================================================
564
 * Use ARM machine instructions if available. This will compute the CRC about
565
 * ten times faster than the braided calculation. This code does not check for
566
 * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
567
 * only be defined if the compilation specifies an ARM processor architecture
568
 * that has the instructions. For example, compiling with -march=armv8.1-a or
569
 * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
570
 * instructions.
571
 */
572
#if ARMCRC32_CANONICAL_ZLIB
573
574
/*
575
   Constants empirically determined to maximize speed. These values are from
576
   measurements on a Cortex-A57. Your mileage may vary.
577
 */
578
#define Z_BATCH 3990                /* number of words in a batch */
579
#define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
580
#define Z_BATCH_MIN 800             /* fewest words in a final batch */
581
582
unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
583
                              z_size_t len) {
584
    z_crc_t val;
585
    z_word_t crc1, crc2;
586
    const z_word_t *word;
587
    z_word_t val0, val1, val2;
588
    z_size_t last, last2, i;
589
    z_size_t num;
590
591
    /* Return initial CRC, if requested. */
592
    if (buf == Z_NULL) return 0;
593
594
#ifdef DYNAMIC_CRC_TABLE
595
    once(&made, make_crc_table);
596
#endif /* DYNAMIC_CRC_TABLE */
597
598
    /* Pre-condition the CRC */
599
    crc = (~crc) & 0xffffffff;
600
601
    /* Compute the CRC up to a word boundary. */
602
    while (len && ((z_size_t)buf & 7) != 0) {
603
        len--;
604
        val = *buf++;
605
        __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
606
    }
607
608
    /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
609
    word = (z_word_t const *)buf;
610
    num = len >> 3;
611
    len &= 7;
612
613
    /* Do three interleaved CRCs to realize the throughput of one crc32x
614
       instruction per cycle. Each CRC is calculated on Z_BATCH words. The
615
       three CRCs are combined into a single CRC after each set of batches. */
616
    while (num >= 3 * Z_BATCH) {
617
        crc1 = 0;
618
        crc2 = 0;
619
        for (i = 0; i < Z_BATCH; i++) {
620
            val0 = word[i];
621
            val1 = word[i + Z_BATCH];
622
            val2 = word[i + 2 * Z_BATCH];
623
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
624
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
625
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
626
        }
627
        word += 3 * Z_BATCH;
628
        num -= 3 * Z_BATCH;
629
        crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
630
        crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
631
    }
632
633
    /* Do one last smaller batch with the remaining words, if there are enough
634
       to pay for the combination of CRCs. */
635
    last = num / 3;
636
    if (last >= Z_BATCH_MIN) {
637
        last2 = last << 1;
638
        crc1 = 0;
639
        crc2 = 0;
640
        for (i = 0; i < last; i++) {
641
            val0 = word[i];
642
            val1 = word[i + last];
643
            val2 = word[i + last2];
644
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
645
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
646
            __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
647
        }
648
        word += 3 * last;
649
        num -= 3 * last;
650
        val = x2nmodp(last, 6);
651
        crc = multmodp(val, crc) ^ crc1;
652
        crc = multmodp(val, crc) ^ crc2;
653
    }
654
655
    /* Compute the CRC on any remaining words. */
656
    for (i = 0; i < num; i++) {
657
        val0 = word[i];
658
        __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
659
    }
660
    word += num;
661
662
    /* Complete the CRC on any remaining bytes. */
663
    buf = (const unsigned char FAR *)word;
664
    while (len) {
665
        len--;
666
        val = *buf++;
667
        __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
668
    }
669
670
    /* Return the CRC, post-conditioned. */
671
    return crc ^ 0xffffffff;
672
}
673
674
#else
675
676
#ifdef W
677
678
/*
679
  Return the CRC of the W bytes in the word_t data, taking the
680
  least-significant byte of the word as the first byte of data, without any pre
681
  or post conditioning. This is used to combine the CRCs of each braid.
682
 */
683
11.1k
local z_crc_t crc_word(z_word_t data) {
684
11.1k
    int k;
685
100k
    for (k = 0; k < W; k++)
686
89.2k
        data = (data >> 8) ^ crc_table[data & 0xff];
687
11.1k
    return (z_crc_t)data;
688
11.1k
}
689
690
0
local z_word_t crc_word_big(z_word_t data) {
691
0
    int k;
692
0
    for (k = 0; k < W; k++)
693
0
        data = (data << 8) ^
694
0
            crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
695
0
    return data;
696
0
}
697
698
#endif
699
700
/* ========================================================================= */
701
unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
702
672k
                              z_size_t len) {
703
    /*
704
     * zlib convention is to call crc32(0, NULL, 0); before making
705
     * calls to crc32(). So this is a good, early (and infrequent)
706
     * place to cache CPU features if needed for those later, more
707
     * interesting crc32() calls.
708
     */
709
672k
#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
710
    /*
711
     * Since this routine can be freely used, check CPU features here.
712
     */
713
672k
    if (buf == Z_NULL) {
714
0
        if (!len) /* Assume user is calling crc32(0, NULL, 0); */
715
0
            cpu_check_features();
716
0
        return 0UL;
717
0
    }
718
719
672k
#endif
720
#if defined(CRC32_SIMD_AVX512_PCLMUL)
721
    if (x86_cpu_enable_avx512 && len >= Z_CRC32_AVX512_MINIMUM_LENGTH) {
722
        /* crc32 64-byte chunks */
723
        z_size_t chunk_size = len & ~Z_CRC32_AVX512_CHUNKSIZE_MASK;
724
        crc = ~crc32_avx512_simd_(buf, chunk_size, ~(uint32_t)crc);
725
        /* check remaining data */
726
        len -= chunk_size;
727
        if (!len)
728
            return crc;
729
        /* Fall into the default crc32 for the remaining data. */
730
        buf += chunk_size;
731
    }
732
#elif defined(CRC32_SIMD_SSE42_PCLMUL)
733
672k
    if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
734
        /* crc32 16-byte chunks */
735
90.3k
        z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
736
90.3k
        crc = ~crc32_sse42_simd_(buf, chunk_size, ~(uint32_t)crc);
737
        /* check remaining data */
738
90.3k
        len -= chunk_size;
739
90.3k
        if (!len)
740
24.4k
            return crc;
741
        /* Fall into the default crc32 for the remaining data. */
742
65.9k
        buf += chunk_size;
743
65.9k
    }
744
#elif defined(CRC32_ARMV8_CRC32)
745
    if (arm_cpu_enable_crc32) {
746
#if defined(__aarch64__)
747
        /* PMULL is 64bit only, plus code needs at least a 64 bytes buffer. */
748
        if (arm_cpu_enable_pmull && (len > Z_CRC32_PMULL_MINIMUM_LENGTH)) {
749
            const size_t chunk_size = len & ~Z_CRC32_PMULL_CHUNKSIZE_MASK;
750
            crc = ~armv8_crc32_pmull_little(buf, chunk_size, ~(uint32_t)crc);
751
            /* Check remaining data. */
752
            len -= chunk_size;
753
            if (!len)
754
                return crc;
755
756
            /* Fall through for the remaining data. */
757
            buf += chunk_size;
758
        }
759
#endif
760
        return armv8_crc32_little(buf, len, crc); /* Armv8@32bit or tail. */
761
    }
762
#else
763
    if (buf == Z_NULL) {
764
        return 0UL;
765
    }
766
#endif /* CRC32_SIMD */
767
768
#ifdef DYNAMIC_CRC_TABLE
769
    once(&made, make_crc_table);
770
#endif /* DYNAMIC_CRC_TABLE */
771
    /* Pre-condition the CRC */
772
648k
    crc = (~crc) & 0xffffffff;
773
774
648k
#ifdef W
775
776
    /* If provided enough bytes, do a braided CRC calculation. */
777
648k
    if (len >= N * W + W - 1) {
778
2.23k
        z_size_t blks;
779
2.23k
        z_word_t const *words;
780
2.23k
        unsigned endian;
781
2.23k
        int k;
782
783
        /* Compute the CRC up to a z_word_t boundary. */
784
2.23k
        while (len && ((z_size_t)buf & (W - 1)) != 0) {
785
0
            len--;
786
0
            crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
787
0
        }
788
789
        /* Compute the CRC on as many N z_word_t blocks as are available. */
790
2.23k
        blks = len / (N * W);
791
2.23k
        len -= blks * N * W;
792
2.23k
        words = (z_word_t const *)buf;
793
794
        /* Do endian check at execution time instead of compile time, since ARM
795
           processors can change the endianness at execution time. If the
796
           compiler knows what the endianness will be, it can optimize out the
797
           check and the unused branch. */
798
2.23k
        endian = 1;
799
2.23k
        if (*(unsigned char *)&endian) {
800
            /* Little endian. */
801
802
2.23k
            z_crc_t crc0;
803
2.23k
            z_word_t word0;
804
2.23k
#if N > 1
805
2.23k
            z_crc_t crc1;
806
2.23k
            z_word_t word1;
807
2.23k
#if N > 2
808
2.23k
            z_crc_t crc2;
809
2.23k
            z_word_t word2;
810
2.23k
#if N > 3
811
2.23k
            z_crc_t crc3;
812
2.23k
            z_word_t word3;
813
2.23k
#if N > 4
814
2.23k
            z_crc_t crc4;
815
2.23k
            z_word_t word4;
816
#if N > 5
817
            z_crc_t crc5;
818
            z_word_t word5;
819
#endif
820
2.23k
#endif
821
2.23k
#endif
822
2.23k
#endif
823
2.23k
#endif
824
825
            /* Initialize the CRC for each braid. */
826
2.23k
            crc0 = crc;
827
2.23k
#if N > 1
828
2.23k
            crc1 = 0;
829
2.23k
#if N > 2
830
2.23k
            crc2 = 0;
831
2.23k
#if N > 3
832
2.23k
            crc3 = 0;
833
2.23k
#if N > 4
834
2.23k
            crc4 = 0;
835
#if N > 5
836
            crc5 = 0;
837
#endif
838
2.23k
#endif
839
2.23k
#endif
840
2.23k
#endif
841
2.23k
#endif
842
843
            /*
844
              Process the first blks-1 blocks, computing the CRCs on each braid
845
              independently.
846
             */
847
2.23k
            while (--blks) {
848
                /* Load the word for each braid into registers. */
849
0
                word0 = crc0 ^ words[0];
850
0
#if N > 1
851
0
                word1 = crc1 ^ words[1];
852
0
#if N > 2
853
0
                word2 = crc2 ^ words[2];
854
0
#if N > 3
855
0
                word3 = crc3 ^ words[3];
856
0
#if N > 4
857
0
                word4 = crc4 ^ words[4];
858
#if N > 5
859
                word5 = crc5 ^ words[5];
860
#endif
861
0
#endif
862
0
#endif
863
0
#endif
864
0
#endif
865
0
                words += N;
866
867
                /* Compute and update the CRC for each word. The loop should
868
                   get unrolled. */
869
0
                crc0 = crc_braid_table[0][word0 & 0xff];
870
0
#if N > 1
871
0
                crc1 = crc_braid_table[0][word1 & 0xff];
872
0
#if N > 2
873
0
                crc2 = crc_braid_table[0][word2 & 0xff];
874
0
#if N > 3
875
0
                crc3 = crc_braid_table[0][word3 & 0xff];
876
0
#if N > 4
877
0
                crc4 = crc_braid_table[0][word4 & 0xff];
878
#if N > 5
879
                crc5 = crc_braid_table[0][word5 & 0xff];
880
#endif
881
0
#endif
882
0
#endif
883
0
#endif
884
0
#endif
885
0
                for (k = 1; k < W; k++) {
886
0
                    crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
887
0
#if N > 1
888
0
                    crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
889
0
#if N > 2
890
0
                    crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
891
0
#if N > 3
892
0
                    crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
893
0
#if N > 4
894
0
                    crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
895
#if N > 5
896
                    crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
897
#endif
898
0
#endif
899
0
#endif
900
0
#endif
901
0
#endif
902
0
                }
903
0
            }
904
905
            /*
906
              Process the last block, combining the CRCs of the N braids at the
907
              same time.
908
             */
909
2.23k
            crc = crc_word(crc0 ^ words[0]);
910
2.23k
#if N > 1
911
2.23k
            crc = crc_word(crc1 ^ words[1] ^ crc);
912
2.23k
#if N > 2
913
2.23k
            crc = crc_word(crc2 ^ words[2] ^ crc);
914
2.23k
#if N > 3
915
2.23k
            crc = crc_word(crc3 ^ words[3] ^ crc);
916
2.23k
#if N > 4
917
2.23k
            crc = crc_word(crc4 ^ words[4] ^ crc);
918
#if N > 5
919
            crc = crc_word(crc5 ^ words[5] ^ crc);
920
#endif
921
2.23k
#endif
922
2.23k
#endif
923
2.23k
#endif
924
2.23k
#endif
925
2.23k
            words += N;
926
2.23k
        }
927
0
        else {
928
            /* Big endian. */
929
930
0
            z_word_t crc0, word0, comb;
931
0
#if N > 1
932
0
            z_word_t crc1, word1;
933
0
#if N > 2
934
0
            z_word_t crc2, word2;
935
0
#if N > 3
936
0
            z_word_t crc3, word3;
937
0
#if N > 4
938
0
            z_word_t crc4, word4;
939
#if N > 5
940
            z_word_t crc5, word5;
941
#endif
942
0
#endif
943
0
#endif
944
0
#endif
945
0
#endif
946
947
            /* Initialize the CRC for each braid. */
948
0
            crc0 = byte_swap(crc);
949
0
#if N > 1
950
0
            crc1 = 0;
951
0
#if N > 2
952
0
            crc2 = 0;
953
0
#if N > 3
954
0
            crc3 = 0;
955
0
#if N > 4
956
0
            crc4 = 0;
957
#if N > 5
958
            crc5 = 0;
959
#endif
960
0
#endif
961
0
#endif
962
0
#endif
963
0
#endif
964
965
            /*
966
              Process the first blks-1 blocks, computing the CRCs on each braid
967
              independently.
968
             */
969
0
            while (--blks) {
970
                /* Load the word for each braid into registers. */
971
0
                word0 = crc0 ^ words[0];
972
0
#if N > 1
973
0
                word1 = crc1 ^ words[1];
974
0
#if N > 2
975
0
                word2 = crc2 ^ words[2];
976
0
#if N > 3
977
0
                word3 = crc3 ^ words[3];
978
0
#if N > 4
979
0
                word4 = crc4 ^ words[4];
980
#if N > 5
981
                word5 = crc5 ^ words[5];
982
#endif
983
0
#endif
984
0
#endif
985
0
#endif
986
0
#endif
987
0
                words += N;
988
989
                /* Compute and update the CRC for each word. The loop should
990
                   get unrolled. */
991
0
                crc0 = crc_braid_big_table[0][word0 & 0xff];
992
0
#if N > 1
993
0
                crc1 = crc_braid_big_table[0][word1 & 0xff];
994
0
#if N > 2
995
0
                crc2 = crc_braid_big_table[0][word2 & 0xff];
996
0
#if N > 3
997
0
                crc3 = crc_braid_big_table[0][word3 & 0xff];
998
0
#if N > 4
999
0
                crc4 = crc_braid_big_table[0][word4 & 0xff];
1000
#if N > 5
1001
                crc5 = crc_braid_big_table[0][word5 & 0xff];
1002
#endif
1003
0
#endif
1004
0
#endif
1005
0
#endif
1006
0
#endif
1007
0
                for (k = 1; k < W; k++) {
1008
0
                    crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
1009
0
#if N > 1
1010
0
                    crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
1011
0
#if N > 2
1012
0
                    crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
1013
0
#if N > 3
1014
0
                    crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
1015
0
#if N > 4
1016
0
                    crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
1017
#if N > 5
1018
                    crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
1019
#endif
1020
0
#endif
1021
0
#endif
1022
0
#endif
1023
0
#endif
1024
0
                }
1025
0
            }
1026
1027
            /*
1028
              Process the last block, combining the CRCs of the N braids at the
1029
              same time.
1030
             */
1031
0
            comb = crc_word_big(crc0 ^ words[0]);
1032
0
#if N > 1
1033
0
            comb = crc_word_big(crc1 ^ words[1] ^ comb);
1034
0
#if N > 2
1035
0
            comb = crc_word_big(crc2 ^ words[2] ^ comb);
1036
0
#if N > 3
1037
0
            comb = crc_word_big(crc3 ^ words[3] ^ comb);
1038
0
#if N > 4
1039
0
            comb = crc_word_big(crc4 ^ words[4] ^ comb);
1040
#if N > 5
1041
            comb = crc_word_big(crc5 ^ words[5] ^ comb);
1042
#endif
1043
0
#endif
1044
0
#endif
1045
0
#endif
1046
0
#endif
1047
0
            words += N;
1048
0
            crc = byte_swap(comb);
1049
0
        }
1050
1051
        /*
1052
          Update the pointer to the remaining bytes to process.
1053
         */
1054
2.23k
        buf = (unsigned char const *)words;
1055
2.23k
    }
1056
1057
648k
#endif /* W */
1058
1059
    /* Complete the computation of the CRC on any remaining bytes. */
1060
871k
    while (len >= 8) {
1061
223k
        len -= 8;
1062
223k
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1063
223k
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1064
223k
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1065
223k
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1066
223k
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1067
223k
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1068
223k
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1069
223k
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1070
223k
    }
1071
2.98M
    while (len) {
1072
2.34M
        len--;
1073
2.34M
        crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1074
2.34M
    }
1075
1076
    /* Return the CRC, post-conditioned. */
1077
648k
    return crc ^ 0xffffffff;
1078
672k
}
1079
1080
#endif
1081
1082
/* ========================================================================= */
1083
unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
1084
996k
                            uInt len) {
1085
    /* Some bots compile with optimizations disabled, others will emulate
1086
     * ARM on x86 and other weird combinations.
1087
     */
1088
996k
#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
1089
    /* We got to verify CPU features, so exploit the common usage pattern
1090
     * of calling this function with Z_NULL for an initial valid crc value.
1091
     * This allows to cache the result of the feature check and avoid extraneous
1092
     * function calls.
1093
     */
1094
996k
    if (buf == Z_NULL) {
1095
323k
        if (!len) /* Assume user is calling crc32(0, NULL, 0); */
1096
323k
            cpu_check_features();
1097
323k
        return 0UL;
1098
323k
    }
1099
672k
#endif
1100
1101
#if defined(CRC32_ARMV8_CRC32)
1102
    if (arm_cpu_enable_crc32) {
1103
#if defined(__aarch64__)
1104
        /* PMULL is 64bit only, plus code needs at least a 64 bytes buffer. */
1105
        if (arm_cpu_enable_pmull && (len > Z_CRC32_PMULL_MINIMUM_LENGTH)) {
1106
            const size_t chunk_size = len & ~Z_CRC32_PMULL_CHUNKSIZE_MASK;
1107
            crc = ~armv8_crc32_pmull_little(buf, chunk_size, ~(uint32_t)crc);
1108
            /* Check remaining data. */
1109
            len -= chunk_size;
1110
            if (!len)
1111
                return crc;
1112
1113
            /* Fall through for the remaining data. */
1114
            buf += chunk_size;
1115
        }
1116
#endif
1117
        return armv8_crc32_little(buf, len, crc); /* Armv8@32bit or tail. */
1118
    }
1119
#endif
1120
672k
    return crc32_z(crc, buf, len); /* Armv7 or Armv8 w/o crypto extensions. */
1121
996k
}
1122
1123
/* ========================================================================= */
1124
0
uLong ZEXPORT crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2) {
1125
#ifdef DYNAMIC_CRC_TABLE
1126
    once(&made, make_crc_table);
1127
#endif /* DYNAMIC_CRC_TABLE */
1128
0
    return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1129
0
}
1130
1131
/* ========================================================================= */
1132
0
uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2) {
1133
0
    return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1134
0
}
1135
/* ========================================================================= */
1136
0
uLong ZEXPORT crc32_combine_gen64(z_off64_t len2) {
1137
#ifdef DYNAMIC_CRC_TABLE
1138
    once(&made, make_crc_table);
1139
#endif /* DYNAMIC_CRC_TABLE */
1140
0
    return x2nmodp(len2, 3);
1141
0
}
1142
1143
/* ========================================================================= */
1144
0
uLong ZEXPORT crc32_combine_gen(z_off_t len2) {
1145
0
    return crc32_combine_gen64((z_off64_t)len2);
1146
0
}
1147
1148
/* ========================================================================= */
1149
0
uLong ZEXPORT crc32_combine_op(uLong crc1, uLong crc2, uLong op) {
1150
0
    return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1151
0
}
1152
1153
ZLIB_INTERNAL void crc_reset(deflate_state *const s)
1154
0
{
1155
0
#ifdef CRC32_SIMD_SSE42_PCLMUL
1156
0
    if (x86_cpu_enable_simd) {
1157
0
        crc_fold_init(s);
1158
0
        return;
1159
0
    }
1160
0
#endif
1161
0
    s->strm->adler = crc32(0L, Z_NULL, 0);
1162
0
}
1163
1164
ZLIB_INTERNAL void crc_finalize(deflate_state *const s)
1165
0
{
1166
0
#ifdef CRC32_SIMD_SSE42_PCLMUL
1167
0
    if (x86_cpu_enable_simd)
1168
0
        s->strm->adler = crc_fold_512to32(s);
1169
0
#endif
1170
0
}
1171
1172
ZLIB_INTERNAL void copy_with_crc(z_streamp strm, Bytef *dst, long size)
1173
0
{
1174
0
#ifdef CRC32_SIMD_SSE42_PCLMUL
1175
0
    if (x86_cpu_enable_simd) {
1176
0
        crc_fold_copy(strm->state, dst, strm->next_in, size);
1177
0
        return;
1178
0
    }
1179
0
#endif
1180
0
    zmemcpy(dst, strm->next_in, size);
1181
0
    strm->adler = crc32(strm->adler, dst, size);
1182
0
}