/src/zlib-ng/arch/generic/crc32_braid_c.c

Source
/* crc32_braid.c -- compute the CRC-32 of a data stream
 * Copyright (C) 1995-2022 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * This interleaved implementation of a CRC makes use of pipelined multiple
 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
 */

#include "zbuild.h"
#include "crc32_braid_p.h"
#include "crc32_braid_tbl.h"

/*
  A CRC of a message is computed on BRAID_N braids of words in the message, where
  each word consists of BRAID_W bytes (4 or 8). If BRAID_N is 3, for example, then
  three running sparse CRCs are calculated respectively on each braid, at these
  indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
  This is done starting at a word boundary, and continues until as many blocks of
  BRAID_N * BRAID_W bytes as are available have been processed. The results are
  combined into a single CRC at the end. For this code, BRAID_N must be in the
  range 1..6 and BRAID_W must be 4 or 8. The upper limit on BRAID_N can be increased
  if desired by adding more #if blocks, extending the patterns apparent in the code.
  In addition, crc32 tables would need to be regenerated, if the maximum BRAID_N
  value is increased.

  BRAID_N and BRAID_W are chosen empirically by benchmarking the execution time
  on a given processor. The choices for BRAID_N and BRAID_W below were based on
  testing on Intel Kaby Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC
  POWER9, and MIPS64 Octeon II processors.
  The Intel, AMD, and ARM processors were all fastest with BRAID_N=5, BRAID_W=8.
  The Sparc, PowerPC, and MIPS64 were all fastest at BRAID_N=5, BRAID_W=4.
  They were all tested with either gcc or clang, all using the -O3 optimization
  level. Your mileage may vary.
*/

/* ========================================================================= */
#ifdef BRAID_W
/*
  Return the CRC of the BRAID_W bytes in the word_t data, taking the
  least-significant byte of the word as the first byte of data, without any pre
  or post conditioning. This is used to combine the CRCs of each braid.
 */
#  if BYTE_ORDER == LITTLE_ENDIAN
static uint32_t crc_word(z_word_t data) {
    int k;
    for (k = 0; k < BRAID_W; k++)
        data = (data >> 8) ^ crc_table[data & 0xff];
    return (uint32_t)data;
}
#  elif BYTE_ORDER == BIG_ENDIAN
static z_word_t crc_word(z_word_t data) {
    int k;
    for (k = 0; k < BRAID_W; k++)
        data = (data << 8) ^
            crc_big_table[(data >> ((BRAID_W - 1) << 3)) & 0xff];
    return data;
}
#  endif /* BYTE_ORDER */
#endif /* BRAID_W */

/* ========================================================================= */
Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len) {

#ifdef BRAID_W
    /* If provided enough bytes, do a braided CRC calculation. */
    if (len >= BRAID_N * BRAID_W + BRAID_W - 1) {
        size_t blks;
        z_word_t const *words;
        int k;

        /* Compute the CRC up to a z_word_t boundary. */
        while (len && ((uintptr_t)buf & (BRAID_W - 1)) != 0) {
            len--;
            CRC_DO1;
        }

        /* Compute the CRC on as many BRAID_N z_word_t blocks as are available. */
        blks = len / (BRAID_N * BRAID_W);
        len -= blks * BRAID_N * BRAID_W;
        words = (z_word_t const *)buf;

        z_word_t crc0, word0, comb;
#if BRAID_N > 1
        z_word_t crc1, word1;
#if BRAID_N > 2
        z_word_t crc2, word2;
#if BRAID_N > 3
        z_word_t crc3, word3;
#if BRAID_N > 4
        z_word_t crc4, word4;
#if BRAID_N > 5
        z_word_t crc5, word5;
#endif
#endif
#endif
#endif
#endif
        /* Initialize the CRC for each braid. */
        crc0 = ZSWAPWORD(c);
#if BRAID_N > 1
        crc1 = 0;
#if BRAID_N > 2
        crc2 = 0;
#if BRAID_N > 3
        crc3 = 0;
#if BRAID_N > 4
        crc4 = 0;
#if BRAID_N > 5
        crc5 = 0;
#endif
#endif
#endif
#endif
#endif
        /* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
        while (--blks) {
            /* Load the word for each braid into registers. */
            word0 = crc0 ^ words[0];
#if BRAID_N > 1
            word1 = crc1 ^ words[1];
#if BRAID_N > 2
            word2 = crc2 ^ words[2];
#if BRAID_N > 3
            word3 = crc3 ^ words[3];
#if BRAID_N > 4
            word4 = crc4 ^ words[4];
#if BRAID_N > 5
            word5 = crc5 ^ words[5];
#endif
#endif
#endif
#endif
#endif
            words += BRAID_N;

            /* Compute and update the CRC for each word. The loop should get unrolled. */
            crc0 = BRAID_TABLE[0][word0 & 0xff];
#if BRAID_N > 1
            crc1 = BRAID_TABLE[0][word1 & 0xff];
#if BRAID_N > 2
            crc2 = BRAID_TABLE[0][word2 & 0xff];
#if BRAID_N > 3
            crc3 = BRAID_TABLE[0][word3 & 0xff];
#if BRAID_N > 4
            crc4 = BRAID_TABLE[0][word4 & 0xff];
#if BRAID_N > 5
            crc5 = BRAID_TABLE[0][word5 & 0xff];
#endif
#endif
#endif
#endif
#endif
            for (k = 1; k < BRAID_W; k++) {
                crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
#if BRAID_N > 1
                crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
#if BRAID_N > 2
                crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
#if BRAID_N > 3
                crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
#if BRAID_N > 4
                crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
#if BRAID_N > 5
                crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
#endif
#endif
#endif
#endif
#endif
            }
        }

        /* Process the last block, combining the CRCs of the BRAID_N braids at the same time. */
        comb = crc_word(crc0 ^ words[0]);
#if BRAID_N > 1
        comb = crc_word(crc1 ^ words[1] ^ comb);
#if BRAID_N > 2
        comb = crc_word(crc2 ^ words[2] ^ comb);
#if BRAID_N > 3
        comb = crc_word(crc3 ^ words[3] ^ comb);
#if BRAID_N > 4
        comb = crc_word(crc4 ^ words[4] ^ comb);
#if BRAID_N > 5
        comb = crc_word(crc5 ^ words[5] ^ comb);
#endif
#endif
#endif
#endif
#endif
        words += BRAID_N;
        Assert(comb <= UINT32_MAX, "comb should fit in uint32_t");
        c = (uint32_t)ZSWAPWORD(comb);

        /* Update the pointer to the remaining bytes to process. */
        buf = (const unsigned char *)words;
    }

#endif /* BRAID_W */

    /* Complete the computation of the CRC on any remaining bytes. */
    while (len >= 8) {
        len -= 8;
        CRC_DO8;
    }
    while (len) {
        len--;
        CRC_DO1;
    }

    /* Return the CRC, post-conditioned. */
    return c;
}

Z_INTERNAL uint32_t crc32_braid(uint32_t c, const uint8_t *buf, size_t len) {
    c = (~c) & 0xffffffff;

    c = crc32_braid_internal(c, buf, len);

    /* Return the CRC, post-conditioned. */
    return c ^ 0xffffffff;
}

Coverage Report

Created: 2025-10-28 06:58

Line	Count	Source
1		/* crc32_braid.c -- compute the CRC-32 of a data stream
2		* Copyright (C) 1995-2022 Mark Adler
3		* For conditions of distribution and use, see copyright notice in zlib.h
4		*
5		* This interleaved implementation of a CRC makes use of pipelined multiple
6		* arithmetic-logic units, commonly found in modern CPU cores. It is due to
7		* Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8		*/
9
10		#include "zbuild.h"
11		#include "crc32_braid_p.h"
12		#include "crc32_braid_tbl.h"
13
14		/*
15		A CRC of a message is computed on BRAID_N braids of words in the message, where
16		each word consists of BRAID_W bytes (4 or 8). If BRAID_N is 3, for example, then
17		three running sparse CRCs are calculated respectively on each braid, at these
18		indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
19		This is done starting at a word boundary, and continues until as many blocks of
20		BRAID_N * BRAID_W bytes as are available have been processed. The results are
21		combined into a single CRC at the end. For this code, BRAID_N must be in the
22		range 1..6 and BRAID_W must be 4 or 8. The upper limit on BRAID_N can be increased
23		if desired by adding more #if blocks, extending the patterns apparent in the code.
24		In addition, crc32 tables would need to be regenerated, if the maximum BRAID_N
25		value is increased.
26
27		BRAID_N and BRAID_W are chosen empirically by benchmarking the execution time
28		on a given processor. The choices for BRAID_N and BRAID_W below were based on
29		testing on Intel Kaby Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC
30		POWER9, and MIPS64 Octeon II processors.
31		The Intel, AMD, and ARM processors were all fastest with BRAID_N=5, BRAID_W=8.
32		The Sparc, PowerPC, and MIPS64 were all fastest at BRAID_N=5, BRAID_W=4.
33		They were all tested with either gcc or clang, all using the -O3 optimization
34		level. Your mileage may vary.
35		*/
36
37		/* ========================================================================= */
38		#ifdef BRAID_W
39		/*
40		Return the CRC of the BRAID_W bytes in the word_t data, taking the
41		least-significant byte of the word as the first byte of data, without any pre
42		or post conditioning. This is used to combine the CRCs of each braid.
43		*/
44		# if BYTE_ORDER == LITTLE_ENDIAN
45	0	static uint32_t crc_word(z_word_t data) {
46	0	int k;
47	0	for (k = 0; k < BRAID_W; k++)
48	0	data = (data >> 8) ^ crc_table[data & 0xff];
49	0	return (uint32_t)data;
50	0	}
51		# elif BYTE_ORDER == BIG_ENDIAN
52		static z_word_t crc_word(z_word_t data) {
53		int k;
54		for (k = 0; k < BRAID_W; k++)
55		data = (data << 8) ^
56		crc_big_table[(data >> ((BRAID_W - 1) << 3)) & 0xff];
57		return data;
58		}
59		# endif /* BYTE_ORDER */
60		#endif /* BRAID_W */
61
62		/* ========================================================================= */
63	0	Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len) {
64
65	0	#ifdef BRAID_W
66		/* If provided enough bytes, do a braided CRC calculation. */
67	0	if (len >= BRAID_N * BRAID_W + BRAID_W - 1) {
68	0	size_t blks;
69	0	z_word_t const *words;
70	0	int k;
71
72		/* Compute the CRC up to a z_word_t boundary. */
73	0	while (len && ((uintptr_t)buf & (BRAID_W - 1)) != 0) {
74	0	len--;
75	0	CRC_DO1;
76	0	}
77
78		/* Compute the CRC on as many BRAID_N z_word_t blocks as are available. */
79	0	blks = len / (BRAID_N * BRAID_W);
80	0	len -= blks * BRAID_N * BRAID_W;
81	0	words = (z_word_t const *)buf;
82
83	0	z_word_t crc0, word0, comb;
84	0	#if BRAID_N > 1
85	0	z_word_t crc1, word1;
86	0	#if BRAID_N > 2
87	0	z_word_t crc2, word2;
88	0	#if BRAID_N > 3
89	0	z_word_t crc3, word3;
90	0	#if BRAID_N > 4
91	0	z_word_t crc4, word4;
92		#if BRAID_N > 5
93		z_word_t crc5, word5;
94		#endif
95	0	#endif
96	0	#endif
97	0	#endif
98	0	#endif
99		/* Initialize the CRC for each braid. */
100	0	crc0 = ZSWAPWORD(c);
101	0	#if BRAID_N > 1
102	0	crc1 = 0;
103	0	#if BRAID_N > 2
104	0	crc2 = 0;
105	0	#if BRAID_N > 3
106	0	crc3 = 0;
107	0	#if BRAID_N > 4
108	0	crc4 = 0;
109		#if BRAID_N > 5
110		crc5 = 0;
111		#endif
112	0	#endif
113	0	#endif
114	0	#endif
115	0	#endif
116		/* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
117	0	while (--blks) {
118		/* Load the word for each braid into registers. */
119	0	word0 = crc0 ^ words[0];
120	0	#if BRAID_N > 1
121	0	word1 = crc1 ^ words[1];
122	0	#if BRAID_N > 2
123	0	word2 = crc2 ^ words[2];
124	0	#if BRAID_N > 3
125	0	word3 = crc3 ^ words[3];
126	0	#if BRAID_N > 4
127	0	word4 = crc4 ^ words[4];
128		#if BRAID_N > 5
129		word5 = crc5 ^ words[5];
130		#endif
131	0	#endif
132	0	#endif
133	0	#endif
134	0	#endif
135	0	words += BRAID_N;
136
137		/* Compute and update the CRC for each word. The loop should get unrolled. */
138	0	crc0 = BRAID_TABLE[0][word0 & 0xff];
139	0	#if BRAID_N > 1
140	0	crc1 = BRAID_TABLE[0][word1 & 0xff];
141	0	#if BRAID_N > 2
142	0	crc2 = BRAID_TABLE[0][word2 & 0xff];
143	0	#if BRAID_N > 3
144	0	crc3 = BRAID_TABLE[0][word3 & 0xff];
145	0	#if BRAID_N > 4
146	0	crc4 = BRAID_TABLE[0][word4 & 0xff];
147		#if BRAID_N > 5
148		crc5 = BRAID_TABLE[0][word5 & 0xff];
149		#endif
150	0	#endif
151	0	#endif
152	0	#endif
153	0	#endif
154	0	for (k = 1; k < BRAID_W; k++) {
155	0	crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
156	0	#if BRAID_N > 1
157	0	crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
158	0	#if BRAID_N > 2
159	0	crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
160	0	#if BRAID_N > 3
161	0	crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
162	0	#if BRAID_N > 4
163	0	crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
164		#if BRAID_N > 5
165		crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
166		#endif
167	0	#endif
168	0	#endif
169	0	#endif
170	0	#endif
171	0	}
172	0	}
173
174		/* Process the last block, combining the CRCs of the BRAID_N braids at the same time. */
175	0	comb = crc_word(crc0 ^ words[0]);
176	0	#if BRAID_N > 1
177	0	comb = crc_word(crc1 ^ words[1] ^ comb);
178	0	#if BRAID_N > 2
179	0	comb = crc_word(crc2 ^ words[2] ^ comb);
180	0	#if BRAID_N > 3
181	0	comb = crc_word(crc3 ^ words[3] ^ comb);
182	0	#if BRAID_N > 4
183	0	comb = crc_word(crc4 ^ words[4] ^ comb);
184		#if BRAID_N > 5
185		comb = crc_word(crc5 ^ words[5] ^ comb);
186		#endif
187	0	#endif
188	0	#endif
189	0	#endif
190	0	#endif
191	0	words += BRAID_N;
192	0	Assert(comb <= UINT32_MAX, "comb should fit in uint32_t");
193	0	c = (uint32_t)ZSWAPWORD(comb);
194
195		/* Update the pointer to the remaining bytes to process. */
196	0	buf = (const unsigned char *)words;
197	0	}
198
199	0	#endif /* BRAID_W */
200
201		/* Complete the computation of the CRC on any remaining bytes. */
202	0	while (len >= 8) {
203	0	len -= 8;
204	0	CRC_DO8;
205	0	}
206	0	while (len) {
207	0	len--;
208	0	CRC_DO1;
209	0	}
210
211		/* Return the CRC, post-conditioned. */
212	0	return c;
213	0	}
214
215	0	Z_INTERNAL uint32_t crc32_braid(uint32_t c, const uint8_t *buf, size_t len) {
216	0	c = (~c) & 0xffffffff;
217
218	0	c = crc32_braid_internal(c, buf, len);
219
220		/* Return the CRC, post-conditioned. */
221	0	return c ^ 0xffffffff;
222	0	}