/src/postgres/src/port/pg_crc32c_sse42.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * pg_crc32c_sse42.c |
4 | | * Compute CRC-32C checksum using Intel SSE 4.2 or AVX-512 instructions. |
5 | | * |
6 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/port/pg_crc32c_sse42.c |
12 | | * |
13 | | *------------------------------------------------------------------------- |
14 | | */ |
15 | | #include "c.h" |
16 | | |
17 | | #include <nmmintrin.h> |
18 | | #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK |
19 | | #include <immintrin.h> |
20 | | #endif |
21 | | |
22 | | #include "port/pg_crc32c.h" |
23 | | |
24 | | pg_attribute_no_sanitize_alignment() |
25 | | pg_attribute_target("sse4.2") |
26 | | pg_crc32c |
27 | | pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) |
28 | 0 | { |
29 | 0 | const unsigned char *p = data; |
30 | 0 | const unsigned char *pend = p + len; |
31 | | |
32 | | /* |
33 | | * Process eight bytes of data at a time. |
34 | | * |
35 | | * NB: We do unaligned accesses here. The Intel architecture allows that, |
36 | | * and performance testing didn't show any performance gain from aligning |
37 | | * the begin address. |
38 | | */ |
39 | 0 | #ifdef __x86_64__ |
40 | 0 | while (p + 8 <= pend) |
41 | 0 | { |
42 | 0 | crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p)); |
43 | 0 | p += 8; |
44 | 0 | } |
45 | | |
46 | | /* Process remaining full four bytes if any */ |
47 | 0 | if (p + 4 <= pend) |
48 | 0 | { |
49 | 0 | crc = _mm_crc32_u32(crc, *((const unsigned int *) p)); |
50 | 0 | p += 4; |
51 | 0 | } |
52 | | #else |
53 | | |
54 | | /* |
55 | | * Process four bytes at a time. (The eight byte instruction is not |
56 | | * available on the 32-bit x86 architecture). |
57 | | */ |
58 | | while (p + 4 <= pend) |
59 | | { |
60 | | crc = _mm_crc32_u32(crc, *((const unsigned int *) p)); |
61 | | p += 4; |
62 | | } |
63 | | #endif /* __x86_64__ */ |
64 | | |
65 | | /* Process any remaining bytes one at a time. */ |
66 | 0 | while (p < pend) |
67 | 0 | { |
68 | 0 | crc = _mm_crc32_u8(crc, *p); |
69 | 0 | p++; |
70 | 0 | } |
71 | |
|
72 | 0 | return crc; |
73 | 0 | } |
74 | | |
75 | | #ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK |
76 | | |
77 | | /* |
78 | | * Note: There is no copyright notice in the following generated code. |
79 | | * |
80 | | * We have modified the output to |
81 | | * - match our function declaration |
82 | | * - match whitespace to our project style |
83 | | * - add a threshold for the alignment stanza |
84 | | */ |
85 | | |
86 | | /* Generated by https://github.com/corsix/fast-crc32/ using: */ |
87 | | /* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */ |
88 | | /* MIT licensed */ |
89 | | |
90 | 0 | #define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0)) |
91 | 0 | #define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17)) |
92 | | |
93 | | pg_attribute_target("vpclmulqdq,avx512vl") |
94 | | pg_crc32c |
95 | | pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len) |
96 | 0 | { |
97 | | /* adjust names to match generated code */ |
98 | 0 | pg_crc32c crc0 = crc; |
99 | 0 | const char *buf = data; |
100 | | |
101 | | /* Align on cacheline boundary. The threshold is somewhat arbitrary. */ |
102 | 0 | if (unlikely(len > 256)) |
103 | 0 | { |
104 | 0 | for (; len && ((uintptr_t) buf & 7); --len) |
105 | 0 | crc0 = _mm_crc32_u8(crc0, *buf++); |
106 | 0 | while (((uintptr_t) buf & 56) && len >= 8) |
107 | 0 | { |
108 | 0 | crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf); |
109 | 0 | buf += 8; |
110 | 0 | len -= 8; |
111 | 0 | } |
112 | 0 | } |
113 | |
|
114 | 0 | if (len >= 64) |
115 | 0 | { |
116 | 0 | const char *end = buf + len; |
117 | 0 | const char *limit = buf + len - 64; |
118 | 0 | __m128i z0; |
119 | | |
120 | | /* First vector chunk. */ |
121 | 0 | __m512i x0 = _mm512_loadu_si512((const void *) buf), |
122 | 0 | y0; |
123 | 0 | __m512i k; |
124 | |
|
125 | 0 | k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0)); |
126 | 0 | x0 = _mm512_xor_si512(_mm512_zextsi128_si512(_mm_cvtsi32_si128(crc0)), x0); |
127 | 0 | buf += 64; |
128 | | |
129 | | /* Main loop. */ |
130 | 0 | while (buf <= limit) |
131 | 0 | { |
132 | 0 | y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); |
133 | 0 | x0 = _mm512_ternarylogic_epi64(x0, y0, |
134 | 0 | _mm512_loadu_si512((const void *) buf), |
135 | 0 | 0x96); |
136 | 0 | buf += 64; |
137 | 0 | } |
138 | | |
139 | | /* Reduce 512 bits to 128 bits. */ |
140 | 0 | k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0, |
141 | 0 | 0x3da6d0cb, 0, 0xba4fc28e, 0, |
142 | 0 | 0xf20c0dfe, 0, 0x493c7d27, 0, |
143 | 0 | 0, 0, 0, 0); |
144 | 0 | y0 = clmul_lo(x0, k), k = clmul_hi(x0, k); |
145 | 0 | y0 = _mm512_xor_si512(y0, k); |
146 | 0 | z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0), |
147 | 0 | _mm512_extracti32x4_epi32(y0, 1), |
148 | 0 | _mm512_extracti32x4_epi32(y0, 2), |
149 | 0 | 0x96); |
150 | 0 | z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3)); |
151 | | |
152 | | /* Reduce 128 bits to 32 bits, and multiply by x^32. */ |
153 | 0 | crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0)); |
154 | 0 | crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1)); |
155 | 0 | len = end - buf; |
156 | 0 | } |
157 | |
|
158 | 0 | return pg_comp_crc32c_sse42(crc0, buf, len); |
159 | 0 | } |
160 | | |
161 | | #endif |