/src/postgres/src/include/storage/checksum_impl.h
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * checksum_impl.h |
4 | | * Checksum implementation for data pages. |
5 | | * |
6 | | * This file exists for the benefit of external programs that may wish to |
7 | | * check Postgres page checksums. They can #include this to get the code |
8 | | * referenced by storage/checksum.h. (Note: you may need to redefine |
9 | | * Assert() as empty to compile this successfully externally.) |
10 | | * |
11 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
12 | | * Portions Copyright (c) 1994, Regents of the University of California |
13 | | * |
14 | | * src/include/storage/checksum_impl.h |
15 | | * |
16 | | *------------------------------------------------------------------------- |
17 | | */ |
18 | | |
19 | | /* |
20 | | * The algorithm used to checksum pages is chosen for very fast calculation. |
21 | | * Workloads where the database working set fits into OS file cache but not |
22 | | * into shared buffers can read in pages at a very fast pace and the checksum |
23 | | * algorithm itself can become the largest bottleneck. |
24 | | * |
25 | | * The checksum algorithm itself is based on the FNV-1a hash (FNV is shorthand |
26 | | * for Fowler/Noll/Vo). The primitive of a plain FNV-1a hash folds in data 1 |
27 | | * byte at a time according to the formula: |
28 | | * |
29 | | * hash = (hash ^ value) * FNV_PRIME |
30 | | * |
31 | | * FNV-1a algorithm is described at http://www.isthe.com/chongo/tech/comp/fnv/ |
32 | | * |
33 | | * PostgreSQL doesn't use FNV-1a hash directly because it has bad mixing of |
34 | | * high bits - high order bits in input data only affect high order bits in |
35 | | * output data. To resolve this we xor in the value prior to multiplication |
36 | | * shifted right by 17 bits. The number 17 was chosen because it doesn't |
37 | | * have common denominator with set bit positions in FNV_PRIME and empirically |
38 | | * provides the fastest mixing for high order bits of final iterations quickly |
39 | | * avalanche into lower positions. For performance reasons we choose to combine |
40 | | * 4 bytes at a time. The actual hash formula used as the basis is: |
41 | | * |
42 | | * hash = (hash ^ value) * FNV_PRIME ^ ((hash ^ value) >> 17) |
43 | | * |
44 | | * The main bottleneck in this calculation is the multiplication latency. To |
45 | | * hide the latency and to make use of SIMD parallelism multiple hash values |
46 | | * are calculated in parallel. The page is treated as a 32 column two |
47 | | * dimensional array of 32 bit values. Each column is aggregated separately |
48 | | * into a partial checksum. Each partial checksum uses a different initial |
49 | | * value (offset basis in FNV terminology). The initial values actually used |
50 | | * were chosen randomly, as the values themselves don't matter as much as that |
51 | | * they are different and don't match anything in real data. After initializing |
52 | | * partial checksums each value in the column is aggregated according to the |
53 | | * above formula. Finally two more iterations of the formula are performed with |
54 | | * value 0 to mix the bits of the last value added. |
55 | | * |
56 | | * The partial checksums are then folded together using xor to form a single |
57 | | * 32-bit checksum. The caller can safely reduce the value to 16 bits |
58 | | * using modulo 2^16-1. That will cause a very slight bias towards lower |
59 | | * values but this is not significant for the performance of the |
60 | | * checksum. |
61 | | * |
62 | | * The algorithm choice was based on what instructions are available in SIMD |
63 | | * instruction sets. This meant that a fast and good algorithm needed to use |
64 | | * multiplication as the main mixing operator. The simplest multiplication |
65 | | * based checksum primitive is the one used by FNV. The prime used is chosen |
66 | | * for good dispersion of values. It has no known simple patterns that result |
67 | | * in collisions. Test of 5-bit differentials of the primitive over 64bit keys |
68 | | * reveals no differentials with 3 or more values out of 100000 random keys |
69 | | * colliding. Avalanche test shows that only high order bits of the last word |
70 | | * have a bias. Tests of 1-4 uncorrelated bit errors, stray 0 and 0xFF bytes, |
71 | | * overwriting page from random position to end with 0 bytes, and overwriting |
72 | | * random segments of page with 0x00, 0xFF and random data all show optimal |
73 | | * 2e-16 false positive rate within margin of error. |
74 | | * |
75 | | * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer |
76 | | * multiplication instruction. As of 2013 the corresponding instruction is |
77 | | * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32). |
78 | | * Vectorization requires a compiler to do the vectorization for us. For recent |
79 | | * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough |
80 | | * to achieve vectorization. |
81 | | * |
82 | | * The optimal amount of parallelism to use depends on CPU specific instruction |
83 | | * latency, SIMD instruction width, throughput and the amount of registers |
84 | | * available to hold intermediate state. Generally, more parallelism is better |
85 | | * up to the point that state doesn't fit in registers and extra load-store |
86 | | * instructions are needed to swap values in/out. The number chosen is a fixed |
87 | | * part of the algorithm because changing the parallelism changes the checksum |
88 | | * result. |
89 | | * |
90 | | * The parallelism number 32 was chosen based on the fact that it is the |
91 | | * largest state that fits into architecturally visible x86 SSE registers while |
92 | | * leaving some free registers for intermediate values. For future processors |
93 | | * with 256bit vector registers this will leave some performance on the table. |
94 | | * When vectorization is not available it might be beneficial to restructure |
95 | | * the computation to calculate a subset of the columns at a time and perform |
96 | | * multiple passes to avoid register spilling. This optimization opportunity |
97 | | * is not used. Current coding also assumes that the compiler has the ability |
98 | | * to unroll the inner loop to avoid loop overhead and minimize register |
99 | | * spilling. For less sophisticated compilers it might be beneficial to |
100 | | * manually unroll the inner loop. |
101 | | */ |
102 | | |
103 | | #include "storage/bufpage.h" |
104 | | |
105 | | /* number of checksums to calculate in parallel */ |
106 | 0 | #define N_SUMS 32 |
107 | | /* prime multiplier of FNV-1a hash */ |
108 | 0 | #define FNV_PRIME 16777619 |
109 | | |
110 | | /* Use a union so that this code is valid under strict aliasing */ |
111 | | typedef union |
112 | | { |
113 | | PageHeaderData phdr; |
114 | | uint32 data[BLCKSZ / (sizeof(uint32) * N_SUMS)][N_SUMS]; |
115 | | } PGChecksummablePage; |
116 | | |
117 | | /* |
118 | | * Base offsets to initialize each of the parallel FNV hashes into a |
119 | | * different initial state. |
120 | | */ |
121 | | static const uint32 checksumBaseOffsets[N_SUMS] = { |
122 | | 0x5B1F36E9, 0xB8525960, 0x02AB50AA, 0x1DE66D2A, |
123 | | 0x79FF467A, 0x9BB9F8A3, 0x217E7CD2, 0x83E13D2C, |
124 | | 0xF8D4474F, 0xE39EB970, 0x42C6AE16, 0x993216FA, |
125 | | 0x7B093B5D, 0x98DAFF3C, 0xF718902A, 0x0B1C9CDB, |
126 | | 0xE58F764B, 0x187636BC, 0x5D7B3BB1, 0xE73DE7DE, |
127 | | 0x92BEC979, 0xCCA6C0B2, 0x304A0979, 0x85AA43D4, |
128 | | 0x783125BB, 0x6CA8EAA2, 0xE407EAC6, 0x4B5CFC3E, |
129 | | 0x9FBF8C76, 0x15CA20BE, 0xF2CA9FD3, 0x959BD756 |
130 | | }; |
131 | | |
132 | | /* |
133 | | * Calculate one round of the checksum. |
134 | | */ |
135 | 0 | #define CHECKSUM_COMP(checksum, value) \ |
136 | 0 | do { \ |
137 | 0 | uint32 __tmp = (checksum) ^ (value); \ |
138 | 0 | (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17); \ |
139 | 0 | } while (0) |
140 | | |
141 | | /* |
142 | | * Block checksum algorithm. The page must be adequately aligned |
143 | | * (at least on 4-byte boundary). |
144 | | */ |
145 | | static uint32 |
146 | | pg_checksum_block(const PGChecksummablePage *page) |
147 | 0 | { |
148 | 0 | uint32 sums[N_SUMS]; |
149 | 0 | uint32 result = 0; |
150 | 0 | uint32 i, |
151 | 0 | j; |
152 | | |
153 | | /* ensure that the size is compatible with the algorithm */ |
154 | 0 | Assert(sizeof(PGChecksummablePage) == BLCKSZ); |
155 | | |
156 | | /* initialize partial checksums to their corresponding offsets */ |
157 | 0 | memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); |
158 | | |
159 | | /* main checksum calculation */ |
160 | 0 | for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++) |
161 | 0 | for (j = 0; j < N_SUMS; j++) |
162 | 0 | CHECKSUM_COMP(sums[j], page->data[i][j]); |
163 | | |
164 | | /* finally add in two rounds of zeroes for additional mixing */ |
165 | 0 | for (i = 0; i < 2; i++) |
166 | 0 | for (j = 0; j < N_SUMS; j++) |
167 | 0 | CHECKSUM_COMP(sums[j], 0); |
168 | | |
169 | | /* xor fold partial checksums together */ |
170 | 0 | for (i = 0; i < N_SUMS; i++) |
171 | 0 | result ^= sums[i]; |
172 | |
|
173 | 0 | return result; |
174 | 0 | } |
175 | | |
176 | | /* |
177 | | * Compute the checksum for a Postgres page. |
178 | | * |
179 | | * The page must be adequately aligned (at least on a 4-byte boundary). |
180 | | * Beware also that the checksum field of the page is transiently zeroed. |
181 | | * |
182 | | * The checksum includes the block number (to detect the case where a page is |
183 | | * somehow moved to a different location), the page header (excluding the |
184 | | * checksum itself), and the page data. |
185 | | */ |
186 | | uint16 |
187 | | pg_checksum_page(char *page, BlockNumber blkno) |
188 | 0 | { |
189 | 0 | PGChecksummablePage *cpage = (PGChecksummablePage *) page; |
190 | 0 | uint16 save_checksum; |
191 | 0 | uint32 checksum; |
192 | | |
193 | | /* We only calculate the checksum for properly-initialized pages */ |
194 | 0 | Assert(!PageIsNew((Page) page)); |
195 | | |
196 | | /* |
197 | | * Save pd_checksum and temporarily set it to zero, so that the checksum |
198 | | * calculation isn't affected by the old checksum stored on the page. |
199 | | * Restore it after, because actually updating the checksum is NOT part of |
200 | | * the API of this function. |
201 | | */ |
202 | 0 | save_checksum = cpage->phdr.pd_checksum; |
203 | 0 | cpage->phdr.pd_checksum = 0; |
204 | 0 | checksum = pg_checksum_block(cpage); |
205 | 0 | cpage->phdr.pd_checksum = save_checksum; |
206 | | |
207 | | /* Mix in the block number to detect transposed pages */ |
208 | 0 | checksum ^= blkno; |
209 | | |
210 | | /* |
211 | | * Reduce to a uint16 (to fit in the pd_checksum field) with an offset of |
212 | | * one. That avoids checksums of zero, which seems like a good idea. |
213 | | */ |
214 | 0 | return (uint16) ((checksum % 65535) + 1); |
215 | 0 | } |