/src/postgres/src/common/blkreftable.c
/*-------------------------------------------------------------------------
 *
 * blkreftable.c
 *	  Block reference tables.
 *
 * A block reference table is used to keep track of which blocks have
 * been modified by WAL records within a certain LSN range.
 *
 * For each relation fork, we keep track of all blocks that have appeared
 * in block references in the WAL. We also keep track of the "limit block",
 * which is the smallest relation length in blocks known to have occurred
 * during that range of WAL records. This should be set to 0 if the relation
 * fork is created or destroyed, and to the post-truncation length if
 * truncated.
 *
 * Whenever we set the limit block, we also forget about any modified blocks
 * beyond that point. Those blocks don't exist any more. Such blocks can
 * later be marked as modified again; if that happens, it means the relation
 * was re-extended.
 *
 * Portions Copyright (c) 2010-2025, PostgreSQL Global Development Group
 *
 * src/common/blkreftable.c
 *
 *-------------------------------------------------------------------------
 */


#ifndef FRONTEND
#include "postgres.h"
#else
#include "postgres_fe.h"
#endif

#ifdef FRONTEND
#include "common/logging.h"
#endif

#include "common/blkreftable.h"
#include "common/hashfn.h"
#include "port/pg_crc32c.h"

/*
 * A block reference table keeps track of the status of each relation
 * fork individually.
 */
typedef struct BlockRefTableKey
{
	RelFileLocator rlocator;
	ForkNumber	forknum;
} BlockRefTableKey;

/*
 * We could need to store data either for a relation in which only a
 * tiny fraction of the blocks have been modified or for a relation in
 * which nearly every block has been modified, and we want a
 * space-efficient representation in both cases. To accomplish this,
 * we divide the relation into chunks of 2^16 blocks and choose between
 * an array representation and a bitmap representation for each chunk.
 *
 * When the number of modified blocks in a given chunk is small, we
 * essentially store an array of block numbers, but we need not store the
 * entire block number: instead, we store each block number as a 2-byte
 * offset from the start of the chunk.
 *
 * When the number of modified blocks in a given chunk is large, we switch
 * to a bitmap representation.
 *
 * These same basic representational choices are used both when a block
 * reference table is stored in memory and when it is serialized to disk.
 *
 * In the in-memory representation, we initially allocate each chunk with
 * space for a number of entries given by INITIAL_ENTRIES_PER_CHUNK and
 * increase that as necessary until we reach MAX_ENTRIES_PER_CHUNK.
 * Any chunk whose allocated size reaches MAX_ENTRIES_PER_CHUNK is converted
 * to a bitmap, and thus never needs to grow further.
 */
#define BLOCKS_PER_CHUNK	(1 << 16)
#define BLOCKS_PER_ENTRY	(BITS_PER_BYTE * sizeof(uint16))
#define MAX_ENTRIES_PER_CHUNK	(BLOCKS_PER_CHUNK / BLOCKS_PER_ENTRY)
#define INITIAL_ENTRIES_PER_CHUNK	16
typedef uint16 *BlockRefTableChunk;
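
/*
 * Illustrative only (not part of the original file): how a block number
 * decomposes under the macros above. The function name is hypothetical and
 * exists purely to document the arithmetic; it is compiled out by default.
 */
#ifdef BLKREFTABLE_EXAMPLES
static void
example_locate_block(BlockNumber blknum)
{
	uint32		chunkno = blknum / BLOCKS_PER_CHUNK;	/* which chunk */
	uint16		chunkoffset = blknum % BLOCKS_PER_CHUNK;	/* offset in chunk */

	/* In bitmap form, the offset further selects a uint16 word and a bit. */
	uint32		wordno = chunkoffset / BLOCKS_PER_ENTRY;
	uint32		bitno = chunkoffset % BLOCKS_PER_ENTRY;

	(void) chunkno;
	(void) wordno;
	(void) bitno;
}
#endif							/* BLKREFTABLE_EXAMPLES */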

/*
 * State for one relation fork.
 *
 * 'rlocator' and 'forknum' identify the relation fork to which this entry
 * pertains.
 *
 * 'limit_block' is the shortest known length of the relation in blocks
 * within the LSN range covered by a particular block reference table.
 * It should be set to 0 if the relation fork is created or dropped. If the
 * relation fork is truncated, it should be set to the number of blocks that
 * remain after truncation.
 *
 * 'nchunks' is the allocated length of each of the three arrays that follow.
 * We can only represent the status of block numbers less than nchunks *
 * BLOCKS_PER_CHUNK.
 *
 * 'chunk_size' is an array storing the allocated size of each chunk.
 *
 * 'chunk_usage' is an array storing the number of elements used in each
 * chunk. If that value is less than MAX_ENTRIES_PER_CHUNK, the corresponding
 * chunk is used as an array; else the corresponding chunk is used as a bitmap.
 * When used as a bitmap, the least significant bit of the first array element
 * is the status of the lowest-numbered block covered by this chunk.
 *
 * 'chunk_data' is the array of chunks.
 */
struct BlockRefTableEntry
{
	BlockRefTableKey key;
	BlockNumber limit_block;
	char		status;
	uint32		nchunks;
	uint16	   *chunk_size;
	uint16	   *chunk_usage;
	BlockRefTableChunk *chunk_data;
};

/* Declare and define a hash table over type BlockRefTableEntry. */
#define SH_PREFIX blockreftable
#define SH_ELEMENT_TYPE BlockRefTableEntry
#define SH_KEY_TYPE BlockRefTableKey
#define SH_KEY key
#define SH_HASH_KEY(tb, key) \
	hash_bytes((const unsigned char *) &key, sizeof(BlockRefTableKey))
#define SH_EQUAL(tb, a, b) (memcmp(&a, &b, sizeof(BlockRefTableKey)) == 0)
#define SH_SCOPE static inline
#ifdef FRONTEND
#define SH_RAW_ALLOCATOR pg_malloc0
#endif
#define SH_DEFINE
#define SH_DECLARE
#include "lib/simplehash.h"

/*
 * A block reference table is basically just the hash table, but we don't
 * want to expose that to outside callers.
 *
 * We keep track of the memory context in use explicitly too, so that it's
 * easy to place all of our allocations in the same context.
 */
struct BlockRefTable
{
	blockreftable_hash *hash;
#ifndef FRONTEND
	MemoryContext mcxt;
#endif
};

/*
 * On-disk serialization format for block reference table entries.
 */
typedef struct BlockRefTableSerializedEntry
{
	RelFileLocator rlocator;
	ForkNumber	forknum;
	BlockNumber limit_block;
	uint32		nchunks;
} BlockRefTableSerializedEntry;

/*
 * Buffer size, so that we avoid doing many small I/Os.
 */
#define BUFSIZE 65536

/*
 * Ad-hoc buffer for file I/O.
 */
typedef struct BlockRefTableBuffer
{
	io_callback_fn io_callback;
	void	   *io_callback_arg;
	char		data[BUFSIZE];
	int			used;
	int			cursor;
	pg_crc32c	crc;
} BlockRefTableBuffer;
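
/*
 * Illustrative only (not part of the original file): a minimal pair of I/O
 * callbacks layered over read()/write(), assuming the io_callback_fn
 * signature declared in common/blkreftable.h. The function names and the
 * use of a plain file descriptor as the callback argument are hypothetical;
 * real callers would loop on partial transfers and report errors.
 */
#ifdef BLKREFTABLE_EXAMPLES
#include <unistd.h>

static int
example_write_callback(void *callback_arg, void *data, int length)
{
	int			fd = *(int *) callback_arg;

	return (int) write(fd, data, length);
}

static int
example_read_callback(void *callback_arg, void *data, int length)
{
	int			fd = *(int *) callback_arg;

	/* Returning 0 signals unexpected EOF to BlockRefTableRead(). */
	return (int) read(fd, data, length);
}
#endif							/* BLKREFTABLE_EXAMPLES */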

/*
 * State for keeping track of progress while incrementally reading a block
 * reference table file from disk.
 *
 * total_chunks means the number of chunks for the RelFileLocator/ForkNumber
 * combination that is currently being read, and consumed_chunks is the number
 * of those that have been read. (We always read all the information for
 * a single chunk at one time, so we don't need to be able to represent the
 * state where a chunk has been partially read.)
 *
 * chunk_size is the array of chunk sizes. The length is given by total_chunks.
 *
 * chunk_data holds the current chunk.
 *
 * chunk_position helps us figure out how much progress we've made in returning
 * the block numbers for the current chunk to the caller. If the chunk is a
 * bitmap, it's the number of bits we've scanned; otherwise, it's the number
 * of chunk entries we've scanned.
 */
struct BlockRefTableReader
{
	BlockRefTableBuffer buffer;
	char	   *error_filename;
	report_error_fn error_callback;
	void	   *error_callback_arg;
	uint32		total_chunks;
	uint32		consumed_chunks;
	uint16	   *chunk_size;
	uint16		chunk_data[MAX_ENTRIES_PER_CHUNK];
	uint32		chunk_position;
};

/*
 * State for keeping track of progress while incrementally writing a block
 * reference table file to disk.
 */
struct BlockRefTableWriter
{
	BlockRefTableBuffer buffer;
};

/* Function prototypes. */
static int	BlockRefTableComparator(const void *a, const void *b);
static void BlockRefTableFlush(BlockRefTableBuffer *buffer);
static void BlockRefTableRead(BlockRefTableReader *reader, void *data,
							  int length);
static void BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data,
							   int length);
static void BlockRefTableFileTerminate(BlockRefTableBuffer *buffer);

/*
 * Create an empty block reference table.
 */
BlockRefTable *
CreateEmptyBlockRefTable(void)
{
	BlockRefTable *brtab = palloc(sizeof(BlockRefTable));

	/*
	 * Even a completely empty database has a few hundred relation forks, so
	 * it seems best to size the hash on the assumption that we're going to
	 * have at least a few thousand entries.
	 */
#ifdef FRONTEND
	brtab->hash = blockreftable_create(4096, NULL);
#else
	brtab->mcxt = CurrentMemoryContext;
	brtab->hash = blockreftable_create(brtab->mcxt, 4096, NULL);
#endif

	return brtab;
}
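
/*
 * Illustrative only (not part of the original file): a minimal sketch of the
 * in-memory API in this file. 'my_rlocator', 'fd', and the write callback
 * (defined in an earlier example) are hypothetical.
 */
#ifdef BLKREFTABLE_EXAMPLES
static void
example_build_and_write(RelFileLocator my_rlocator, int fd)
{
	BlockRefTable *brtab = CreateEmptyBlockRefTable();

	/* Record that block 42 of the main fork was modified. */
	BlockRefTableMarkBlockModified(brtab, &my_rlocator, MAIN_FORKNUM, 42);

	/* Record a truncation to 10 blocks; this forgets block 42 again. */
	BlockRefTableSetLimitBlock(brtab, &my_rlocator, MAIN_FORKNUM, 10);

	/* Serialize everything through the caller-supplied I/O callback. */
	WriteBlockRefTable(brtab, example_write_callback, &fd);
}
#endif							/* BLKREFTABLE_EXAMPLES */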

/*
 * Set the "limit block" for a relation fork and forget any modified blocks
 * with equal or higher block numbers.
 *
 * The "limit block" is the shortest known length of the relation within the
 * range of WAL records covered by this block reference table.
 */
void
BlockRefTableSetLimitBlock(BlockRefTable *brtab,
						   const RelFileLocator *rlocator,
						   ForkNumber forknum,
						   BlockNumber limit_block)
{
	BlockRefTableEntry *brtentry;
	BlockRefTableKey key = {0};	/* make sure any padding is zero */
	bool		found;

	memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
	key.forknum = forknum;
	brtentry = blockreftable_insert(brtab->hash, key, &found);

	if (!found)
	{
		/*
		 * We have no existing data about this relation fork, so just record
		 * the limit_block value supplied by the caller, and make sure other
		 * parts of the entry are properly initialized.
		 */
		brtentry->limit_block = limit_block;
		brtentry->nchunks = 0;
		brtentry->chunk_size = NULL;
		brtentry->chunk_usage = NULL;
		brtentry->chunk_data = NULL;
		return;
	}

	BlockRefTableEntrySetLimitBlock(brtentry, limit_block);
}

/*
 * Mark a block in a given relation fork as known to have been modified.
 */
void
BlockRefTableMarkBlockModified(BlockRefTable *brtab,
							   const RelFileLocator *rlocator,
							   ForkNumber forknum,
							   BlockNumber blknum)
{
	BlockRefTableEntry *brtentry;
	BlockRefTableKey key = {0};	/* make sure any padding is zero */
	bool		found;
#ifndef FRONTEND
	MemoryContext oldcontext = MemoryContextSwitchTo(brtab->mcxt);
#endif

	memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
	key.forknum = forknum;
	brtentry = blockreftable_insert(brtab->hash, key, &found);

	if (!found)
	{
		/*
		 * We want to set the initial limit block value to something higher
		 * than any legal block number. InvalidBlockNumber fits the bill.
		 */
		brtentry->limit_block = InvalidBlockNumber;
		brtentry->nchunks = 0;
		brtentry->chunk_size = NULL;
		brtentry->chunk_usage = NULL;
		brtentry->chunk_data = NULL;
	}

	BlockRefTableEntryMarkBlockModified(brtentry, forknum, blknum);

#ifndef FRONTEND
	MemoryContextSwitchTo(oldcontext);
#endif
}

/*
 * Get an entry from a block reference table.
 *
 * If the entry does not exist, this function returns NULL. Otherwise, it
 * returns the entry and sets *limit_block to the value from the entry.
 */
BlockRefTableEntry *
BlockRefTableGetEntry(BlockRefTable *brtab, const RelFileLocator *rlocator,
					  ForkNumber forknum, BlockNumber *limit_block)
{
	BlockRefTableKey key = {0};	/* make sure any padding is zero */
	BlockRefTableEntry *entry;

	Assert(limit_block != NULL);

	memcpy(&key.rlocator, rlocator, sizeof(RelFileLocator));
	key.forknum = forknum;
	entry = blockreftable_lookup(brtab->hash, key);

	if (entry != NULL)
		*limit_block = entry->limit_block;

	return entry;
}

/*
 * Get block numbers from a table entry.
 *
 * 'blocks' must point to enough space to hold at least 'nblocks' block
 * numbers, and any block numbers we manage to get will be written there.
 * The return value is the number of block numbers actually written.
 *
 * We do not return block numbers unless they are greater than or equal to
 * start_blkno and strictly less than stop_blkno.
 */
int
BlockRefTableEntryGetBlocks(BlockRefTableEntry *entry,
							BlockNumber start_blkno,
							BlockNumber stop_blkno,
							BlockNumber *blocks,
							int nblocks)
{
	uint32		start_chunkno;
	uint32		stop_chunkno;
	uint32		chunkno;
	int			nresults = 0;

	Assert(entry != NULL);

	/*
	 * Figure out which chunks could potentially contain blocks of interest.
	 *
	 * We need to be careful about overflow here, because stop_blkno could be
	 * InvalidBlockNumber or something very close to it.
	 */
	start_chunkno = start_blkno / BLOCKS_PER_CHUNK;
	stop_chunkno = stop_blkno / BLOCKS_PER_CHUNK;
	if ((stop_blkno % BLOCKS_PER_CHUNK) != 0)
		++stop_chunkno;
	if (stop_chunkno > entry->nchunks)
		stop_chunkno = entry->nchunks;

	/*
	 * Loop over chunks.
	 */
	for (chunkno = start_chunkno; chunkno < stop_chunkno; ++chunkno)
	{
		uint16		chunk_usage = entry->chunk_usage[chunkno];
		BlockRefTableChunk chunk_data = entry->chunk_data[chunkno];
		unsigned	start_offset = 0;
		unsigned	stop_offset = BLOCKS_PER_CHUNK;

		/*
		 * If the start and/or stop block number falls within this chunk, the
		 * whole chunk may not be of interest. Figure out which portion we
		 * care about, if it's not the whole thing.
		 */
		if (chunkno == start_chunkno)
			start_offset = start_blkno % BLOCKS_PER_CHUNK;
		if (chunkno == stop_chunkno - 1)
		{
			Assert(stop_blkno > chunkno * BLOCKS_PER_CHUNK);
			stop_offset = stop_blkno - (chunkno * BLOCKS_PER_CHUNK);
			Assert(stop_offset <= BLOCKS_PER_CHUNK);
		}

		/*
		 * Handling differs depending on whether this is an array of offsets
		 * or a bitmap.
		 */
		if (chunk_usage == MAX_ENTRIES_PER_CHUNK)
		{
			unsigned	i;

			/* It's a bitmap, so test every relevant bit. */
			for (i = start_offset; i < stop_offset; ++i)
			{
				uint16		w = chunk_data[i / BLOCKS_PER_ENTRY];

				if ((w & (1 << (i % BLOCKS_PER_ENTRY))) != 0)
				{
					BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + i;

					blocks[nresults++] = blkno;

					/* Early exit if we run out of output space. */
					if (nresults == nblocks)
						return nresults;
				}
			}
		}
		else
		{
			unsigned	i;

			/* It's an array of offsets, so check each one. */
			for (i = 0; i < chunk_usage; ++i)
			{
				uint16		offset = chunk_data[i];

				if (offset >= start_offset && offset < stop_offset)
				{
					BlockNumber blkno = chunkno * BLOCKS_PER_CHUNK + offset;

					blocks[nresults++] = blkno;

					/* Early exit if we run out of output space. */
					if (nresults == nblocks)
						return nresults;
				}
			}
		}
	}

	return nresults;
}
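
/*
 * Illustrative only (not part of the original file): fetching modified
 * blocks from an entry in fixed-size batches. The names are hypothetical;
 * the stopping rule follows from the contract documented above.
 */
#ifdef BLKREFTABLE_EXAMPLES
static void
example_scan_entry(BlockRefTable *brtab, const RelFileLocator *rlocator)
{
	BlockNumber limit_block;
	BlockRefTableEntry *entry;

	entry = BlockRefTableGetEntry(brtab, rlocator, MAIN_FORKNUM,
								  &limit_block);
	if (entry != NULL)
	{
		BlockNumber batch[16];
		BlockNumber next_blkno = 0;
		int			n;

		/* A short batch means we've seen every modified block. */
		while ((n = BlockRefTableEntryGetBlocks(entry, next_blkno,
												InvalidBlockNumber,
												batch, 16)) > 0)
		{
			/* ... process batch[0 .. n-1] here ... */
			next_blkno = batch[n - 1] + 1;
			if (n < 16)
				break;
		}
	}
}
#endif							/* BLKREFTABLE_EXAMPLES */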

/*
 * Serialize a block reference table to a file.
 */
void
WriteBlockRefTable(BlockRefTable *brtab,
				   io_callback_fn write_callback,
				   void *write_callback_arg)
{
	BlockRefTableSerializedEntry *sdata = NULL;
	BlockRefTableBuffer buffer;
	uint32		magic = BLOCKREFTABLE_MAGIC;

	/* Prepare buffer. */
	memset(&buffer, 0, sizeof(BlockRefTableBuffer));
	buffer.io_callback = write_callback;
	buffer.io_callback_arg = write_callback_arg;
	INIT_CRC32C(buffer.crc);

	/* Write magic number. */
	BlockRefTableWrite(&buffer, &magic, sizeof(uint32));

	/* Write the entries, assuming there are some. */
	if (brtab->hash->members > 0)
	{
		unsigned	i = 0;
		blockreftable_iterator it;
		BlockRefTableEntry *brtentry;

		/* Extract entries into serializable format and sort them. */
		sdata =
			palloc(brtab->hash->members * sizeof(BlockRefTableSerializedEntry));
		blockreftable_start_iterate(brtab->hash, &it);
		while ((brtentry = blockreftable_iterate(brtab->hash, &it)) != NULL)
		{
			BlockRefTableSerializedEntry *sentry = &sdata[i++];

			sentry->rlocator = brtentry->key.rlocator;
			sentry->forknum = brtentry->key.forknum;
			sentry->limit_block = brtentry->limit_block;
			sentry->nchunks = brtentry->nchunks;

			/* trim trailing zero entries */
			while (sentry->nchunks > 0 &&
				   brtentry->chunk_usage[sentry->nchunks - 1] == 0)
				sentry->nchunks--;
		}
		Assert(i == brtab->hash->members);
		qsort(sdata, i, sizeof(BlockRefTableSerializedEntry),
			  BlockRefTableComparator);

		/* Loop over entries in sorted order and serialize each one. */
		for (i = 0; i < brtab->hash->members; ++i)
		{
			BlockRefTableSerializedEntry *sentry = &sdata[i];
			BlockRefTableKey key = {0};	/* make sure any padding is zero */
			unsigned	j;

			/* Write the serialized entry itself. */
			BlockRefTableWrite(&buffer, sentry,
							   sizeof(BlockRefTableSerializedEntry));

			/* Look up the original entry so we can access the chunks. */
			memcpy(&key.rlocator, &sentry->rlocator, sizeof(RelFileLocator));
			key.forknum = sentry->forknum;
			brtentry = blockreftable_lookup(brtab->hash, key);
			Assert(brtentry != NULL);

			/* Write the untruncated portion of the chunk length array. */
			if (sentry->nchunks != 0)
				BlockRefTableWrite(&buffer, brtentry->chunk_usage,
								   sentry->nchunks * sizeof(uint16));

			/* Write the contents of each chunk. */
			for (j = 0; j < brtentry->nchunks; ++j)
			{
				if (brtentry->chunk_usage[j] == 0)
					continue;
				BlockRefTableWrite(&buffer, brtentry->chunk_data[j],
								   brtentry->chunk_usage[j] * sizeof(uint16));
			}
		}
	}

	/* Write out appropriate terminator and CRC and flush buffer. */
	BlockRefTableFileTerminate(&buffer);
}
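
/*
 * Sketch of the resulting on-disk layout, inferred from the write path above
 * and the read path below (not itself part of the original file):
 *
 *   uint32 magic (BLOCKREFTABLE_MAGIC)
 *   for each relation fork, in sorted order:
 *     BlockRefTableSerializedEntry (rlocator, forknum, limit_block, nchunks)
 *     uint16 chunk_usage[nchunks]         (omitted when nchunks == 0)
 *     for each chunk with nonzero usage: uint16 data[chunk_usage]
 *   all-zeroes BlockRefTableSerializedEntry (sentinel)
 *   pg_crc32c covering everything that precedes it
 */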

/*
 * Prepare to incrementally read a block reference table file.
 *
 * 'read_callback' is a function that can be called to read data from the
 * underlying file (or other data source) into our internal buffer.
 *
 * 'read_callback_arg' is an opaque argument to be passed to read_callback.
 *
 * 'error_filename' is the filename that should be included in error messages
 * if the file is found to be malformed. The value is not copied, so the
 * caller should ensure that it remains valid until done with this
 * BlockRefTableReader.
 *
 * 'error_callback' is a function to be called if the file is found to be
 * malformed. This is not used for I/O errors, which must be handled
 * internally by read_callback.
 *
 * 'error_callback_arg' is an opaque argument to be passed to error_callback.
 */
BlockRefTableReader *
CreateBlockRefTableReader(io_callback_fn read_callback,
						  void *read_callback_arg,
						  char *error_filename,
						  report_error_fn error_callback,
						  void *error_callback_arg)
{
	BlockRefTableReader *reader;
	uint32		magic;

	/* Initialize data structure. */
	reader = palloc0(sizeof(BlockRefTableReader));
	reader->buffer.io_callback = read_callback;
	reader->buffer.io_callback_arg = read_callback_arg;
	reader->error_filename = error_filename;
	reader->error_callback = error_callback;
	reader->error_callback_arg = error_callback_arg;
	INIT_CRC32C(reader->buffer.crc);

	/* Verify magic number. */
	BlockRefTableRead(reader, &magic, sizeof(uint32));
	if (magic != BLOCKREFTABLE_MAGIC)
		error_callback(error_callback_arg,
					   "file \"%s\" has wrong magic number: expected %u, found %u",
					   error_filename,
					   BLOCKREFTABLE_MAGIC, magic);

	return reader;
}

/*
 * Read next relation fork covered by this block reference table file.
 *
 * After calling this function, you must call BlockRefTableReaderGetBlocks
 * until it returns 0 before calling it again.
 */
bool
BlockRefTableReaderNextRelation(BlockRefTableReader *reader,
								RelFileLocator *rlocator,
								ForkNumber *forknum,
								BlockNumber *limit_block)
{
	BlockRefTableSerializedEntry sentry;
	BlockRefTableSerializedEntry zentry = {0};

	/*
	 * Sanity check: caller must read all blocks from all chunks before
	 * moving on to the next relation.
	 */
	Assert(reader->total_chunks == reader->consumed_chunks);

	/* Read serialized entry. */
	BlockRefTableRead(reader, &sentry,
					  sizeof(BlockRefTableSerializedEntry));

	/*
	 * If we just read the sentinel entry indicating that we've reached the
	 * end, read and check the CRC.
	 */
	if (memcmp(&sentry, &zentry, sizeof(BlockRefTableSerializedEntry)) == 0)
	{
		pg_crc32c	expected_crc;
		pg_crc32c	actual_crc;

		/*
		 * We want to know the CRC of the file excluding the 4-byte CRC
		 * itself, so copy the current value of the CRC accumulator before
		 * reading those bytes, and use the copy to finalize the calculation.
		 */
		expected_crc = reader->buffer.crc;
		FIN_CRC32C(expected_crc);

		/* Now we can read the actual value. */
		BlockRefTableRead(reader, &actual_crc, sizeof(pg_crc32c));

		/* Throw an error if there is a mismatch. */
		if (!EQ_CRC32C(expected_crc, actual_crc))
			reader->error_callback(reader->error_callback_arg,
								   "file \"%s\" has wrong checksum: expected %08X, found %08X",
								   reader->error_filename, expected_crc, actual_crc);

		return false;
	}

	/* Read chunk size array. */
	if (reader->chunk_size != NULL)
		pfree(reader->chunk_size);
	reader->chunk_size = palloc(sentry.nchunks * sizeof(uint16));
	BlockRefTableRead(reader, reader->chunk_size,
					  sentry.nchunks * sizeof(uint16));

	/* Set up for chunk scan. */
	reader->total_chunks = sentry.nchunks;
	reader->consumed_chunks = 0;

	/* Return data to caller. */
	memcpy(rlocator, &sentry.rlocator, sizeof(RelFileLocator));
	*forknum = sentry.forknum;
	*limit_block = sentry.limit_block;
	return true;
}

/*
 * Get modified blocks associated with the relation fork returned by
 * the most recent call to BlockRefTableReaderNextRelation.
 *
 * On return, block numbers will be written into the 'blocks' array, whose
 * length should be passed via 'nblocks'. The return value is the number of
 * entries actually written into the 'blocks' array, which may be less than
 * 'nblocks' if we run out of modified blocks in the relation fork before
 * we run out of room in the array.
 */
unsigned
BlockRefTableReaderGetBlocks(BlockRefTableReader *reader,
							 BlockNumber *blocks,
							 int nblocks)
{
	unsigned	blocks_found = 0;

	/* Must provide space for at least one block number to be returned. */
	Assert(nblocks > 0);

	/* Loop collecting blocks to return to caller. */
	for (;;)
	{
		uint16		next_chunk_size;

		/*
		 * If we've read at least one chunk, maybe it contains some block
		 * numbers that could satisfy caller's request.
		 */
		if (reader->consumed_chunks > 0)
		{
			uint32		chunkno = reader->consumed_chunks - 1;
			uint16		chunk_size = reader->chunk_size[chunkno];

			if (chunk_size == MAX_ENTRIES_PER_CHUNK)
			{
				/* Bitmap format, so search for bits that are set. */
				while (reader->chunk_position < BLOCKS_PER_CHUNK &&
					   blocks_found < nblocks)
				{
					uint16		chunkoffset = reader->chunk_position;
					uint16		w;

					w = reader->chunk_data[chunkoffset / BLOCKS_PER_ENTRY];
					if ((w & (1u << (chunkoffset % BLOCKS_PER_ENTRY))) != 0)
						blocks[blocks_found++] =
							chunkno * BLOCKS_PER_CHUNK + chunkoffset;
					++reader->chunk_position;
				}
			}
			else
			{
				/* Not in bitmap format, so each entry is a 2-byte offset. */
				while (reader->chunk_position < chunk_size &&
					   blocks_found < nblocks)
				{
					blocks[blocks_found++] = chunkno * BLOCKS_PER_CHUNK
						+ reader->chunk_data[reader->chunk_position];
					++reader->chunk_position;
				}
			}
		}

		/* We found enough blocks, so we're done. */
		if (blocks_found >= nblocks)
			break;

		/*
		 * We didn't find enough blocks, so we must need the next chunk. If
		 * there are none left, though, then we're done anyway.
		 */
		if (reader->consumed_chunks == reader->total_chunks)
			break;

		/*
		 * Read data for next chunk and reset scan position to beginning of
		 * chunk. Note that the next chunk might be empty, in which case we
		 * consume the chunk without actually consuming any bytes from the
		 * underlying file.
		 */
		next_chunk_size = reader->chunk_size[reader->consumed_chunks];
		if (next_chunk_size > 0)
			BlockRefTableRead(reader, reader->chunk_data,
							  next_chunk_size * sizeof(uint16));
		++reader->consumed_chunks;
		reader->chunk_position = 0;
	}

	return blocks_found;
}
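
/*
 * Illustrative only (not part of the original file): the canonical read loop
 * implied by the contract above. The 'fd' argument and the read callback
 * (defined in an earlier example) are hypothetical; the error callback is
 * whatever printf-style reporter the caller supplies.
 */
#ifdef BLKREFTABLE_EXAMPLES
static void
example_read_file(int fd, char *filename, report_error_fn report_error)
{
	BlockRefTableReader *reader;
	RelFileLocator rlocator;
	ForkNumber	forknum;
	BlockNumber limit_block;

	reader = CreateBlockRefTableReader(example_read_callback, &fd,
									   filename, report_error, NULL);
	while (BlockRefTableReaderNextRelation(reader, &rlocator, &forknum,
										   &limit_block))
	{
		BlockNumber blocks[16];
		unsigned	n;

		/* Per the contract, drain every block before the next relation. */
		while ((n = BlockRefTableReaderGetBlocks(reader, blocks, 16)) > 0)
		{
			/* ... process blocks[0 .. n-1] here ... */
		}
	}
	DestroyBlockRefTableReader(reader);
}
#endif							/* BLKREFTABLE_EXAMPLES */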

/*
 * Release memory used while reading a block reference table from a file.
 */
void
DestroyBlockRefTableReader(BlockRefTableReader *reader)
{
	if (reader->chunk_size != NULL)
	{
		pfree(reader->chunk_size);
		reader->chunk_size = NULL;
	}
	pfree(reader);
}

/*
 * Prepare to write a block reference table file incrementally.
 *
 * Caller must be able to supply BlockRefTableEntry objects sorted in the
 * appropriate order.
 */
BlockRefTableWriter *
CreateBlockRefTableWriter(io_callback_fn write_callback,
						  void *write_callback_arg)
{
	BlockRefTableWriter *writer;
	uint32		magic = BLOCKREFTABLE_MAGIC;

	/* Prepare buffer and CRC check and save callbacks. */
	writer = palloc0(sizeof(BlockRefTableWriter));
	writer->buffer.io_callback = write_callback;
	writer->buffer.io_callback_arg = write_callback_arg;
	INIT_CRC32C(writer->buffer.crc);

	/* Write magic number. */
	BlockRefTableWrite(&writer->buffer, &magic, sizeof(uint32));

	return writer;
}

/*
 * Append one entry to a block reference table file.
 *
 * Note that entries must be written in the proper order, that is, sorted by
 * tablespace, then database, then relfilenumber, then fork number. Caller
 * is responsible for supplying data in the correct order. If that seems
 * hard, use an in-memory BlockRefTable instead.
 */
void
BlockRefTableWriteEntry(BlockRefTableWriter *writer, BlockRefTableEntry *entry)
{
	BlockRefTableSerializedEntry sentry;
	unsigned	j;

	/* Convert to serialized entry format. */
	sentry.rlocator = entry->key.rlocator;
	sentry.forknum = entry->key.forknum;
	sentry.limit_block = entry->limit_block;
	sentry.nchunks = entry->nchunks;

	/* Trim trailing zero entries. */
	while (sentry.nchunks > 0 && entry->chunk_usage[sentry.nchunks - 1] == 0)
		sentry.nchunks--;

	/* Write the serialized entry itself. */
	BlockRefTableWrite(&writer->buffer, &sentry,
					   sizeof(BlockRefTableSerializedEntry));

	/* Write the untruncated portion of the chunk length array. */
	if (sentry.nchunks != 0)
		BlockRefTableWrite(&writer->buffer, entry->chunk_usage,
						   sentry.nchunks * sizeof(uint16));

	/* Write the contents of each chunk. */
	for (j = 0; j < entry->nchunks; ++j)
	{
		if (entry->chunk_usage[j] == 0)
			continue;
		BlockRefTableWrite(&writer->buffer, entry->chunk_data[j],
						   entry->chunk_usage[j] * sizeof(uint16));
	}
}

/*
 * Finalize an incremental write of a block reference table file.
 */
void
DestroyBlockRefTableWriter(BlockRefTableWriter *writer)
{
	BlockRefTableFileTerminate(&writer->buffer);
	pfree(writer);
}

/*
 * Allocate a standalone BlockRefTableEntry.
 *
 * When we're manipulating a full in-memory BlockRefTable, the entries are
 * part of the hash table and are allocated by simplehash. This routine is
 * used by callers that want to write out a BlockRefTable to a file without
 * needing to store the whole thing in memory at once.
 *
 * Entries allocated by this function can be manipulated using the functions
 * BlockRefTableEntrySetLimitBlock and BlockRefTableEntryMarkBlockModified
 * and then written using BlockRefTableWriteEntry and freed using
 * BlockRefTableFreeEntry.
 */
BlockRefTableEntry *
CreateBlockRefTableEntry(RelFileLocator rlocator, ForkNumber forknum)
{
	BlockRefTableEntry *entry = palloc0(sizeof(BlockRefTableEntry));

	memcpy(&entry->key.rlocator, &rlocator, sizeof(RelFileLocator));
	entry->key.forknum = forknum;
	entry->limit_block = InvalidBlockNumber;

	return entry;
}
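
/*
 * Illustrative only (not part of the original file): the standalone-entry
 * flow described in the comment above, for one relation fork. Entries for
 * additional forks would have to be emitted in sorted order; the write
 * callback is the hypothetical one from an earlier example.
 */
#ifdef BLKREFTABLE_EXAMPLES
static void
example_incremental_write(RelFileLocator rlocator, int fd)
{
	BlockRefTableWriter *writer;
	BlockRefTableEntry *entry;

	writer = CreateBlockRefTableWriter(example_write_callback, &fd);

	entry = CreateBlockRefTableEntry(rlocator, MAIN_FORKNUM);
	BlockRefTableEntryMarkBlockModified(entry, MAIN_FORKNUM, 42);
	BlockRefTableEntrySetLimitBlock(entry, 100);
	BlockRefTableWriteEntry(writer, entry);
	BlockRefTableFreeEntry(entry);

	DestroyBlockRefTableWriter(writer); /* writes sentinel and CRC */
}
#endif							/* BLKREFTABLE_EXAMPLES */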

/*
 * Update a BlockRefTableEntry with a new value for the "limit block" and
 * forget any equal-or-higher-numbered modified blocks.
 *
 * The "limit block" is the shortest known length of the relation within the
 * range of WAL records covered by this block reference table.
 */
void
BlockRefTableEntrySetLimitBlock(BlockRefTableEntry *entry,
								BlockNumber limit_block)
{
	unsigned	chunkno;
	unsigned	limit_chunkno;
	unsigned	limit_chunkoffset;
	BlockRefTableChunk limit_chunk;

	/* If we already have an equal or lower limit block, do nothing. */
	if (limit_block >= entry->limit_block)
		return;

	/* Record the new limit block value. */
	entry->limit_block = limit_block;

	/*
	 * Figure out which chunk would store the state of the new limit block,
	 * and which offset within that chunk.
	 */
	limit_chunkno = limit_block / BLOCKS_PER_CHUNK;
	limit_chunkoffset = limit_block % BLOCKS_PER_CHUNK;

	/*
	 * If the number of chunks is not large enough for any blocks with equal
	 * or higher block numbers to exist, then there is nothing further to do.
	 */
	if (limit_chunkno >= entry->nchunks)
		return;

	/* Discard entire contents of any higher-numbered chunks. */
	for (chunkno = limit_chunkno + 1; chunkno < entry->nchunks; ++chunkno)
		entry->chunk_usage[chunkno] = 0;

	/*
	 * Next, we need to discard any offsets within the chunk that would
	 * contain the limit_block. We must handle this differently depending on
	 * whether the chunk that would contain limit_block is a bitmap or an
	 * array of offsets.
	 */
	limit_chunk = entry->chunk_data[limit_chunkno];
	if (entry->chunk_usage[limit_chunkno] == MAX_ENTRIES_PER_CHUNK)
	{
		unsigned	chunkoffset;

		/* It's a bitmap. Unset bits. */
		for (chunkoffset = limit_chunkoffset; chunkoffset < BLOCKS_PER_CHUNK;
			 ++chunkoffset)
			limit_chunk[chunkoffset / BLOCKS_PER_ENTRY] &=
				~(1 << (chunkoffset % BLOCKS_PER_ENTRY));
	}
	else
	{
		unsigned	i,
					j = 0;

		/* It's an offset array. Filter out large offsets. */
		for (i = 0; i < entry->chunk_usage[limit_chunkno]; ++i)
		{
			Assert(j <= i);
			if (limit_chunk[i] < limit_chunkoffset)
				limit_chunk[j++] = limit_chunk[i];
		}
		Assert(j <= entry->chunk_usage[limit_chunkno]);
		entry->chunk_usage[limit_chunkno] = j;
	}
}

/*
 * Mark a block in a given BlockRefTableEntry as known to have been modified.
 */
void
BlockRefTableEntryMarkBlockModified(BlockRefTableEntry *entry,
									ForkNumber forknum,
									BlockNumber blknum)
{
	unsigned	chunkno;
	unsigned	chunkoffset;
	unsigned	i;

	/*
	 * Which chunk should store the state of this block? And what is the
	 * offset of this block relative to the start of that chunk?
	 */
	chunkno = blknum / BLOCKS_PER_CHUNK;
	chunkoffset = blknum % BLOCKS_PER_CHUNK;

	/*
	 * If 'nchunks' isn't big enough for us to be able to represent the state
	 * of this block, we need to enlarge our arrays.
	 */
	if (chunkno >= entry->nchunks)
	{
		unsigned	max_chunks;
		unsigned	extra_chunks;

		/*
		 * New array size is a power of 2, at least 16, big enough so that
		 * chunkno will be a valid array index.
		 */
		max_chunks = Max(16, entry->nchunks);
		while (max_chunks < chunkno + 1)
			max_chunks *= 2;
		extra_chunks = max_chunks - entry->nchunks;

		if (entry->nchunks == 0)
		{
			entry->chunk_size = palloc0(sizeof(uint16) * max_chunks);
			entry->chunk_usage = palloc0(sizeof(uint16) * max_chunks);
			entry->chunk_data =
				palloc0(sizeof(BlockRefTableChunk) * max_chunks);
		}
		else
		{
			entry->chunk_size = repalloc(entry->chunk_size,
										 sizeof(uint16) * max_chunks);
			memset(&entry->chunk_size[entry->nchunks], 0,
				   extra_chunks * sizeof(uint16));
			entry->chunk_usage = repalloc(entry->chunk_usage,
										  sizeof(uint16) * max_chunks);
			memset(&entry->chunk_usage[entry->nchunks], 0,
				   extra_chunks * sizeof(uint16));
			entry->chunk_data = repalloc(entry->chunk_data,
										 sizeof(BlockRefTableChunk) * max_chunks);
			memset(&entry->chunk_data[entry->nchunks], 0,
				   extra_chunks * sizeof(BlockRefTableChunk));
		}
		entry->nchunks = max_chunks;
	}

	/*
	 * If the chunk that covers this block number doesn't exist yet, create
	 * it as an array and add the appropriate offset to it. We make it pretty
	 * small initially, because there might only be 1 or a few block
	 * references in this chunk and we don't want to use up too much memory.
	 */
	if (entry->chunk_size[chunkno] == 0)
	{
		entry->chunk_data[chunkno] =
			palloc(sizeof(uint16) * INITIAL_ENTRIES_PER_CHUNK);
		entry->chunk_size[chunkno] = INITIAL_ENTRIES_PER_CHUNK;
		entry->chunk_data[chunkno][0] = chunkoffset;
		entry->chunk_usage[chunkno] = 1;
		return;
	}

	/*
	 * If the number of entries in this chunk is already maximum, it must be
	 * a bitmap. Just set the appropriate bit.
	 */
	if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK)
	{
		BlockRefTableChunk chunk = entry->chunk_data[chunkno];

		chunk[chunkoffset / BLOCKS_PER_ENTRY] |=
			1 << (chunkoffset % BLOCKS_PER_ENTRY);
		return;
	}

	/*
	 * There is an existing chunk and it's in array format. Let's find out
	 * whether it already has an entry for this block. If so, we do not need
	 * to do anything.
	 */
	for (i = 0; i < entry->chunk_usage[chunkno]; ++i)
	{
		if (entry->chunk_data[chunkno][i] == chunkoffset)
			return;
	}

	/*
	 * If the number of entries currently used is one less than the maximum,
	 * it's time to convert to bitmap format.
	 */
	if (entry->chunk_usage[chunkno] == MAX_ENTRIES_PER_CHUNK - 1)
	{
		BlockRefTableChunk newchunk;
		unsigned	j;

		/* Allocate a new chunk. */
		newchunk = palloc0(MAX_ENTRIES_PER_CHUNK * sizeof(uint16));

		/* Set the bit for each existing entry. */
		for (j = 0; j < entry->chunk_usage[chunkno]; ++j)
		{
			unsigned	coff = entry->chunk_data[chunkno][j];

			newchunk[coff / BLOCKS_PER_ENTRY] |=
				1 << (coff % BLOCKS_PER_ENTRY);
		}

		/* Set the bit for the new entry. */
		newchunk[chunkoffset / BLOCKS_PER_ENTRY] |=
			1 << (chunkoffset % BLOCKS_PER_ENTRY);

		/* Swap the new chunk into place and update metadata. */
		pfree(entry->chunk_data[chunkno]);
		entry->chunk_data[chunkno] = newchunk;
		entry->chunk_size[chunkno] = MAX_ENTRIES_PER_CHUNK;
		entry->chunk_usage[chunkno] = MAX_ENTRIES_PER_CHUNK;
		return;
	}

	/*
	 * OK, we currently have an array, and we don't need to convert to a
	 * bitmap, but we do need to add a new element. If there's not enough
	 * room, we'll have to expand the array.
	 */
	if (entry->chunk_usage[chunkno] == entry->chunk_size[chunkno])
	{
		unsigned	newsize = entry->chunk_size[chunkno] * 2;

		Assert(newsize <= MAX_ENTRIES_PER_CHUNK);
		entry->chunk_data[chunkno] = repalloc(entry->chunk_data[chunkno],
											  newsize * sizeof(uint16));
		entry->chunk_size[chunkno] = newsize;
	}

	/* Now we can add the new entry. */
	entry->chunk_data[chunkno][entry->chunk_usage[chunkno]] = chunkoffset;
	entry->chunk_usage[chunkno]++;
}

/*
 * Release memory for a BlockRefTableEntry that was created by
 * CreateBlockRefTableEntry.
 */
void
BlockRefTableFreeEntry(BlockRefTableEntry *entry)
{
	if (entry->chunk_size != NULL)
	{
		pfree(entry->chunk_size);
		entry->chunk_size = NULL;
	}

	if (entry->chunk_usage != NULL)
	{
		pfree(entry->chunk_usage);
		entry->chunk_usage = NULL;
	}

	if (entry->chunk_data != NULL)
	{
		pfree(entry->chunk_data);
		entry->chunk_data = NULL;
	}

	pfree(entry);
}

/*
 * Comparator for BlockRefTableSerializedEntry objects.
 *
 * We make the tablespace OID the first column of the sort key to match
 * the on-disk tree structure.
 */
static int
BlockRefTableComparator(const void *a, const void *b)
{
	const BlockRefTableSerializedEntry *sa = a;
	const BlockRefTableSerializedEntry *sb = b;

	if (sa->rlocator.spcOid > sb->rlocator.spcOid)
		return 1;
	if (sa->rlocator.spcOid < sb->rlocator.spcOid)
		return -1;

	if (sa->rlocator.dbOid > sb->rlocator.dbOid)
		return 1;
	if (sa->rlocator.dbOid < sb->rlocator.dbOid)
		return -1;

	if (sa->rlocator.relNumber > sb->rlocator.relNumber)
		return 1;
	if (sa->rlocator.relNumber < sb->rlocator.relNumber)
		return -1;

	if (sa->forknum > sb->forknum)
		return 1;
	if (sa->forknum < sb->forknum)
		return -1;

	return 0;
}

/*
 * Flush any buffered data out of a BlockRefTableBuffer.
 */
static void
BlockRefTableFlush(BlockRefTableBuffer *buffer)
{
	buffer->io_callback(buffer->io_callback_arg, buffer->data, buffer->used);
	buffer->used = 0;
}

/*
 * Read data from a BlockRefTableBuffer, and update the running CRC
 * calculation for the returned data (but not any data that we may have
 * buffered but not yet actually returned).
 */
static void
BlockRefTableRead(BlockRefTableReader *reader, void *data, int length)
{
	BlockRefTableBuffer *buffer = &reader->buffer;

	/* Loop until read is fully satisfied. */
	while (length > 0)
	{
		if (buffer->cursor < buffer->used)
		{
			/*
			 * If any buffered data is available, use that to satisfy as much
			 * of the request as possible.
			 */
			int			bytes_to_copy = Min(length, buffer->used - buffer->cursor);

			memcpy(data, &buffer->data[buffer->cursor], bytes_to_copy);
			COMP_CRC32C(buffer->crc, &buffer->data[buffer->cursor],
						bytes_to_copy);
			buffer->cursor += bytes_to_copy;
			data = ((char *) data) + bytes_to_copy;
			length -= bytes_to_copy;
		}
		else if (length >= BUFSIZE)
		{
			/*
			 * If the request length is long, read directly into caller's
			 * buffer.
			 */
			int			bytes_read;

			bytes_read = buffer->io_callback(buffer->io_callback_arg,
											 data, length);
			COMP_CRC32C(buffer->crc, data, bytes_read);
			data = ((char *) data) + bytes_read;
			length -= bytes_read;

			/* If we didn't get anything, that's bad. */
			if (bytes_read == 0)
				reader->error_callback(reader->error_callback_arg,
									   "file \"%s\" ends unexpectedly",
									   reader->error_filename);
		}
		else
		{
			/*
			 * Refill our buffer.
			 */
			buffer->used = buffer->io_callback(buffer->io_callback_arg,
											   buffer->data, BUFSIZE);
			buffer->cursor = 0;

			/* If we didn't get anything, that's bad. */
			if (buffer->used == 0)
				reader->error_callback(reader->error_callback_arg,
									   "file \"%s\" ends unexpectedly",
									   reader->error_filename);
		}
	}
}

/*
 * Supply data to a BlockRefTableBuffer for writing to the underlying file,
 * and update the running CRC calculation for that data.
 */
static void
BlockRefTableWrite(BlockRefTableBuffer *buffer, void *data, int length)
{
	/* Update running CRC calculation. */
	COMP_CRC32C(buffer->crc, data, length);

	/* If the new data can't fit into the buffer, flush the buffer. */
	if (buffer->used + length > BUFSIZE)
	{
		buffer->io_callback(buffer->io_callback_arg, buffer->data,
							buffer->used);
		buffer->used = 0;
	}

	/* If the new data would fill the buffer, or more, write it directly. */
	if (length >= BUFSIZE)
	{
		buffer->io_callback(buffer->io_callback_arg, data, length);
		return;
	}

	/* Otherwise, copy the new data into the buffer. */
	memcpy(&buffer->data[buffer->used], data, length);
	buffer->used += length;
	Assert(buffer->used <= BUFSIZE);
}

/*
 * Generate the sentinel and CRC required at the end of a block reference
 * table file and flush them out of our internal buffer.
 */
static void
BlockRefTableFileTerminate(BlockRefTableBuffer *buffer)
{
	BlockRefTableSerializedEntry zentry = {0};
	pg_crc32c	crc;

	/* Write a sentinel indicating that there are no more entries. */
	BlockRefTableWrite(buffer, &zentry,
					   sizeof(BlockRefTableSerializedEntry));

	/*
	 * Writing the checksum will perturb the ongoing checksum calculation, so
	 * copy the state first and finalize the computation using the copy.
	 */
	crc = buffer->crc;
	FIN_CRC32C(crc);
	BlockRefTableWrite(buffer, &crc, sizeof(pg_crc32c));

	/* Flush any leftover data out of our buffer. */
	BlockRefTableFlush(buffer);
}