/src/postgres/src/backend/access/transam/xlogutils.c
Line | Count | Source
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * xlogutils.c |
4 | | * |
5 | | * PostgreSQL write-ahead log manager utility routines |
6 | | * |
7 | | * This file contains support routines that are used by XLOG replay functions. |
8 | | * None of this code is used during normal system operation. |
9 | | * |
10 | | * |
11 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
12 | | * Portions Copyright (c) 1994, Regents of the University of California |
13 | | * |
14 | | * src/backend/access/transam/xlogutils.c |
15 | | * |
16 | | *------------------------------------------------------------------------- |
17 | | */ |
18 | | #include "postgres.h" |
19 | | |
20 | | #include <unistd.h> |
21 | | |
22 | | #include "access/timeline.h" |
23 | | #include "access/xlogrecovery.h" |
24 | | #include "access/xlog_internal.h" |
25 | | #include "access/xlogutils.h" |
26 | | #include "miscadmin.h" |
27 | | #include "storage/fd.h" |
28 | | #include "storage/smgr.h" |
29 | | #include "utils/hsearch.h" |
30 | | #include "utils/rel.h" |
31 | | |
32 | | |
33 | | /* GUC variable */ |
34 | | bool ignore_invalid_pages = false; |
35 | | |
36 | | /* |
37 | | * Are we doing recovery from XLOG? |
38 | | * |
39 | | * This is only ever true in the startup process; it should be read as meaning |
40 | | * "this process is replaying WAL records", rather than "the system is in |
41 | | * recovery mode". It should be examined primarily by functions that need |
42 | | * to act differently when called from a WAL redo function (e.g., to skip WAL |
43 | | * logging). To check whether the system is in recovery regardless of which |
44 | | * process you're running in, use RecoveryInProgress() but only after shared |
45 | | * memory startup and lock initialization. |
46 | | * |
47 | | * This is updated from xlog.c and xlogrecovery.c, but lives here because |
48 | | * it's mostly read by WAL redo functions. |
49 | | */ |
50 | | bool InRecovery = false; |
51 | | |
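As a hedged illustration of the "skip WAL logging" case described above, a storage routine outside this file might consult InRecovery roughly like this (a minimal sketch: example_update_page is an invented name; log_newpage_buffer() is the real helper from access/xloginsert.h):

    #include "postgres.h"
    #include "access/xloginsert.h"  /* log_newpage_buffer() */
    #include "access/xlogutils.h"   /* InRecovery */
    #include "storage/bufmgr.h"

    static void
    example_update_page(Buffer buf)
    {
        /* ... modify the exclusively-locked page ... */
        MarkBufferDirty(buf);

        /*
         * In a redo function the change being applied already came from a
         * WAL record, so emit new WAL only during normal operation.
         */
        if (!InRecovery)
            log_newpage_buffer(buf, true);  /* true: standard page layout */
    }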
52 | | /* Are we in Hot Standby mode? Only valid in startup process, see xlogutils.h */ |
53 | | HotStandbyState standbyState = STANDBY_DISABLED; |
54 | | |
55 | | /* |
56 | | * During XLOG replay, we may see XLOG records for incremental updates of |
57 | | * pages that no longer exist, because their relation was later dropped or |
58 | | * truncated. (Note: this is only possible when full_page_writes = OFF, |
59 | | * since when it's ON, the first reference we see to a page should always |
60 | | * be a full-page rewrite not an incremental update.) Rather than simply |
61 | | * ignoring such records, we make a note of the referenced page, and then |
62 | | * complain if we don't actually see a drop or truncate covering the page |
63 | | * later in replay. |
64 | | */ |
65 | | typedef struct xl_invalid_page_key |
66 | | { |
67 | | RelFileLocator locator; /* the relation */ |
68 | | ForkNumber forkno; /* the fork number */ |
69 | | BlockNumber blkno; /* the page */ |
70 | | } xl_invalid_page_key; |
71 | | |
72 | | typedef struct xl_invalid_page |
73 | | { |
74 | | xl_invalid_page_key key; /* hash key ... must be first */ |
75 | | bool present; /* page existed but contained zeroes */ |
76 | | } xl_invalid_page; |
77 | | |
78 | | static HTAB *invalid_page_tab = NULL; |
79 | | |
80 | | static int read_local_xlog_page_guts(XLogReaderState *state, XLogRecPtr targetPagePtr, |
81 | | int reqLen, XLogRecPtr targetRecPtr, |
82 | | char *cur_page, bool wait_for_wal); |
83 | | |
84 | | /* Report a reference to an invalid page */ |
85 | | static void |
86 | | report_invalid_page(int elevel, RelFileLocator locator, ForkNumber forkno, |
87 | | BlockNumber blkno, bool present) |
88 | 0 | { |
89 | 0 | RelPathStr path = relpathperm(locator, forkno); |
90 | |
91 | 0 | if (present) |
92 | 0 | elog(elevel, "page %u of relation %s is uninitialized", |
93 | 0 | blkno, path.str); |
94 | 0 | else |
95 | 0 | elog(elevel, "page %u of relation %s does not exist", |
96 | 0 | blkno, path.str); |
97 | 0 | } |
98 | | |
99 | | /* Log a reference to an invalid page */ |
100 | | static void |
101 | | log_invalid_page(RelFileLocator locator, ForkNumber forkno, BlockNumber blkno, |
102 | | bool present) |
103 | 0 | { |
104 | 0 | xl_invalid_page_key key; |
105 | 0 | xl_invalid_page *hentry; |
106 | 0 | bool found; |
107 | | |
108 | | /* |
109 | | * Once recovery has reached a consistent state, the invalid-page table |
110 | | * should be empty and remain so. If a reference to an invalid page is |
111 | | * found after consistency is reached, PANIC immediately. This might seem |
112 | | * aggressive, but it's better than letting the invalid reference linger |
113 | | * in the hash table until the end of recovery and PANIC there, which |
114 | | * might come only much later if this is a standby server. |
115 | | */ |
116 | 0 | if (reachedConsistency) |
117 | 0 | { |
118 | 0 | report_invalid_page(WARNING, locator, forkno, blkno, present); |
119 | 0 | elog(ignore_invalid_pages ? WARNING : PANIC, |
120 | 0 | "WAL contains references to invalid pages"); |
121 | 0 | } |
122 | | |
123 | | /* |
124 | | * Log references to invalid pages at DEBUG1 level. This allows some |
125 | | * tracing of the cause (note the elog context mechanism will tell us |
126 | | * something about the XLOG record that generated the reference). |
127 | | */ |
128 | 0 | if (message_level_is_interesting(DEBUG1)) |
129 | 0 | report_invalid_page(DEBUG1, locator, forkno, blkno, present); |
130 | |
131 | 0 | if (invalid_page_tab == NULL) |
132 | 0 | { |
133 | | /* create hash table when first needed */ |
134 | 0 | HASHCTL ctl; |
135 | |
136 | 0 | ctl.keysize = sizeof(xl_invalid_page_key); |
137 | 0 | ctl.entrysize = sizeof(xl_invalid_page); |
138 | |
139 | 0 | invalid_page_tab = hash_create("XLOG invalid-page table", |
140 | 0 | 100, |
141 | 0 | &ctl, |
142 | 0 | HASH_ELEM | HASH_BLOBS); |
143 | 0 | } |
144 | | |
145 | | /* we currently assume xl_invalid_page_key contains no padding */ |
146 | 0 | key.locator = locator; |
147 | 0 | key.forkno = forkno; |
148 | 0 | key.blkno = blkno; |
149 | 0 | hentry = (xl_invalid_page *) |
150 | 0 | hash_search(invalid_page_tab, &key, HASH_ENTER, &found); |
151 | |
152 | 0 | if (!found) |
153 | 0 | { |
154 | | /* hash_search already filled in the key */ |
155 | 0 | hentry->present = present; |
156 | 0 | } |
157 | 0 | else |
158 | 0 | { |
159 | | /* repeat reference ... leave "present" as it was */ |
160 | 0 | } |
161 | 0 | } |
162 | | |
163 | | /* Forget any invalid pages >= minblkno, because they've been dropped */ |
164 | | static void |
165 | | forget_invalid_pages(RelFileLocator locator, ForkNumber forkno, |
166 | | BlockNumber minblkno) |
167 | 0 | { |
168 | 0 | HASH_SEQ_STATUS status; |
169 | 0 | xl_invalid_page *hentry; |
170 | |
171 | 0 | if (invalid_page_tab == NULL) |
172 | 0 | return; /* nothing to do */ |
173 | | |
174 | 0 | hash_seq_init(&status, invalid_page_tab); |
175 | |
176 | 0 | while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) |
177 | 0 | { |
178 | 0 | if (RelFileLocatorEquals(hentry->key.locator, locator) && |
179 | 0 | hentry->key.forkno == forkno && |
180 | 0 | hentry->key.blkno >= minblkno) |
181 | 0 | { |
182 | 0 | elog(DEBUG2, "page %u of relation %s has been dropped", |
183 | 0 | hentry->key.blkno, |
184 | 0 | relpathperm(hentry->key.locator, forkno).str); |
185 | | |
186 | 0 | if (hash_search(invalid_page_tab, |
187 | 0 | &hentry->key, |
188 | 0 | HASH_REMOVE, NULL) == NULL) |
189 | 0 | elog(ERROR, "hash table corrupted"); |
190 | 0 | } |
191 | 0 | } |
192 | 0 | } |
193 | | |
194 | | /* Forget any invalid pages in a whole database */ |
195 | | static void |
196 | | forget_invalid_pages_db(Oid dbid) |
197 | 0 | { |
198 | 0 | HASH_SEQ_STATUS status; |
199 | 0 | xl_invalid_page *hentry; |
200 | |
201 | 0 | if (invalid_page_tab == NULL) |
202 | 0 | return; /* nothing to do */ |
203 | | |
204 | 0 | hash_seq_init(&status, invalid_page_tab); |
205 | |
206 | 0 | while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) |
207 | 0 | { |
208 | 0 | if (hentry->key.locator.dbOid == dbid) |
209 | 0 | { |
210 | 0 | elog(DEBUG2, "page %u of relation %s has been dropped", |
211 | 0 | hentry->key.blkno, |
212 | 0 | relpathperm(hentry->key.locator, hentry->key.forkno).str); |
213 | | |
214 | 0 | if (hash_search(invalid_page_tab, |
215 | 0 | &hentry->key, |
216 | 0 | HASH_REMOVE, NULL) == NULL) |
217 | 0 | elog(ERROR, "hash table corrupted"); |
218 | 0 | } |
219 | 0 | } |
220 | 0 | } |
221 | | |
222 | | /* Are there any unresolved references to invalid pages? */ |
223 | | bool |
224 | | XLogHaveInvalidPages(void) |
225 | 0 | { |
226 | 0 | if (invalid_page_tab != NULL && |
227 | 0 | hash_get_num_entries(invalid_page_tab) > 0) |
228 | 0 | return true; |
229 | 0 | return false; |
230 | 0 | } |
231 | | |
232 | | /* Complain about any remaining invalid-page entries */ |
233 | | void |
234 | | XLogCheckInvalidPages(void) |
235 | 0 | { |
236 | 0 | HASH_SEQ_STATUS status; |
237 | 0 | xl_invalid_page *hentry; |
238 | 0 | bool foundone = false; |
239 | |
240 | 0 | if (invalid_page_tab == NULL) |
241 | 0 | return; /* nothing to do */ |
242 | | |
243 | 0 | hash_seq_init(&status, invalid_page_tab); |
244 | | |
245 | | /* |
246 | | * Our strategy is to emit WARNING messages for all remaining entries and |
247 | | * only PANIC after we've dumped all the available info. |
248 | | */ |
249 | 0 | while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL) |
250 | 0 | { |
251 | 0 | report_invalid_page(WARNING, hentry->key.locator, hentry->key.forkno, |
252 | 0 | hentry->key.blkno, hentry->present); |
253 | 0 | foundone = true; |
254 | 0 | } |
255 | |
256 | 0 | if (foundone) |
257 | 0 | elog(ignore_invalid_pages ? WARNING : PANIC, |
258 | 0 | "WAL contains references to invalid pages"); |
259 | | |
260 | 0 | hash_destroy(invalid_page_tab); |
261 | 0 | invalid_page_tab = NULL; |
262 | 0 | } |
263 | | |
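Taken together, the routines above implement a three-step protocol: record a reference when a page is missing, forget it when a covering drop or truncate record is replayed (see XLogDropRelation/XLogTruncateRelation below), and PANIC at the end of recovery if anything is left. A minimal sketch of the first step from a redo routine's point of view (example_redo_missing_page is invented; the calls are this file's real entry points):

    static void
    example_redo_missing_page(RelFileLocator rlocator, BlockNumber blkno)
    {
        Buffer  buf = XLogReadBufferExtended(rlocator, MAIN_FORKNUM, blkno,
                                             RBM_NORMAL, InvalidBuffer);

        if (!BufferIsValid(buf))
            return;         /* log_invalid_page() recorded the reference */

        /* ... apply the update; RBM_NORMAL returns the buffer unlocked ... */
        ReleaseBuffer(buf);
    }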
264 | | |
265 | | /* |
266 | | * XLogReadBufferForRedo |
267 | | * Read a page during XLOG replay |
268 | | * |
269 | | * Reads a block referenced by a WAL record into shared buffer cache, and |
270 | | * determines what needs to be done to redo the changes to it. If the WAL |
271 | | * record includes a full-page image of the page, it is restored. |
272 | | * |
273 | | * 'record.EndRecPtr' is compared to the page's LSN to determine if the record |
274 | | * has already been replayed. 'block_id' is the ID number the block was |
275 | | * registered with, when the WAL record was created. |
276 | | * |
277 | | * Returns one of the following: |
278 | | * |
279 | | * BLK_NEEDS_REDO - changes from the WAL record need to be applied |
280 | | * BLK_DONE - block doesn't need replaying |
281 | | * BLK_RESTORED - block was restored from a full-page image included in |
282 | | * the record |
283 | | * BLK_NOTFOUND - block was not found (because it was truncated away by |
284 | | * an operation later in the WAL stream) |
285 | | * |
286 | | * On return, the buffer is locked in exclusive-mode, and returned in *buf. |
287 | | * Note that the buffer is locked and returned even if it doesn't need |
288 | | * replaying. (Getting the buffer lock is not really necessary during |
289 | | * single-process crash recovery, but some subroutines such as MarkBufferDirty |
290 | | * will complain if we don't have the lock. In hot standby mode it's |
291 | | * definitely necessary.) |
292 | | * |
293 | | * Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag |
294 | | * set, we restore it, even if the page in the database appears newer. This |
295 | | * is to protect ourselves against database pages that were partially or |
296 | | * incorrectly written during a crash. We assume that the XLOG data must be |
297 | | * good because it has passed a CRC check, while the database page might not |
298 | | * be. This will force us to replay all subsequent modifications of the page |
299 | | * that appear in XLOG, rather than possibly ignoring them as already |
300 | | * applied, but that's not a huge drawback. |
301 | | */ |
302 | | XLogRedoAction |
303 | | XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, |
304 | | Buffer *buf) |
305 | 0 | { |
306 | 0 | return XLogReadBufferForRedoExtended(record, block_id, RBM_NORMAL, |
307 | 0 | false, buf); |
308 | 0 | } |
309 | | |
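For context, a typical rmgr redo routine drives XLogReadBufferForRedo with the pattern below — a minimal sketch modeled on PostgreSQL's heap and btree redo functions; example_redo and the elided page edits are invented:

    static void
    example_redo(XLogReaderState *record)
    {
        Buffer      buffer;

        if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
        {
            Page    page = BufferGetPage(buffer);

            /* ... apply the changes described by the WAL record ... */

            PageSetLSN(page, record->EndRecPtr);
            MarkBufferDirty(buffer);
        }
        if (BufferIsValid(buffer))      /* BLK_NOTFOUND leaves it invalid */
            UnlockReleaseBuffer(buffer);
    }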
310 | | /* |
311 | | * Pin and lock a buffer referenced by a WAL record, for the purpose of |
312 | | * re-initializing it. |
313 | | */ |
314 | | Buffer |
315 | | XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id) |
316 | 0 | { |
317 | 0 | Buffer buf; |
318 | |
319 | 0 | XLogReadBufferForRedoExtended(record, block_id, RBM_ZERO_AND_LOCK, false, |
320 | 0 | &buf); |
321 | 0 | return buf; |
322 | 0 | } |
323 | | |
324 | | /* |
325 | | * XLogReadBufferForRedoExtended |
326 | | * Like XLogReadBufferForRedo, but with extra options. |
327 | | * |
328 | | * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended |
329 | | * with all-zeroes pages up to the referenced block number. In |
330 | | * RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value |
331 | | * is always BLK_NEEDS_REDO. |
332 | | * |
333 | | * (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock |
334 | | * parameter. Do not use an inconsistent combination!) |
335 | | * |
336 | | * If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer |
337 | | * using LockBufferForCleanup(), instead of a regular exclusive lock. |
338 | | */ |
339 | | XLogRedoAction |
340 | | XLogReadBufferForRedoExtended(XLogReaderState *record, |
341 | | uint8 block_id, |
342 | | ReadBufferMode mode, bool get_cleanup_lock, |
343 | | Buffer *buf) |
344 | 0 | { |
345 | 0 | XLogRecPtr lsn = record->EndRecPtr; |
346 | 0 | RelFileLocator rlocator; |
347 | 0 | ForkNumber forknum; |
348 | 0 | BlockNumber blkno; |
349 | 0 | Buffer prefetch_buffer; |
350 | 0 | Page page; |
351 | 0 | bool zeromode; |
352 | 0 | bool willinit; |
353 | |
354 | 0 | if (!XLogRecGetBlockTagExtended(record, block_id, &rlocator, &forknum, &blkno, |
355 | 0 | &prefetch_buffer)) |
356 | 0 | { |
357 | | /* Caller specified a bogus block_id */ |
358 | 0 | elog(PANIC, "failed to locate backup block with ID %d in WAL record", |
359 | 0 | block_id); |
360 | 0 | } |
361 | | |
362 | | /* |
363 | | * Make sure that if the block is marked with WILL_INIT, the caller is |
364 | | * going to initialize it. And vice versa. |
365 | | */ |
366 | 0 | zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK); |
367 | 0 | willinit = (XLogRecGetBlock(record, block_id)->flags & BKPBLOCK_WILL_INIT) != 0; |
368 | 0 | if (willinit && !zeromode) |
369 | 0 | elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine"); |
370 | 0 | if (!willinit && zeromode) |
371 | 0 | elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record"); |
372 | | |
373 | | /* If it has a full-page image and it should be restored, do it. */ |
374 | 0 | if (XLogRecBlockImageApply(record, block_id)) |
375 | 0 | { |
376 | 0 | Assert(XLogRecHasBlockImage(record, block_id)); |
377 | 0 | *buf = XLogReadBufferExtended(rlocator, forknum, blkno, |
378 | 0 | get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK, |
379 | 0 | prefetch_buffer); |
380 | 0 | page = BufferGetPage(*buf); |
381 | 0 | if (!RestoreBlockImage(record, block_id, page)) |
382 | 0 | ereport(ERROR, |
383 | 0 | (errcode(ERRCODE_INTERNAL_ERROR), |
384 | 0 | errmsg_internal("%s", record->errormsg_buf))); |
385 | | |
386 | | /* |
387 | | * The page may be uninitialized. If so, we can't set the LSN because |
388 | | * that would corrupt the page. |
389 | | */ |
390 | 0 | if (!PageIsNew(page)) |
391 | 0 | { |
392 | 0 | PageSetLSN(page, lsn); |
393 | 0 | } |
394 | |
395 | 0 | MarkBufferDirty(*buf); |
396 | | |
397 | | /* |
398 | | * At the end of crash recovery the init forks of unlogged relations |
399 | | * are copied, without going through shared buffers. So we need to |
400 | | * force the on-disk state of init forks to always be in sync with the |
401 | | * state in shared buffers. |
402 | | */ |
403 | 0 | if (forknum == INIT_FORKNUM) |
404 | 0 | FlushOneBuffer(*buf); |
405 | |
406 | 0 | return BLK_RESTORED; |
407 | 0 | } |
408 | 0 | else |
409 | 0 | { |
410 | 0 | *buf = XLogReadBufferExtended(rlocator, forknum, blkno, mode, prefetch_buffer); |
411 | 0 | if (BufferIsValid(*buf)) |
412 | 0 | { |
413 | 0 | if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK) |
414 | 0 | { |
415 | 0 | if (get_cleanup_lock) |
416 | 0 | LockBufferForCleanup(*buf); |
417 | 0 | else |
418 | 0 | LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE); |
419 | 0 | } |
420 | 0 | if (lsn <= PageGetLSN(BufferGetPage(*buf))) |
421 | 0 | return BLK_DONE; |
422 | 0 | else |
423 | 0 | return BLK_NEEDS_REDO; |
424 | 0 | } |
425 | 0 | else |
426 | 0 | return BLK_NOTFOUND; |
427 | 0 | } |
428 | 0 | } |
429 | | |
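The get_cleanup_lock path serves records whose redo must mirror VACUUM-style page surgery: a cleanup lock proves no other backend holds a pin on the page, so its contents can be rearranged safely even in hot standby. A hedged sketch (example_prune_redo and the elided pruning work are invented):

    static void
    example_prune_redo(XLogReaderState *record)
    {
        Buffer      buffer;

        if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL,
                                          true /* get_cleanup_lock */,
                                          &buffer) == BLK_NEEDS_REDO)
        {
            Page    page = BufferGetPage(buffer);

            /* ... prune/defragment; no one else has this page pinned ... */

            PageSetLSN(page, record->EndRecPtr);
            MarkBufferDirty(buffer);
        }
        if (BufferIsValid(buffer))
            UnlockReleaseBuffer(buffer);
    }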
430 | | /* |
431 | | * XLogReadBufferExtended |
432 | | * Read a page during XLOG replay |
433 | | * |
434 | | * This is functionally comparable to ReadBufferExtended. There are some |
435 | | * differences in the behavior wrt. the "mode" argument: |
436 | | * |
437 | | * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we |
438 | | * return InvalidBuffer. In this case the caller should silently skip the |
439 | | * update on this page. (In this situation, we expect that the page was later |
440 | | * dropped or truncated. If we don't see evidence of that later in the WAL |
441 | | * sequence, we'll complain at the end of WAL replay.) |
442 | | * |
443 | | * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended |
444 | | * with all-zeroes pages up to the given block number. |
445 | | * |
446 | | * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't |
447 | | * exist, and we don't check for all-zeroes. Thus, no log entry is made |
448 | | * to imply that the page should be dropped or truncated later. |
449 | | * |
450 | | * Optionally, recent_buffer can be used to provide a hint about the location |
451 | | * of the page in the buffer pool; it does not have to be correct, but avoids |
452 | | * a buffer mapping table probe if it is. |
453 | | * |
454 | | * NB: A redo function should normally not call this directly. To get a page |
455 | | * to modify, use XLogReadBufferForRedoExtended instead. It is important that |
456 | | * all pages modified by a WAL record are registered in the WAL records, or |
457 | | * they will be invisible to tools that need to know which pages are modified. |
458 | | */ |
459 | | Buffer |
460 | | XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum, |
461 | | BlockNumber blkno, ReadBufferMode mode, |
462 | | Buffer recent_buffer) |
463 | 0 | { |
464 | 0 | BlockNumber lastblock; |
465 | 0 | Buffer buffer; |
466 | 0 | SMgrRelation smgr; |
467 | |
468 | 0 | Assert(blkno != P_NEW); |
469 | | |
470 | | /* Do we have a clue where the buffer might be already? */ |
471 | 0 | if (BufferIsValid(recent_buffer) && |
472 | 0 | mode == RBM_NORMAL && |
473 | 0 | ReadRecentBuffer(rlocator, forknum, blkno, recent_buffer)) |
474 | 0 | { |
475 | 0 | buffer = recent_buffer; |
476 | 0 | goto recent_buffer_fast_path; |
477 | 0 | } |
478 | | |
479 | | /* Open the relation at smgr level */ |
480 | 0 | smgr = smgropen(rlocator, INVALID_PROC_NUMBER); |
481 | | |
482 | | /* |
483 | | * Create the target file if it doesn't already exist. This lets us cope |
484 | | * if the replay sequence contains writes to a relation that is later |
485 | | * deleted. (The original coding of this routine would instead suppress |
486 | | * the writes, but that seems like it risks losing valuable data if the |
487 | | * filesystem loses an inode during a crash. Better to write the data |
488 | | * until we are actually told to delete the file.) |
489 | | */ |
490 | 0 | smgrcreate(smgr, forknum, true); |
491 | |
492 | 0 | lastblock = smgrnblocks(smgr, forknum); |
493 | |
494 | 0 | if (blkno < lastblock) |
495 | 0 | { |
496 | | /* page exists in file */ |
497 | 0 | buffer = ReadBufferWithoutRelcache(rlocator, forknum, blkno, |
498 | 0 | mode, NULL, true); |
499 | 0 | } |
500 | 0 | else |
501 | 0 | { |
502 | | /* hm, page doesn't exist in file */ |
503 | 0 | if (mode == RBM_NORMAL) |
504 | 0 | { |
505 | 0 | log_invalid_page(rlocator, forknum, blkno, false); |
506 | 0 | return InvalidBuffer; |
507 | 0 | } |
508 | 0 | if (mode == RBM_NORMAL_NO_LOG) |
509 | 0 | return InvalidBuffer; |
510 | | /* OK to extend the file */ |
511 | | /* we do this in recovery only - no rel-extension lock needed */ |
512 | 0 | Assert(InRecovery); |
513 | 0 | buffer = ExtendBufferedRelTo(BMR_SMGR(smgr, RELPERSISTENCE_PERMANENT), |
514 | 0 | forknum, |
515 | 0 | NULL, |
516 | 0 | EB_PERFORMING_RECOVERY | |
517 | 0 | EB_SKIP_EXTENSION_LOCK, |
518 | 0 | blkno + 1, |
519 | 0 | mode); |
520 | 0 | } |
521 | | |
522 | 0 | recent_buffer_fast_path: |
523 | 0 | if (mode == RBM_NORMAL) |
524 | 0 | { |
525 | | /* check that page has been initialized */ |
526 | 0 | Page page = (Page) BufferGetPage(buffer); |
527 | | |
528 | | /* |
529 | | * We assume that PageIsNew is safe without a lock. During recovery, |
530 | | * there should be no other backends that could modify the buffer at |
531 | | * the same time. |
532 | | */ |
533 | 0 | if (PageIsNew(page)) |
534 | 0 | { |
535 | 0 | ReleaseBuffer(buffer); |
536 | 0 | log_invalid_page(rlocator, forknum, blkno, true); |
537 | 0 | return InvalidBuffer; |
538 | 0 | } |
539 | 0 | } |
540 | | |
541 | 0 | return buffer; |
542 | 0 | } |
543 | | |
544 | | /* |
545 | | * Struct actually returned by CreateFakeRelcacheEntry, though the declared |
546 | | * return type is Relation. |
547 | | */ |
548 | | typedef struct |
549 | | { |
550 | | RelationData reldata; /* Note: this must be first */ |
551 | | FormData_pg_class pgc; |
552 | | } FakeRelCacheEntryData; |
553 | | |
554 | | typedef FakeRelCacheEntryData *FakeRelCacheEntry; |
555 | | |
556 | | /* |
557 | | * Create a fake relation cache entry for a physical relation |
558 | | * |
559 | | * It's often convenient to use the same functions in XLOG replay as in the |
560 | | * main codepath, but those functions typically work with a relcache entry. |
561 | | * We don't have a working relation cache during XLOG replay, but this |
562 | | * function can be used to create a fake relcache entry instead. Only the |
563 | | * fields related to physical storage, like rd_rel, are initialized, so the |
564 | | * fake entry is only usable in low-level operations like ReadBuffer(). |
565 | | * |
566 | | * This is also used for syncing WAL-skipped files. |
567 | | * |
568 | | * Caller must free the returned entry with FreeFakeRelcacheEntry(). |
569 | | */ |
570 | | Relation |
571 | | CreateFakeRelcacheEntry(RelFileLocator rlocator) |
572 | 0 | { |
573 | 0 | FakeRelCacheEntry fakeentry; |
574 | 0 | Relation rel; |
575 | | |
576 | | /* Allocate the Relation struct and all related space in one block. */ |
577 | 0 | fakeentry = palloc0(sizeof(FakeRelCacheEntryData)); |
578 | 0 | rel = (Relation) fakeentry; |
579 | |
|
580 | 0 | rel->rd_rel = &fakeentry->pgc; |
581 | 0 | rel->rd_locator = rlocator; |
582 | | |
583 | | /* |
584 | | * We will never be working with temp rels during recovery or while |
585 | | * syncing WAL-skipped files. |
586 | | */ |
587 | 0 | rel->rd_backend = INVALID_PROC_NUMBER; |
588 | | |
589 | | /* It must be a permanent table here */ |
590 | 0 | rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT; |
591 | | |
592 | | /* We don't know the name of the relation; use relfilenumber instead */ |
593 | 0 | sprintf(RelationGetRelationName(rel), "%u", rlocator.relNumber); |
594 | | |
595 | | /* |
596 | | * We set up the lockRelId in case anything tries to lock the dummy |
597 | | * relation. Note that this is fairly bogus since relNumber may be |
598 | | * different from the relation's OID. It shouldn't really matter though. |
599 | | * In recovery, we are running by ourselves and can't have any lock |
600 | | * conflicts. While syncing, we already hold AccessExclusiveLock. |
601 | | */ |
602 | 0 | rel->rd_lockInfo.lockRelId.dbId = rlocator.dbOid; |
603 | 0 | rel->rd_lockInfo.lockRelId.relId = rlocator.relNumber; |
604 | | |
605 | | /* |
606 | | * Set up a non-pinned SMgrRelation reference, so that we don't need to |
607 | | * worry about unpinning it on error. |
608 | | */ |
609 | 0 | rel->rd_smgr = smgropen(rlocator, INVALID_PROC_NUMBER); |
610 | |
611 | 0 | return rel; |
612 | 0 | } |
613 | | |
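Usage is a strict create/use/free bracket. Replay of a truncation might look roughly like this (a sketch: RelationTruncate() is the real catalog/storage.c helper, but this particular pairing is illustrative and example_truncate_redo is an invented name):

    static void
    example_truncate_redo(RelFileLocator rlocator, BlockNumber nblocks)
    {
        Relation    rel = CreateFakeRelcacheEntry(rlocator);

        /* Only the storage-related fields of 'rel' are valid; that suffices. */
        RelationTruncate(rel, nblocks);

        FreeFakeRelcacheEntry(rel);
    }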
614 | | /* |
615 | | * Free a fake relation cache entry. |
616 | | */ |
617 | | void |
618 | | FreeFakeRelcacheEntry(Relation fakerel) |
619 | 0 | { |
620 | 0 | pfree(fakerel); |
621 | 0 | } |
622 | | |
623 | | /* |
624 | | * Drop a relation during XLOG replay |
625 | | * |
626 | | * This is called when the relation is about to be deleted; we need to remove |
627 | | * any open "invalid-page" records for the relation. |
628 | | */ |
629 | | void |
630 | | XLogDropRelation(RelFileLocator rlocator, ForkNumber forknum) |
631 | 0 | { |
632 | 0 | forget_invalid_pages(rlocator, forknum, 0); |
633 | 0 | } |
634 | | |
635 | | /* |
636 | | * Drop a whole database during XLOG replay |
637 | | * |
638 | | * As above, but for DROP DATABASE instead of dropping a single rel |
639 | | */ |
640 | | void |
641 | | XLogDropDatabase(Oid dbid) |
642 | 0 | { |
643 | | /* |
644 | | * This is unnecessarily heavy-handed, as it will close SMgrRelation |
645 | | * objects for other databases as well. DROP DATABASE occurs seldom enough |
646 | | * that it's not worth introducing a variant of smgrdestroy for just this |
647 | | * purpose. |
648 | | */ |
649 | 0 | smgrdestroyall(); |
650 | |
651 | 0 | forget_invalid_pages_db(dbid); |
652 | 0 | } |
653 | | |
654 | | /* |
655 | | * Truncate a relation during XLOG replay |
656 | | * |
657 | | * We need to clean up any open "invalid-page" records for the dropped pages. |
658 | | */ |
659 | | void |
660 | | XLogTruncateRelation(RelFileLocator rlocator, ForkNumber forkNum, |
661 | | BlockNumber nblocks) |
662 | 0 | { |
663 | 0 | forget_invalid_pages(rlocator, forkNum, nblocks); |
664 | 0 | } |
665 | | |
666 | | /* |
667 | | * Determine which timeline to read an xlog page from and set the |
668 | | * XLogReaderState's currTLI to that timeline ID. |
669 | | * |
670 | | * We care about timelines in xlogreader when we might be reading xlog |
671 | | * generated prior to a promotion, either if we're currently a standby in |
672 | | * recovery or if we're a promoted primary reading xlogs generated by the old |
673 | | * primary before our promotion. |
674 | | * |
675 | | * wantPage must be set to the start address of the page to read and |
676 | | * wantLength to the amount of the page that will be read, up to |
677 | | * XLOG_BLCKSZ. If the amount to be read isn't known, pass XLOG_BLCKSZ. |
678 | | * |
679 | | * The currTLI argument should be the system-wide current timeline. |
680 | | * Note that this may be different from state->currTLI, which is the timeline |
681 | | * from which the caller is currently reading previous xlog records. |
682 | | * |
683 | | * We switch to an xlog segment from the new timeline eagerly when on a |
684 | | * historical timeline, as soon as we reach the start of the xlog segment |
685 | | * containing the timeline switch. The server copied the segment to the new |
686 | | * timeline so all the data up to the switch point is the same, but there's no |
687 | | * guarantee the old segment will still exist. It may have been deleted or |
688 | | * renamed with a .partial suffix so we can't necessarily keep reading from |
689 | | * the old TLI even though tliSwitchPoint says it's OK. |
690 | | * |
691 | | * We can't just check the timeline when we read a page on a different segment |
692 | | * to the last page. We could've received a timeline switch from a cascading |
693 | | * upstream, so the current segment ends abruptly (possibly getting renamed to |
694 | | * .partial) and we have to switch to a new one. Even in the middle of reading |
695 | | * a page we could have to dump the cached page and switch to a new TLI. |
696 | | * |
697 | | * Because of this, callers MAY NOT assume that currTLI is the timeline that |
698 | | * will be in a page's xlp_tli; the page may begin on an older timeline or we |
699 | | * might be reading from historical timeline data on a segment that's been |
700 | | * copied to a new timeline. |
701 | | * |
702 | | * The caller must also make sure it doesn't read past the current replay |
703 | | * position (using GetXLogReplayRecPtr) if executing in recovery, so it |
704 | | * doesn't fail to notice that the current timeline became historical. |
705 | | */ |
706 | | void |
707 | | XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage, |
708 | | uint32 wantLength, TimeLineID currTLI) |
709 | 0 | { |
710 | 0 | const XLogRecPtr lastReadPage = (state->seg.ws_segno * |
711 | 0 | state->segcxt.ws_segsize + state->segoff); |
712 | |
713 | 0 | Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0); |
714 | 0 | Assert(wantLength <= XLOG_BLCKSZ); |
715 | 0 | Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ); |
716 | 0 | Assert(currTLI != 0); |
717 | | |
718 | | /* |
719 | | * If the desired page is currently read in and valid, we have nothing to |
720 | | * do. |
721 | | * |
722 | | * The caller should've ensured that it didn't previously advance readOff |
723 | | * past the valid limit of this timeline, so it doesn't matter if the |
724 | | * current TLI has since become historical. |
725 | | */ |
726 | 0 | if (lastReadPage == wantPage && |
727 | 0 | state->readLen != 0 && |
728 | 0 | lastReadPage + state->readLen >= wantPage + Min(wantLength, XLOG_BLCKSZ - 1)) |
729 | 0 | return; |
730 | | |
731 | | /* |
732 | | * If we're reading from the current timeline, it hasn't become historical, |
733 | | * and the page we're reading is after the last page read, we can again |
734 | | * just carry on. (Seeking backwards requires a check to make sure the |
735 | | * older page isn't on a prior timeline). |
736 | | * |
737 | | * currTLI might've become historical since the caller obtained the value, |
738 | | * but the caller is required not to read past the flush limit it saw at |
739 | | * the time it looked up the timeline. There's nothing we can do about it |
740 | | * if StartupXLOG() renames it to .partial concurrently. |
741 | | */ |
742 | 0 | if (state->currTLI == currTLI && wantPage >= lastReadPage) |
743 | 0 | { |
744 | 0 | Assert(state->currTLIValidUntil == InvalidXLogRecPtr); |
745 | 0 | return; |
746 | 0 | } |
747 | | |
748 | | /* |
749 | | * If we're just reading pages from a previously validated historical |
750 | | * timeline and the timeline we're reading from is valid until the end of |
751 | | * the current segment we can just keep reading. |
752 | | */ |
753 | 0 | if (state->currTLIValidUntil != InvalidXLogRecPtr && |
754 | 0 | state->currTLI != currTLI && |
755 | 0 | state->currTLI != 0 && |
756 | 0 | ((wantPage + wantLength) / state->segcxt.ws_segsize) < |
757 | 0 | (state->currTLIValidUntil / state->segcxt.ws_segsize)) |
758 | 0 | return; |
759 | | |
760 | | /* |
761 | | * If we reach this point we're either looking up a page for random |
762 | | * access, the current timeline just became historical, or we're reading |
763 | | * from a new segment containing a timeline switch. In all cases we need |
764 | | * to determine the newest timeline on the segment. |
765 | | * |
766 | | * If it's the current timeline we can just keep reading from here unless |
767 | | * we detect a timeline switch that makes the current timeline historical. |
768 | | * If it's a historical timeline we can read all the segment on the newest |
769 | | * timeline because it contains all the old timelines' data too. So only |
770 | | * one switch check is required. |
771 | | */ |
772 | 0 | { |
773 | | /* |
774 | | * We need to re-read the timeline history in case it's been changed |
775 | | * by a promotion or replay from a cascaded replica. |
776 | | */ |
777 | 0 | List *timelineHistory = readTimeLineHistory(currTLI); |
778 | 0 | XLogRecPtr endOfSegment; |
779 | |
780 | 0 | endOfSegment = ((wantPage / state->segcxt.ws_segsize) + 1) * |
781 | 0 | state->segcxt.ws_segsize - 1; |
782 | 0 | Assert(wantPage / state->segcxt.ws_segsize == |
783 | 0 | endOfSegment / state->segcxt.ws_segsize); |
784 | | |
785 | | /* |
786 | | * Find the timeline of the last LSN on the segment containing |
787 | | * wantPage. |
788 | | */ |
789 | 0 | state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory); |
790 | 0 | state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory, |
791 | 0 | &state->nextTLI); |
792 | |
793 | 0 | Assert(state->currTLIValidUntil == InvalidXLogRecPtr || |
794 | 0 | wantPage + wantLength < state->currTLIValidUntil); |
795 | |
796 | 0 | list_free_deep(timelineHistory); |
797 | |
798 | 0 | elog(DEBUG3, "switched to timeline %u valid until %X/%X", |
799 | 0 | state->currTLI, |
800 | 0 | LSN_FORMAT_ARGS(state->currTLIValidUntil)); |
801 | 0 | } |
802 | 0 | } |
803 | | |
804 | | /* XLogReaderRoutine->segment_open callback for local pg_wal files */ |
805 | | void |
806 | | wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo, |
807 | | TimeLineID *tli_p) |
808 | 0 | { |
809 | 0 | TimeLineID tli = *tli_p; |
810 | 0 | char path[MAXPGPATH]; |
811 | |
812 | 0 | XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize); |
813 | 0 | state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY); |
814 | 0 | if (state->seg.ws_file >= 0) |
815 | 0 | return; |
816 | | |
817 | 0 | if (errno == ENOENT) |
818 | 0 | ereport(ERROR, |
819 | 0 | (errcode_for_file_access(), |
820 | 0 | errmsg("requested WAL segment %s has already been removed", |
821 | 0 | path))); |
822 | 0 | else |
823 | 0 | ereport(ERROR, |
824 | 0 | (errcode_for_file_access(), |
825 | 0 | errmsg("could not open file \"%s\": %m", |
826 | 0 | path))); |
827 | 0 | } |
828 | | |
829 | | /* stock XLogReaderRoutine->segment_close callback */ |
830 | | void |
831 | | wal_segment_close(XLogReaderState *state) |
832 | 0 | { |
833 | 0 | close(state->seg.ws_file); |
834 | | /* need to check errno? */ |
835 | 0 | state->seg.ws_file = -1; |
836 | 0 | } |
837 | | |
838 | | /* |
839 | | * XLogReaderRoutine->page_read callback for reading local xlog files |
840 | | * |
841 | | * Public because it would likely be very helpful for someone writing another |
842 | | * output method outside walsender, e.g. in a bgworker. |
843 | | */ |
844 | | int |
845 | | read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, |
846 | | int reqLen, XLogRecPtr targetRecPtr, char *cur_page) |
847 | 0 | { |
848 | 0 | return read_local_xlog_page_guts(state, targetPagePtr, reqLen, |
849 | 0 | targetRecPtr, cur_page, true); |
850 | 0 | } |
851 | | |
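A bgworker or extension would plug this callback, together with wal_segment_open/wal_segment_close above, into an xlogreader roughly as follows (a sketch of the standard XLogReaderAllocate() pattern; example_read_one_record is invented, start_lsn is caller-supplied, and error handling is abbreviated):

    static void
    example_read_one_record(XLogRecPtr start_lsn)
    {
        XLogReaderState *reader;
        XLogRecord *record;
        char       *errormsg;

        reader = XLogReaderAllocate(wal_segment_size, NULL,
                                    XL_ROUTINE(.page_read = &read_local_xlog_page,
                                               .segment_open = &wal_segment_open,
                                               .segment_close = &wal_segment_close),
                                    NULL);
        if (reader == NULL)
            ereport(ERROR, (errmsg("out of memory")));

        XLogBeginRead(reader, start_lsn);
        record = XLogReadRecord(reader, &errormsg);
        if (record == NULL && errormsg != NULL)
            ereport(ERROR, (errmsg_internal("%s", errormsg)));

        XLogReaderFree(reader);
    }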
852 | | /* |
853 | | * Same as read_local_xlog_page except that it doesn't wait for future WAL |
854 | | * to be available. |
855 | | */ |
856 | | int |
857 | | read_local_xlog_page_no_wait(XLogReaderState *state, XLogRecPtr targetPagePtr, |
858 | | int reqLen, XLogRecPtr targetRecPtr, |
859 | | char *cur_page) |
860 | 0 | { |
861 | 0 | return read_local_xlog_page_guts(state, targetPagePtr, reqLen, |
862 | 0 | targetRecPtr, cur_page, false); |
863 | 0 | } |
864 | | |
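Callers of the no-wait variant hang a ReadLocalXLogPageNoWaitPrivate (from access/xlogutils.h) off the reader and test its end_of_wal flag after a failed read, as pg_walinspect does. A hedged sketch (example_no_wait_read is invented; the reader is assumed to have been allocated with read_local_xlog_page_no_wait as its page_read callback):

    static void
    example_no_wait_read(XLogReaderState *reader)
    {
        ReadLocalXLogPageNoWaitPrivate priv = {0};
        XLogRecord *record;
        char       *errormsg;

        reader->private_data = &priv;

        record = XLogReadRecord(reader, &errormsg);
        if (record == NULL && priv.end_of_wal)
        {
            /* ran past the current end of WAL; return instead of sleeping */
        }
    }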
865 | | /* |
866 | | * Implementation of read_local_xlog_page and its no wait version. |
867 | | */ |
868 | | static int |
869 | | read_local_xlog_page_guts(XLogReaderState *state, XLogRecPtr targetPagePtr, |
870 | | int reqLen, XLogRecPtr targetRecPtr, |
871 | | char *cur_page, bool wait_for_wal) |
872 | 0 | { |
873 | 0 | XLogRecPtr read_upto, |
874 | 0 | loc; |
875 | 0 | TimeLineID tli; |
876 | 0 | int count; |
877 | 0 | WALReadError errinfo; |
878 | 0 | TimeLineID currTLI; |
879 | |
880 | 0 | loc = targetPagePtr + reqLen; |
881 | | |
882 | | /* |
883 | | * Loop waiting for xlog to be available if necessary |
884 | | * |
885 | | * TODO: The walsender has its own version of this function, which uses a |
886 | | * condition variable to wake up whenever WAL is flushed. We could use the |
887 | | * same infrastructure here, instead of the check/sleep/repeat style of |
888 | | * loop. |
889 | | */ |
890 | 0 | while (1) |
891 | 0 | { |
892 | | /* |
893 | | * Determine the limit of xlog we can currently read to, and what the |
894 | | * most recent timeline is. |
895 | | */ |
896 | 0 | if (!RecoveryInProgress()) |
897 | 0 | read_upto = GetFlushRecPtr(&currTLI); |
898 | 0 | else |
899 | 0 | read_upto = GetXLogReplayRecPtr(&currTLI); |
900 | 0 | tli = currTLI; |
901 | | |
902 | | /* |
903 | | * Check which timeline to get the record from. |
904 | | * |
905 | | * We have to do it each time through the loop because if we're in |
906 | | * recovery as a cascading standby, the current timeline might've |
907 | | * become historical. We can't rely on RecoveryInProgress() because in |
908 | | * a standby configuration like |
909 | | * |
910 | | * A => B => C |
911 | | * |
912 | | * if we're a logical decoding session on C, and B gets promoted, our |
913 | | * timeline will change while we remain in recovery. |
914 | | * |
915 | | * We can't just keep reading from the old timeline as the last WAL |
916 | | * archive in the timeline will get renamed to .partial by |
917 | | * StartupXLOG(). |
918 | | * |
919 | | * If that happens after our caller determined the TLI but before we |
920 | | * actually read the xlog page, we might still try to read from the |
921 | | * old (now renamed) segment and fail. There's not much we can do |
922 | | * about this, but it can only happen when we're a leaf of a cascading |
923 | | * standby whose primary gets promoted while we're decoding, so a |
924 | | * one-off ERROR isn't too bad. |
925 | | */ |
926 | 0 | XLogReadDetermineTimeline(state, targetPagePtr, reqLen, tli); |
927 | |
928 | 0 | if (state->currTLI == currTLI) |
929 | 0 | { |
930 | |
931 | 0 | if (loc <= read_upto) |
932 | 0 | break; |
933 | | |
934 | | /* If asked, let's not wait for future WAL. */ |
935 | 0 | if (!wait_for_wal) |
936 | 0 | { |
937 | 0 | ReadLocalXLogPageNoWaitPrivate *private_data; |
938 | | |
939 | | /* |
940 | | * Inform the caller of read_local_xlog_page_no_wait that the |
941 | | * end of WAL has been reached. |
942 | | */ |
943 | 0 | private_data = (ReadLocalXLogPageNoWaitPrivate *) |
944 | 0 | state->private_data; |
945 | 0 | private_data->end_of_wal = true; |
946 | 0 | break; |
947 | 0 | } |
948 | | |
949 | 0 | CHECK_FOR_INTERRUPTS(); |
950 | 0 | pg_usleep(1000L); |
951 | 0 | } |
952 | 0 | else |
953 | 0 | { |
954 | | /* |
955 | | * We're on a historical timeline, so limit reading to the switch |
956 | | * point where we moved to the next timeline. |
957 | | * |
958 | | * We don't need to GetFlushRecPtr or GetXLogReplayRecPtr. We know |
959 | | * about the new timeline, so we must've received past the end of |
960 | | * it. |
961 | | */ |
962 | 0 | read_upto = state->currTLIValidUntil; |
963 | | |
964 | | /* |
965 | | * Setting tli to our wanted record's TLI is slightly wrong; the |
966 | | * page might begin on an older timeline if it contains a timeline |
967 | | * switch, since its xlog segment will have been copied from the |
968 | | * prior timeline. This is pretty harmless though, as nothing |
969 | | * cares so long as the timeline doesn't go backwards. We should |
970 | | * read the page header instead; FIXME someday. |
971 | | */ |
972 | 0 | tli = state->currTLI; |
973 | | |
974 | | /* No need to wait on a historical timeline */ |
975 | 0 | break; |
976 | 0 | } |
977 | 0 | } |
978 | |
979 | 0 | if (targetPagePtr + XLOG_BLCKSZ <= read_upto) |
980 | 0 | { |
981 | | /* |
982 | | * more than one block available; read only that block, have caller |
983 | | * come back if they need more. |
984 | | */ |
985 | 0 | count = XLOG_BLCKSZ; |
986 | 0 | } |
987 | 0 | else if (targetPagePtr + reqLen > read_upto) |
988 | 0 | { |
989 | | /* not enough data there */ |
990 | 0 | return -1; |
991 | 0 | } |
992 | 0 | else |
993 | 0 | { |
994 | | /* enough bytes available to satisfy the request */ |
995 | 0 | count = read_upto - targetPagePtr; |
996 | 0 | } |
997 | | |
998 | 0 | if (!WALRead(state, cur_page, targetPagePtr, count, tli, |
999 | 0 | &errinfo)) |
1000 | 0 | WALReadRaiseError(&errinfo); |
1001 | | |
1002 | | /* number of valid bytes in the buffer */ |
1003 | 0 | return count; |
1004 | 0 | } |
1005 | | |
1006 | | /* |
1007 | | * Backend-specific convenience code to handle read errors encountered by |
1008 | | * WALRead(). |
1009 | | */ |
1010 | | void |
1011 | | WALReadRaiseError(WALReadError *errinfo) |
1012 | 0 | { |
1013 | 0 | WALOpenSegment *seg = &errinfo->wre_seg; |
1014 | 0 | char fname[MAXFNAMELEN]; |
1015 | |
1016 | 0 | XLogFileName(fname, seg->ws_tli, seg->ws_segno, wal_segment_size); |
1017 | |
1018 | 0 | if (errinfo->wre_read < 0) |
1019 | 0 | { |
1020 | 0 | errno = errinfo->wre_errno; |
1021 | 0 | ereport(ERROR, |
1022 | 0 | (errcode_for_file_access(), |
1023 | 0 | errmsg("could not read from WAL segment %s, offset %d: %m", |
1024 | 0 | fname, errinfo->wre_off))); |
1025 | 0 | } |
1026 | 0 | else if (errinfo->wre_read == 0) |
1027 | 0 | { |
1028 | 0 | ereport(ERROR, |
1029 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
1030 | 0 | errmsg("could not read from WAL segment %s, offset %d: read %d of %d", |
1031 | 0 | fname, errinfo->wre_off, errinfo->wre_read, |
1032 | 0 | errinfo->wre_req))); |
1033 | 0 | } |
1034 | 0 | } |