Coverage Report

Created: 2025-07-03 06:49

/src/postgres/src/backend/access/transam/xlogutils.c
(Line/Count table columns omitted below; every instrumented line in this file reported an execution count of 0 in this run.)
/*-------------------------------------------------------------------------
 *
 * xlogutils.c
 *
 * PostgreSQL write-ahead log manager utility routines
 *
 * This file contains support routines that are used by XLOG replay functions.
 * None of this code is used during normal system operation.
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/backend/access/transam/xlogutils.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>

#include "access/timeline.h"
#include "access/xlogrecovery.h"
#include "access/xlog_internal.h"
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "storage/fd.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/rel.h"


/* GUC variable */
bool    ignore_invalid_pages = false;

/*
 * Are we doing recovery from XLOG?
 *
 * This is only ever true in the startup process; it should be read as meaning
 * "this process is replaying WAL records", rather than "the system is in
 * recovery mode".  It should be examined primarily by functions that need
 * to act differently when called from a WAL redo function (e.g., to skip WAL
 * logging).  To check whether the system is in recovery regardless of which
 * process you're running in, use RecoveryInProgress() but only after shared
 * memory startup and lock initialization.
 *
 * This is updated from xlog.c and xlogrecovery.c, but lives here because
 * it's mostly read by WAL redo functions.
 */
bool    InRecovery = false;

/* Are we in Hot Standby mode? Only valid in startup process, see xlogutils.h */
HotStandbyState standbyState = STANDBY_DISABLED;

/*
 * During XLOG replay, we may see XLOG records for incremental updates of
 * pages that no longer exist, because their relation was later dropped or
 * truncated.  (Note: this is only possible when full_page_writes = OFF,
 * since when it's ON, the first reference we see to a page should always
 * be a full-page rewrite not an incremental update.)  Rather than simply
 * ignoring such records, we make a note of the referenced page, and then
 * complain if we don't actually see a drop or truncate covering the page
 * later in replay.
 */
typedef struct xl_invalid_page_key
{
  RelFileLocator locator;   /* the relation */
  ForkNumber  forkno;     /* the fork number */
  BlockNumber blkno;      /* the page */
} xl_invalid_page_key;

typedef struct xl_invalid_page
{
  xl_invalid_page_key key;  /* hash key ... must be first */
  bool    present;    /* page existed but contained zeroes */
} xl_invalid_page;

static HTAB *invalid_page_tab = NULL;

static int  read_local_xlog_page_guts(XLogReaderState *state, XLogRecPtr targetPagePtr,
                    int reqLen, XLogRecPtr targetRecPtr,
                    char *cur_page, bool wait_for_wal);

/* Report a reference to an invalid page */
static void
report_invalid_page(int elevel, RelFileLocator locator, ForkNumber forkno,
          BlockNumber blkno, bool present)
{
  RelPathStr  path = relpathperm(locator, forkno);

  if (present)
    elog(elevel, "page %u of relation %s is uninitialized",
       blkno, path.str);
  else
    elog(elevel, "page %u of relation %s does not exist",
       blkno, path.str);
}

/* Log a reference to an invalid page */
static void
log_invalid_page(RelFileLocator locator, ForkNumber forkno, BlockNumber blkno,
         bool present)
{
  xl_invalid_page_key key;
  xl_invalid_page *hentry;
  bool    found;

  /*
   * Once recovery has reached a consistent state, the invalid-page table
   * should be empty and remain so. If a reference to an invalid page is
   * found after consistency is reached, PANIC immediately. This might seem
   * aggressive, but it's better than letting the invalid reference linger
   * in the hash table until the end of recovery and PANIC there, which
   * might come only much later if this is a standby server.
   */
  if (reachedConsistency)
  {
    report_invalid_page(WARNING, locator, forkno, blkno, present);
    elog(ignore_invalid_pages ? WARNING : PANIC,
       "WAL contains references to invalid pages");
  }

  /*
   * Log references to invalid pages at DEBUG1 level.  This allows some
   * tracing of the cause (note the elog context mechanism will tell us
   * something about the XLOG record that generated the reference).
   */
  if (message_level_is_interesting(DEBUG1))
    report_invalid_page(DEBUG1, locator, forkno, blkno, present);

  if (invalid_page_tab == NULL)
  {
    /* create hash table when first needed */
    HASHCTL   ctl;

    ctl.keysize = sizeof(xl_invalid_page_key);
    ctl.entrysize = sizeof(xl_invalid_page);

    invalid_page_tab = hash_create("XLOG invalid-page table",
                     100,
                     &ctl,
                     HASH_ELEM | HASH_BLOBS);
  }

  /* we currently assume xl_invalid_page_key contains no padding */
  key.locator = locator;
  key.forkno = forkno;
  key.blkno = blkno;
  hentry = (xl_invalid_page *)
    hash_search(invalid_page_tab, &key, HASH_ENTER, &found);

  if (!found)
  {
    /* hash_search already filled in the key */
    hentry->present = present;
  }
  else
  {
    /* repeat reference ... leave "present" as it was */
  }
}

/* Forget any invalid pages >= minblkno, because they've been dropped */
static void
forget_invalid_pages(RelFileLocator locator, ForkNumber forkno,
           BlockNumber minblkno)
{
  HASH_SEQ_STATUS status;
  xl_invalid_page *hentry;

  if (invalid_page_tab == NULL)
    return;         /* nothing to do */

  hash_seq_init(&status, invalid_page_tab);

  while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
  {
    if (RelFileLocatorEquals(hentry->key.locator, locator) &&
      hentry->key.forkno == forkno &&
      hentry->key.blkno >= minblkno)
    {
      elog(DEBUG2, "page %u of relation %s has been dropped",
         hentry->key.blkno,
         relpathperm(hentry->key.locator, forkno).str);

      if (hash_search(invalid_page_tab,
              &hentry->key,
              HASH_REMOVE, NULL) == NULL)
        elog(ERROR, "hash table corrupted");
    }
  }
}

/* Forget any invalid pages in a whole database */
static void
forget_invalid_pages_db(Oid dbid)
{
  HASH_SEQ_STATUS status;
  xl_invalid_page *hentry;

  if (invalid_page_tab == NULL)
    return;         /* nothing to do */

  hash_seq_init(&status, invalid_page_tab);

  while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
  {
    if (hentry->key.locator.dbOid == dbid)
    {
      elog(DEBUG2, "page %u of relation %s has been dropped",
         hentry->key.blkno,
         relpathperm(hentry->key.locator, hentry->key.forkno).str);

      if (hash_search(invalid_page_tab,
              &hentry->key,
              HASH_REMOVE, NULL) == NULL)
        elog(ERROR, "hash table corrupted");
    }
  }
}

/* Are there any unresolved references to invalid pages? */
bool
XLogHaveInvalidPages(void)
{
  if (invalid_page_tab != NULL &&
    hash_get_num_entries(invalid_page_tab) > 0)
    return true;
  return false;
}

/* Complain about any remaining invalid-page entries */
void
XLogCheckInvalidPages(void)
{
  HASH_SEQ_STATUS status;
  xl_invalid_page *hentry;
  bool    foundone = false;

  if (invalid_page_tab == NULL)
    return;         /* nothing to do */

  hash_seq_init(&status, invalid_page_tab);

  /*
   * Our strategy is to emit WARNING messages for all remaining entries and
   * only PANIC after we've dumped all the available info.
   */
  while ((hentry = (xl_invalid_page *) hash_seq_search(&status)) != NULL)
  {
    report_invalid_page(WARNING, hentry->key.locator, hentry->key.forkno,
              hentry->key.blkno, hentry->present);
    foundone = true;
  }

  if (foundone)
    elog(ignore_invalid_pages ? WARNING : PANIC,
       "WAL contains references to invalid pages");

  hash_destroy(invalid_page_tab);
  invalid_page_tab = NULL;
}


/*
 * XLogReadBufferForRedo
 *    Read a page during XLOG replay
 *
 * Reads a block referenced by a WAL record into shared buffer cache, and
 * determines what needs to be done to redo the changes to it.  If the WAL
 * record includes a full-page image of the page, it is restored.
 *
 * 'record.EndRecPtr' is compared to the page's LSN to determine if the record
 * has already been replayed.  'block_id' is the ID number the block was
 * registered with, when the WAL record was created.
 *
 * Returns one of the following:
 *
 *  BLK_NEEDS_REDO  - changes from the WAL record need to be applied
 *  BLK_DONE    - block doesn't need replaying
 *  BLK_RESTORED  - block was restored from a full-page image included in
 *            the record
 *  BLK_NOTFOUND  - block was not found (because it was truncated away by
 *            an operation later in the WAL stream)
 *
 * On return, the buffer is locked in exclusive-mode, and returned in *buf.
 * Note that the buffer is locked and returned even if it doesn't need
 * replaying.  (Getting the buffer lock is not really necessary during
 * single-process crash recovery, but some subroutines such as MarkBufferDirty
 * will complain if we don't have the lock.  In hot standby mode it's
 * definitely necessary.)
 *
 * Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag
 * set, we restore it, even if the page in the database appears newer.  This
 * is to protect ourselves against database pages that were partially or
 * incorrectly written during a crash.  We assume that the XLOG data must be
 * good because it has passed a CRC check, while the database page might not
 * be.  This will force us to replay all subsequent modifications of the page
 * that appear in XLOG, rather than possibly ignoring them as already
 * applied, but that's not a huge drawback.
 */
XLogRedoAction
XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id,
            Buffer *buf)
{
  return XLogReadBufferForRedoExtended(record, block_id, RBM_NORMAL,
                     false, buf);
}

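For readers unfamiliar with the calling convention described in the comment above, a typical redo routine drives this API with a switch over the returned XLogRedoAction. The following is an illustrative sketch only, not code from xlogutils.c; the function name, the record layout, and the use of block ID 0 are hypothetical.

/*
 * Illustrative sketch only -- not part of xlogutils.c.  A hypothetical redo
 * routine for a record that registered a single block as block ID 0.
 */
static void
example_redo(XLogReaderState *record)
{
  Buffer    buffer;
  Page    page;

  switch (XLogReadBufferForRedo(record, 0, &buffer))
  {
    case BLK_NEEDS_REDO:
      page = BufferGetPage(buffer);
      /* ... apply the record's changes to "page" here ... */
      PageSetLSN(page, record->EndRecPtr);
      MarkBufferDirty(buffer);
      break;
    case BLK_RESTORED:    /* full-page image was already applied */
    case BLK_DONE:      /* page LSN shows the change is already present */
    case BLK_NOTFOUND:    /* page was dropped/truncated later in the WAL */
      break;
  }

  /* The buffer comes back locked whenever it was found; release it. */
  if (BufferIsValid(buffer))
    UnlockReleaseBuffer(buffer);
}
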
/*
 * Pin and lock a buffer referenced by a WAL record, for the purpose of
 * re-initializing it.
 */
Buffer
XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id)
{
  Buffer    buf;

  XLogReadBufferForRedoExtended(record, block_id, RBM_ZERO_AND_LOCK, false,
                  &buf);
  return buf;
}

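By contrast, a redo routine that rebuilds its page from scratch would use this entry point, roughly as in the hedged sketch below (not part of this file). The zero special-space size passed to PageInit is a placeholder; the real value depends on the access method.

/*
 * Illustrative sketch only -- not part of xlogutils.c.  A hypothetical redo
 * routine for a block that was registered with REGBUF_WILL_INIT.
 */
static void
example_reinit_redo(XLogReaderState *record)
{
  Buffer    buffer = XLogInitBufferForRedo(record, 0);
  Page    page = BufferGetPage(buffer);

  PageInit(page, BufferGetPageSize(buffer), 0); /* special size is AM-specific */
  /* ... reconstruct the page contents from the WAL record ... */
  PageSetLSN(page, record->EndRecPtr);
  MarkBufferDirty(buffer);
  UnlockReleaseBuffer(buffer);
}
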
/*
 * XLogReadBufferForRedoExtended
 *    Like XLogReadBufferForRedo, but with extra options.
 *
 * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
 * with all-zeroes pages up to the referenced block number.  In
 * RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value
 * is always BLK_NEEDS_REDO.
 *
 * (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock
 * parameter. Do not use an inconsistent combination!)
 *
 * If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer
 * using LockBufferForCleanup(), instead of a regular exclusive lock.
 */
XLogRedoAction
XLogReadBufferForRedoExtended(XLogReaderState *record,
                uint8 block_id,
                ReadBufferMode mode, bool get_cleanup_lock,
                Buffer *buf)
{
  XLogRecPtr  lsn = record->EndRecPtr;
  RelFileLocator rlocator;
  ForkNumber  forknum;
  BlockNumber blkno;
  Buffer    prefetch_buffer;
  Page    page;
  bool    zeromode;
  bool    willinit;

  if (!XLogRecGetBlockTagExtended(record, block_id, &rlocator, &forknum, &blkno,
                  &prefetch_buffer))
  {
    /* Caller specified a bogus block_id */
    elog(PANIC, "failed to locate backup block with ID %d in WAL record",
       block_id);
  }

  /*
   * Make sure that if the block is marked with WILL_INIT, the caller is
   * going to initialize it. And vice versa.
   */
  zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
  willinit = (XLogRecGetBlock(record, block_id)->flags & BKPBLOCK_WILL_INIT) != 0;
  if (willinit && !zeromode)
    elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine");
  if (!willinit && zeromode)
    elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record");

  /* If it has a full-page image and it should be restored, do it. */
  if (XLogRecBlockImageApply(record, block_id))
  {
    Assert(XLogRecHasBlockImage(record, block_id));
    *buf = XLogReadBufferExtended(rlocator, forknum, blkno,
                    get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK,
                    prefetch_buffer);
    page = BufferGetPage(*buf);
    if (!RestoreBlockImage(record, block_id, page))
      ereport(ERROR,
          (errcode(ERRCODE_INTERNAL_ERROR),
           errmsg_internal("%s", record->errormsg_buf)));

    /*
     * The page may be uninitialized. If so, we can't set the LSN because
     * that would corrupt the page.
     */
    if (!PageIsNew(page))
    {
      PageSetLSN(page, lsn);
    }

    MarkBufferDirty(*buf);

    /*
     * At the end of crash recovery the init forks of unlogged relations
     * are copied, without going through shared buffers. So we need to
     * force the on-disk state of init forks to always be in sync with the
     * state in shared buffers.
     */
    if (forknum == INIT_FORKNUM)
      FlushOneBuffer(*buf);

    return BLK_RESTORED;
  }
  else
  {
    *buf = XLogReadBufferExtended(rlocator, forknum, blkno, mode, prefetch_buffer);
    if (BufferIsValid(*buf))
    {
      if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK)
      {
        if (get_cleanup_lock)
          LockBufferForCleanup(*buf);
        else
          LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
      }
      if (lsn <= PageGetLSN(BufferGetPage(*buf)))
        return BLK_DONE;
      else
        return BLK_NEEDS_REDO;
    }
    else
      return BLK_NOTFOUND;
  }
}

/*
 * XLogReadBufferExtended
 *    Read a page during XLOG replay
 *
 * This is functionally comparable to ReadBufferExtended. There are some
 * differences in the behavior wrt. the "mode" argument:
 *
 * In RBM_NORMAL mode, if the page doesn't exist, or contains all-zeroes, we
 * return InvalidBuffer. In this case the caller should silently skip the
 * update on this page. (In this situation, we expect that the page was later
 * dropped or truncated. If we don't see evidence of that later in the WAL
 * sequence, we'll complain at the end of WAL replay.)
 *
 * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
 * with all-zeroes pages up to the given block number.
 *
 * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
 * exist, and we don't check for all-zeroes.  Thus, no log entry is made
 * to imply that the page should be dropped or truncated later.
 *
 * Optionally, recent_buffer can be used to provide a hint about the location
 * of the page in the buffer pool; it does not have to be correct, but avoids
 * a buffer mapping table probe if it is.
 *
 * NB: A redo function should normally not call this directly. To get a page
 * to modify, use XLogReadBufferForRedoExtended instead. It is important that
 * all pages modified by a WAL record are registered in the WAL records, or
 * they will be invisible to tools that need to know which pages are modified.
 */
Buffer
XLogReadBufferExtended(RelFileLocator rlocator, ForkNumber forknum,
             BlockNumber blkno, ReadBufferMode mode,
             Buffer recent_buffer)
{
  BlockNumber lastblock;
  Buffer    buffer;
  SMgrRelation smgr;

  Assert(blkno != P_NEW);

  /* Do we have a clue where the buffer might be already? */
  if (BufferIsValid(recent_buffer) &&
    mode == RBM_NORMAL &&
    ReadRecentBuffer(rlocator, forknum, blkno, recent_buffer))
  {
    buffer = recent_buffer;
    goto recent_buffer_fast_path;
  }

  /* Open the relation at smgr level */
  smgr = smgropen(rlocator, INVALID_PROC_NUMBER);

  /*
   * Create the target file if it doesn't already exist.  This lets us cope
   * if the replay sequence contains writes to a relation that is later
   * deleted.  (The original coding of this routine would instead suppress
   * the writes, but that seems like it risks losing valuable data if the
   * filesystem loses an inode during a crash.  Better to write the data
   * until we are actually told to delete the file.)
   */
  smgrcreate(smgr, forknum, true);

  lastblock = smgrnblocks(smgr, forknum);

  if (blkno < lastblock)
  {
    /* page exists in file */
    buffer = ReadBufferWithoutRelcache(rlocator, forknum, blkno,
                       mode, NULL, true);
  }
  else
  {
    /* hm, page doesn't exist in file */
    if (mode == RBM_NORMAL)
    {
      log_invalid_page(rlocator, forknum, blkno, false);
      return InvalidBuffer;
    }
    if (mode == RBM_NORMAL_NO_LOG)
      return InvalidBuffer;
    /* OK to extend the file */
    /* we do this in recovery only - no rel-extension lock needed */
    Assert(InRecovery);
    buffer = ExtendBufferedRelTo(BMR_SMGR(smgr, RELPERSISTENCE_PERMANENT),
                   forknum,
                   NULL,
                   EB_PERFORMING_RECOVERY |
                   EB_SKIP_EXTENSION_LOCK,
                   blkno + 1,
                   mode);
  }

recent_buffer_fast_path:
  if (mode == RBM_NORMAL)
  {
    /* check that page has been initialized */
    Page    page = (Page) BufferGetPage(buffer);

    /*
     * We assume that PageIsNew is safe without a lock. During recovery,
     * there should be no other backends that could modify the buffer at
     * the same time.
     */
    if (PageIsNew(page))
    {
      ReleaseBuffer(buffer);
      log_invalid_page(rlocator, forknum, blkno, true);
      return InvalidBuffer;
    }
  }

  return buffer;
}

/*
 * Struct actually returned by CreateFakeRelcacheEntry, though the declared
 * return type is Relation.
 */
typedef struct
{
  RelationData reldata;   /* Note: this must be first */
  FormData_pg_class pgc;
} FakeRelCacheEntryData;

typedef FakeRelCacheEntryData *FakeRelCacheEntry;

/*
 * Create a fake relation cache entry for a physical relation
 *
 * It's often convenient to use the same functions in XLOG replay as in the
 * main codepath, but those functions typically work with a relcache entry.
 * We don't have a working relation cache during XLOG replay, but this
 * function can be used to create a fake relcache entry instead. Only the
 * fields related to physical storage, like rd_rel, are initialized, so the
 * fake entry is only usable in low-level operations like ReadBuffer().
 *
 * This is also used for syncing WAL-skipped files.
 *
 * Caller must free the returned entry with FreeFakeRelcacheEntry().
 */
Relation
CreateFakeRelcacheEntry(RelFileLocator rlocator)
{
  FakeRelCacheEntry fakeentry;
  Relation  rel;

  /* Allocate the Relation struct and all related space in one block. */
  fakeentry = palloc0(sizeof(FakeRelCacheEntryData));
  rel = (Relation) fakeentry;

  rel->rd_rel = &fakeentry->pgc;
  rel->rd_locator = rlocator;

  /*
   * We will never be working with temp rels during recovery or while
   * syncing WAL-skipped files.
   */
  rel->rd_backend = INVALID_PROC_NUMBER;

  /* It must be a permanent table here */
  rel->rd_rel->relpersistence = RELPERSISTENCE_PERMANENT;

  /* We don't know the name of the relation; use relfilenumber instead */
  sprintf(RelationGetRelationName(rel), "%u", rlocator.relNumber);

  /*
   * We set up the lockRelId in case anything tries to lock the dummy
   * relation.  Note that this is fairly bogus since relNumber may be
   * different from the relation's OID.  It shouldn't really matter though.
   * In recovery, we are running by ourselves and can't have any lock
   * conflicts.  While syncing, we already hold AccessExclusiveLock.
   */
  rel->rd_lockInfo.lockRelId.dbId = rlocator.dbOid;
  rel->rd_lockInfo.lockRelId.relId = rlocator.relNumber;

  /*
   * Set up a non-pinned SMgrRelation reference, so that we don't need to
   * worry about unpinning it on error.
   */
  rel->rd_smgr = smgropen(rlocator, INVALID_PROC_NUMBER);

  return rel;
}

/*
 * Free a fake relation cache entry.
 */
void
FreeFakeRelcacheEntry(Relation fakerel)
{
  pfree(fakerel);
}

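A typical caller brackets its buffer work with the create/free pair, along the lines of this hedged sketch (not code from this file; the helper name is hypothetical, and rlocator and blkno are assumed to come from the WAL record being replayed):

/*
 * Illustrative sketch only -- not part of xlogutils.c.  A hypothetical
 * replay helper operating on a block through a fake relcache entry.
 */
static void
example_touch_block(RelFileLocator rlocator, BlockNumber blkno)
{
  Relation  reln = CreateFakeRelcacheEntry(rlocator);
  Buffer    buf = ReadBufferExtended(reln, MAIN_FORKNUM, blkno,
                     RBM_NORMAL, NULL);

  /* ... low-level work against the pinned buffer ... */
  if (BufferIsValid(buf))
    ReleaseBuffer(buf);
  FreeFakeRelcacheEntry(reln);
}
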
/*
 * Drop a relation during XLOG replay
 *
 * This is called when the relation is about to be deleted; we need to remove
 * any open "invalid-page" records for the relation.
 */
void
XLogDropRelation(RelFileLocator rlocator, ForkNumber forknum)
{
  forget_invalid_pages(rlocator, forknum, 0);
}

/*
 * Drop a whole database during XLOG replay
 *
 * As above, but for DROP DATABASE instead of dropping a single rel
 */
void
XLogDropDatabase(Oid dbid)
{
  /*
   * This is unnecessarily heavy-handed, as it will close SMgrRelation
   * objects for other databases as well. DROP DATABASE occurs seldom enough
   * that it's not worth introducing a variant of smgrdestroy for just this
   * purpose.
   */
  smgrdestroyall();

  forget_invalid_pages_db(dbid);
}

/*
 * Truncate a relation during XLOG replay
 *
 * We need to clean up any open "invalid-page" records for the dropped pages.
 */
void
XLogTruncateRelation(RelFileLocator rlocator, ForkNumber forkNum,
           BlockNumber nblocks)
{
  forget_invalid_pages(rlocator, forkNum, nblocks);
}

/*
 * Determine which timeline to read an xlog page from and set the
 * XLogReaderState's currTLI to that timeline ID.
 *
 * We care about timelines in xlogreader when we might be reading xlog
 * generated prior to a promotion, either if we're currently a standby in
 * recovery or if we're a promoted primary reading xlogs generated by the old
 * primary before our promotion.
 *
 * wantPage must be set to the start address of the page to read and
 * wantLength to the amount of the page that will be read, up to
 * XLOG_BLCKSZ. If the amount to be read isn't known, pass XLOG_BLCKSZ.
 *
 * The currTLI argument should be the system-wide current timeline.
 * Note that this may be different from state->currTLI, which is the timeline
 * from which the caller is currently reading previous xlog records.
 *
 * We switch to an xlog segment from the new timeline eagerly when on a
 * historical timeline, as soon as we reach the start of the xlog segment
 * containing the timeline switch.  The server copied the segment to the new
 * timeline so all the data up to the switch point is the same, but there's no
 * guarantee the old segment will still exist. It may have been deleted or
 * renamed with a .partial suffix so we can't necessarily keep reading from
 * the old TLI even though tliSwitchPoint says it's OK.
 *
 * We can't just check the timeline when we read a page on a different segment
 * to the last page. We could've received a timeline switch from a cascading
 * upstream, so the current segment ends abruptly (possibly getting renamed to
 * .partial) and we have to switch to a new one.  Even in the middle of reading
 * a page we could have to dump the cached page and switch to a new TLI.
 *
 * Because of this, callers MAY NOT assume that currTLI is the timeline that
 * will be in a page's xlp_tli; the page may begin on an older timeline or we
 * might be reading from historical timeline data on a segment that's been
 * copied to a new timeline.
 *
 * The caller must also make sure it doesn't read past the current replay
 * position (using GetXLogReplayRecPtr) if executing in recovery, so it
 * doesn't fail to notice that the current timeline became historical.
 */
void
XLogReadDetermineTimeline(XLogReaderState *state, XLogRecPtr wantPage,
              uint32 wantLength, TimeLineID currTLI)
{
  const XLogRecPtr lastReadPage = (state->seg.ws_segno *
                   state->segcxt.ws_segsize + state->segoff);

  Assert(wantPage != InvalidXLogRecPtr && wantPage % XLOG_BLCKSZ == 0);
  Assert(wantLength <= XLOG_BLCKSZ);
  Assert(state->readLen == 0 || state->readLen <= XLOG_BLCKSZ);
  Assert(currTLI != 0);

  /*
   * If the desired page is currently read in and valid, we have nothing to
   * do.
   *
   * The caller should've ensured that it didn't previously advance readOff
   * past the valid limit of this timeline, so it doesn't matter if the
   * current TLI has since become historical.
   */
  if (lastReadPage == wantPage &&
    state->readLen != 0 &&
    lastReadPage + state->readLen >= wantPage + Min(wantLength, XLOG_BLCKSZ - 1))
    return;

  /*
   * If we're reading from the current timeline, it hasn't become historical
   * and the page we're reading is after the last page read, we can again
   * just carry on. (Seeking backwards requires a check to make sure the
   * older page isn't on a prior timeline).
   *
   * currTLI might've become historical since the caller obtained the value,
   * but the caller is required not to read past the flush limit it saw at
   * the time it looked up the timeline. There's nothing we can do about it
   * if StartupXLOG() renames it to .partial concurrently.
   */
  if (state->currTLI == currTLI && wantPage >= lastReadPage)
  {
    Assert(state->currTLIValidUntil == InvalidXLogRecPtr);
    return;
  }

  /*
   * If we're just reading pages from a previously validated historical
   * timeline and the timeline we're reading from is valid until the end of
   * the current segment we can just keep reading.
   */
  if (state->currTLIValidUntil != InvalidXLogRecPtr &&
    state->currTLI != currTLI &&
    state->currTLI != 0 &&
    ((wantPage + wantLength) / state->segcxt.ws_segsize) <
    (state->currTLIValidUntil / state->segcxt.ws_segsize))
    return;

  /*
   * If we reach this point we're either looking up a page for random
   * access, the current timeline just became historical, or we're reading
   * from a new segment containing a timeline switch. In all cases we need
   * to determine the newest timeline on the segment.
   *
   * If it's the current timeline we can just keep reading from here unless
   * we detect a timeline switch that makes the current timeline historical.
   * If it's a historical timeline we can read all the segment on the newest
   * timeline because it contains all the old timelines' data too. So only
   * one switch check is required.
   */
  {
    /*
     * We need to re-read the timeline history in case it's been changed
     * by a promotion or replay from a cascaded replica.
     */
    List     *timelineHistory = readTimeLineHistory(currTLI);
    XLogRecPtr  endOfSegment;

    endOfSegment = ((wantPage / state->segcxt.ws_segsize) + 1) *
      state->segcxt.ws_segsize - 1;
    Assert(wantPage / state->segcxt.ws_segsize ==
         endOfSegment / state->segcxt.ws_segsize);

    /*
     * Find the timeline of the last LSN on the segment containing
     * wantPage.
     */
    state->currTLI = tliOfPointInHistory(endOfSegment, timelineHistory);
    state->currTLIValidUntil = tliSwitchPoint(state->currTLI, timelineHistory,
                          &state->nextTLI);

    Assert(state->currTLIValidUntil == InvalidXLogRecPtr ||
         wantPage + wantLength < state->currTLIValidUntil);

    list_free_deep(timelineHistory);

    elog(DEBUG3, "switched to timeline %u valid until %X/%X",
       state->currTLI,
       LSN_FORMAT_ARGS(state->currTLIValidUntil));
  }
}

/* XLogReaderRoutine->segment_open callback for local pg_wal files */
void
wal_segment_open(XLogReaderState *state, XLogSegNo nextSegNo,
         TimeLineID *tli_p)
{
  TimeLineID  tli = *tli_p;
  char    path[MAXPGPATH];

  XLogFilePath(path, tli, nextSegNo, state->segcxt.ws_segsize);
  state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
  if (state->seg.ws_file >= 0)
    return;

  if (errno == ENOENT)
    ereport(ERROR,
        (errcode_for_file_access(),
         errmsg("requested WAL segment %s has already been removed",
            path)));
  else
    ereport(ERROR,
        (errcode_for_file_access(),
         errmsg("could not open file \"%s\": %m",
            path)));
}

/* stock XLogReaderRoutine->segment_close callback */
void
wal_segment_close(XLogReaderState *state)
{
  close(state->seg.ws_file);
  /* need to check errno? */
  state->seg.ws_file = -1;
}

/*
 * XLogReaderRoutine->page_read callback for reading local xlog files
 *
 * Public because it would likely be very helpful for someone writing another
 * output method outside walsender, e.g. in a bgworker.
 */
int
read_local_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr,
           int reqLen, XLogRecPtr targetRecPtr, char *cur_page)
{
  return read_local_xlog_page_guts(state, targetPagePtr, reqLen,
                   targetRecPtr, cur_page, true);
}

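These three callbacks are designed to be plugged into an XLogReaderState together, which is how logical decoding and similar local-WAL readers use them. A hedged sketch of the usual wiring follows (not code from this file; the helper name is hypothetical and private_data is assumed unused here):

/*
 * Illustrative sketch only -- not part of xlogutils.c.  Allocates an
 * xlogreader wired to the local-pg_wal callbacks defined in this file.
 */
static XLogReaderState *
example_create_local_reader(void)
{
  XLogReaderState *reader;

  reader = XLogReaderAllocate(wal_segment_size, NULL,
                XL_ROUTINE(.page_read = &read_local_xlog_page,
                       .segment_open = &wal_segment_open,
                       .segment_close = &wal_segment_close),
                NULL);
  if (reader == NULL)
    ereport(ERROR,
        (errcode(ERRCODE_OUT_OF_MEMORY),
         errmsg("out of memory")));
  return reader;
}
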
/*
 * Same as read_local_xlog_page except that it doesn't wait for future WAL
 * to be available.
 */
int
read_local_xlog_page_no_wait(XLogReaderState *state, XLogRecPtr targetPagePtr,
               int reqLen, XLogRecPtr targetRecPtr,
               char *cur_page)
{
  return read_local_xlog_page_guts(state, targetPagePtr, reqLen,
                   targetRecPtr, cur_page, false);
}

/*
 * Implementation of read_local_xlog_page and its no wait version.
 */
static int
read_local_xlog_page_guts(XLogReaderState *state, XLogRecPtr targetPagePtr,
              int reqLen, XLogRecPtr targetRecPtr,
              char *cur_page, bool wait_for_wal)
{
  XLogRecPtr  read_upto,
        loc;
  TimeLineID  tli;
  int     count;
  WALReadError errinfo;
  TimeLineID  currTLI;

  loc = targetPagePtr + reqLen;

  /*
   * Loop waiting for xlog to be available if necessary
   *
   * TODO: The walsender has its own version of this function, which uses a
   * condition variable to wake up whenever WAL is flushed. We could use the
   * same infrastructure here, instead of the check/sleep/repeat style of
   * loop.
   */
  while (1)
  {
    /*
     * Determine the limit of xlog we can currently read to, and what the
     * most recent timeline is.
     */
    if (!RecoveryInProgress())
      read_upto = GetFlushRecPtr(&currTLI);
    else
      read_upto = GetXLogReplayRecPtr(&currTLI);
    tli = currTLI;

    /*
     * Check which timeline to get the record from.
     *
     * We have to do it each time through the loop because if we're in
     * recovery as a cascading standby, the current timeline might've
     * become historical. We can't rely on RecoveryInProgress() because in
     * a standby configuration like
     *
     * A => B => C
     *
     * if we're a logical decoding session on C, and B gets promoted, our
     * timeline will change while we remain in recovery.
     *
     * We can't just keep reading from the old timeline as the last WAL
     * archive in the timeline will get renamed to .partial by
     * StartupXLOG().
     *
     * If that happens after our caller determined the TLI but before we
     * actually read the xlog page, we might still try to read from the
     * old (now renamed) segment and fail. There's not much we can do
     * about this, but it can only happen when we're a leaf of a cascading
     * standby whose primary gets promoted while we're decoding, so a
     * one-off ERROR isn't too bad.
     */
    XLogReadDetermineTimeline(state, targetPagePtr, reqLen, tli);

    if (state->currTLI == currTLI)
    {

      if (loc <= read_upto)
        break;

      /* If asked, let's not wait for future WAL. */
      if (!wait_for_wal)
      {
        ReadLocalXLogPageNoWaitPrivate *private_data;

        /*
         * Inform the caller of read_local_xlog_page_no_wait that the
         * end of WAL has been reached.
         */
        private_data = (ReadLocalXLogPageNoWaitPrivate *)
          state->private_data;
        private_data->end_of_wal = true;
        break;
      }

      CHECK_FOR_INTERRUPTS();
      pg_usleep(1000L);
    }
    else
    {
      /*
       * We're on a historical timeline, so limit reading to the switch
       * point where we moved to the next timeline.
       *
       * We don't need to GetFlushRecPtr or GetXLogReplayRecPtr. We know
       * about the new timeline, so we must've received WAL past the end
       * of it.
       */
      read_upto = state->currTLIValidUntil;

      /*
       * Setting tli to our wanted record's TLI is slightly wrong; the
       * page might begin on an older timeline if it contains a timeline
       * switch, since its xlog segment will have been copied from the
       * prior timeline. This is pretty harmless though, as nothing
       * cares so long as the timeline doesn't go backwards.  We should
       * read the page header instead; FIXME someday.
       */
      tli = state->currTLI;

      /* No need to wait on a historical timeline */
      break;
    }
  }

  if (targetPagePtr + XLOG_BLCKSZ <= read_upto)
  {
    /*
     * more than one block available; read only that block, have caller
     * come back if they need more.
     */
    count = XLOG_BLCKSZ;
  }
  else if (targetPagePtr + reqLen > read_upto)
  {
    /* not enough data there */
    return -1;
  }
  else
  {
    /* enough bytes available to satisfy the request */
    count = read_upto - targetPagePtr;
  }

  if (!WALRead(state, cur_page, targetPagePtr, count, tli,
         &errinfo))
    WALReadRaiseError(&errinfo);

  /* number of valid bytes in the buffer */
  return count;
}

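Once a reader has been set up with these callbacks, records are consumed through the generic xlogreader API. A hedged sketch (not code from this file; start_lsn is assumed to point at a valid record start position):

/*
 * Illustrative sketch only -- not part of xlogutils.c.  Reads one record
 * via the page_read callback installed on the reader.
 */
static XLogRecord *
example_read_one_record(XLogReaderState *reader, XLogRecPtr start_lsn)
{
  XLogRecord *record;
  char     *errormsg;

  XLogBeginRead(reader, start_lsn);
  record = XLogReadRecord(reader, &errormsg);
  if (record == NULL && errormsg != NULL)
    ereport(ERROR,
        (errmsg_internal("%s", errormsg)));
  return record;
}
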
/*
 * Backend-specific convenience code to handle read errors encountered by
 * WALRead().
 */
void
WALReadRaiseError(WALReadError *errinfo)
{
  WALOpenSegment *seg = &errinfo->wre_seg;
  char    fname[MAXFNAMELEN];

  XLogFileName(fname, seg->ws_tli, seg->ws_segno, wal_segment_size);

  if (errinfo->wre_read < 0)
  {
    errno = errinfo->wre_errno;
    ereport(ERROR,
        (errcode_for_file_access(),
         errmsg("could not read from WAL segment %s, offset %d: %m",
            fname, errinfo->wre_off)));
  }
  else if (errinfo->wre_read == 0)
  {
    ereport(ERROR,
        (errcode(ERRCODE_DATA_CORRUPTED),
         errmsg("could not read from WAL segment %s, offset %d: read %d of %d",
            fname, errinfo->wre_off, errinfo->wre_read,
            errinfo->wre_req)));
  }
}