Coverage Report

Created: 2025-08-12 06:43

/src/postgres/src/backend/access/transam/xlogrecovery.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * xlogrecovery.c
4
 *    Functions for WAL recovery, standby mode
5
 *
6
 * This source file contains functions controlling WAL recovery.
7
 * InitWalRecovery() initializes the system for crash or archive recovery,
8
 * or standby mode, depending on configuration options and the state of
9
 * the control file and possible backup label file.  PerformWalRecovery()
10
 * performs the actual WAL replay, calling the rmgr-specific redo routines.
11
 * FinishWalRecovery() performs end-of-recovery checks and cleanup actions,
12
 * and prepares information needed to initialize the WAL for writes.  In
13
 * addition to these three main functions, there are a bunch of functions
14
 * for interrogating recovery state and controlling the recovery process.
15
 *
16
 *
17
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
18
 * Portions Copyright (c) 1994, Regents of the University of California
19
 *
20
 * src/backend/access/transam/xlogrecovery.c
21
 *
22
 *-------------------------------------------------------------------------
23
 */
24
25
#include "postgres.h"
26
27
#include <ctype.h>
28
#include <math.h>
29
#include <time.h>
30
#include <sys/stat.h>
31
#include <sys/time.h>
32
#include <unistd.h>
33
34
#include "access/timeline.h"
35
#include "access/transam.h"
36
#include "access/xact.h"
37
#include "access/xlog_internal.h"
38
#include "access/xlogarchive.h"
39
#include "access/xlogprefetcher.h"
40
#include "access/xlogreader.h"
41
#include "access/xlogrecovery.h"
42
#include "access/xlogutils.h"
43
#include "backup/basebackup.h"
44
#include "catalog/pg_control.h"
45
#include "commands/tablespace.h"
46
#include "common/file_utils.h"
47
#include "miscadmin.h"
48
#include "pgstat.h"
49
#include "postmaster/bgwriter.h"
50
#include "postmaster/startup.h"
51
#include "replication/slot.h"
52
#include "replication/slotsync.h"
53
#include "replication/walreceiver.h"
54
#include "storage/fd.h"
55
#include "storage/ipc.h"
56
#include "storage/latch.h"
57
#include "storage/pmsignal.h"
58
#include "storage/procarray.h"
59
#include "storage/spin.h"
60
#include "utils/datetime.h"
61
#include "utils/fmgrprotos.h"
62
#include "utils/guc_hooks.h"
63
#include "utils/pgstat_internal.h"
64
#include "utils/pg_lsn.h"
65
#include "utils/ps_status.h"
66
#include "utils/pg_rusage.h"
67
68
/* Unsupported old recovery command file names (relative to $PGDATA) */
69
0
#define RECOVERY_COMMAND_FILE "recovery.conf"
70
0
#define RECOVERY_COMMAND_DONE "recovery.done"
71
72
/*
73
 * GUC support
74
 */
75
/*
 * Table mapping the textual option names "pause", "promote" and "shutdown"
 * to the corresponding RECOVERY_TARGET_ACTION_* values.  The final entry
 * with a NULL name marks the end of the table.
 *
 * NOTE(review): the third field of each entry is false everywhere; its
 * meaning is defined by struct config_enum_entry, which is not visible
 * here -- confirm before changing it.
 */
const struct config_enum_entry recovery_target_action_options[] = {
76
  {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
77
  {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
78
  {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
79
  {NULL, 0, false}		/* end-of-table sentinel */
80
};
81
82
/* options formerly taken from recovery.conf for archive recovery */
83
char     *recoveryRestoreCommand = NULL;
84
char     *recoveryEndCommand = NULL;
85
char     *archiveCleanupCommand = NULL;
86
RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
87
bool    recoveryTargetInclusive = true;
88
int     recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
89
TransactionId recoveryTargetXid;
90
char     *recovery_target_time_string;
91
TimestampTz recoveryTargetTime;
92
const char *recoveryTargetName;
93
XLogRecPtr  recoveryTargetLSN;
94
int     recovery_min_apply_delay = 0;
95
96
/* options formerly taken from recovery.conf for XLOG streaming */
97
char     *PrimaryConnInfo = NULL;
98
char     *PrimarySlotName = NULL;
99
bool    wal_receiver_create_temp_slot = false;
100
101
/*
102
 * recoveryTargetTimeLineGoal: what the user requested, if any
103
 *
104
 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
105
 *
106
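 * recoveryTargetTLI: the currently understood target timeline; changes
 * during recovery when a newer target timeline is adopted (presumably via
 * rescanLatestTimeLine() -- TODO confirm)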
107
 *
108
 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
109
 * the timelines of its known parents, newest first (so recoveryTargetTLI is
110
 * always the first list member).  Only these TLIs are expected to be seen in
111
 * the WAL segments we read, and indeed only these TLIs will be considered as
112
 * candidate WAL files to open at all.
113
 *
114
 * curFileTLI: the TLI appearing in the name of the current input WAL file.
115
 * (This is not necessarily the same as the timeline from which we are
116
 * replaying WAL, which StartupXLOG calls replayTLI, because we could be
117
 * scanning data that was copied from an ancestor timeline when the current
118
 * file was created.)  During a sequential scan we do not allow this value
119
 * to decrease.
120
 */
121
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
122
TimeLineID  recoveryTargetTLIRequested = 0;
123
TimeLineID  recoveryTargetTLI = 0;
124
static List *expectedTLEs;
125
static TimeLineID curFileTLI;
126
127
/*
128
 * When ArchiveRecoveryRequested is set, archive recovery was requested,
129
 * ie. signal files were present.  When InArchiveRecovery is set, we are
130
 * currently recovering using offline XLOG archives.  These variables are only
131
 * valid in the startup process.
132
 *
133
 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
134
 * currently performing crash recovery using only XLOG files in pg_wal, but
135
 * will switch to using offline XLOG archives as soon as we reach the end of
136
 * WAL in pg_wal.
137
 */
138
bool    ArchiveRecoveryRequested = false;
139
bool    InArchiveRecovery = false;
140
141
/*
142
 * When StandbyModeRequested is set, standby mode was requested, i.e.
143
 * standby.signal file was present.  When StandbyMode is set, we are currently
144
 * in standby mode.  These variables are only valid in the startup process.
145
 * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery.
146
 */
147
static bool StandbyModeRequested = false;
148
bool    StandbyMode = false;
149
150
/* was a signal file present at startup? */
151
static bool standby_signal_file_found = false;
152
static bool recovery_signal_file_found = false;
153
154
/*
155
 * CheckPointLoc is the position of the checkpoint record that determines
156
 * where to start the replay.  It comes from the backup label file or the
157
 * control file.
158
 *
159
 * RedoStartLSN is the checkpoint's REDO location, also from the backup label
160
 * file or the control file.  In standby mode, XLOG streaming usually starts
161
 * from the position where an invalid record was found.  But if we fail to
162
 * read even the initial checkpoint record, we use the REDO location instead
163
 * of the checkpoint location as the start position of XLOG streaming.
164
 * Otherwise we would have to jump backwards to the REDO location after
165
 * reading the checkpoint record, because the REDO record can precede the
166
 * checkpoint record.
167
 */
168
static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
169
static TimeLineID CheckPointTLI = 0;
170
static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
171
static TimeLineID RedoStartTLI = 0;
172
173
/*
174
 * Local copy of SharedHotStandbyActive variable. False actually means "not
175
 * known, need to check the shared state".
176
 */
177
static bool LocalHotStandbyActive = false;
178
179
/*
180
 * Local copy of SharedPromoteIsTriggered variable. False actually means "not
181
 * known, need to check the shared state".
182
 */
183
static bool LocalPromoteIsTriggered = false;
184
185
/* Has the recovery code requested a walreceiver wakeup? */
186
static bool doRequestWalReceiverReply;
187
188
/* XLogReader object used to parse the WAL records */
189
static XLogReaderState *xlogreader = NULL;
190
191
/* XLogPrefetcher object used to consume WAL records with read-ahead */
192
static XLogPrefetcher *xlogprefetcher = NULL;
193
194
/* Parameters passed down from ReadRecord to the XLogPageRead callback. */
195
typedef struct XLogPageReadPrivate
196
{
197
  int     emode;          /* presumably the error level given to ReadRecord
                           * as its emode argument -- TODO confirm */
198
  bool    fetching_ckpt;  /* are we fetching a checkpoint record? */
199
  bool    randAccess;     /* NOTE(review): same name as the randAccess
                           * parameter of WaitForWALToBecomeAvailable;
                           * confirm it is forwarded there */
200
  TimeLineID  replayTLI;  /* presumably the replayTLI given to ReadRecord
                           * -- TODO confirm */
201
} XLogPageReadPrivate;
202
203
/* flag to tell XLogPageRead that we have started replaying */
204
static bool InRedo = false;
205
206
/*
207
 * Codes indicating where we got a WAL file from during recovery, or where
208
 * to attempt to get one.
209
 */
210
typedef enum
{
  XLOG_FROM_ANY = 0,      /* request to read WAL from any source */
  XLOG_FROM_ARCHIVE,      /* restored using restore_command */
  XLOG_FROM_PG_WAL,       /* existing file in pg_wal */
  XLOG_FROM_STREAM,       /* streamed from primary */
} XLogSource;

/*
 * Human-readable names for XLogSources, for debugging output.
 *
 * Each name is attached to its enum constant with a designated initializer,
 * so the table stays correct even if the members of XLogSource are ever
 * reordered or a new one is inserted in the middle.
 */
static const char *const xlogSourceNames[] = {
  [XLOG_FROM_ANY] = "any",
  [XLOG_FROM_ARCHIVE] = "archive",
  [XLOG_FROM_PG_WAL] = "pg_wal",
  [XLOG_FROM_STREAM] = "stream",
};
220
221
/*
222
 * readFile is -1 or a kernel FD for the log file segment that's currently
223
 * open for reading.  readSegNo identifies the segment.  readOff is the offset
224
 * of the page just read, readLen indicates how much of it has been read into
225
 * readBuf, and readSource indicates where we got the currently open file from.
226
 *
227
 * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
228
 * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
229
 * worthwhile, since the XLOG is not read by general-purpose sessions.
230
 */
231
static int  readFile = -1;
232
static XLogSegNo readSegNo = 0;
233
static uint32 readOff = 0;
234
static uint32 readLen = 0;
235
static XLogSource readSource = XLOG_FROM_ANY;
236
237
/*
238
 * Keeps track of which source we're currently reading from. This is
239
 * different from readSource in that this is always set, even when we don't
240
 * currently have a WAL file open. If lastSourceFailed is set, our last
241
 * attempt to read from currentSource failed, and we should try another source
242
 * next.
243
 *
244
 * pendingWalRcvRestart is set when a config change occurs that requires a
245
 * walreceiver restart.  This is only valid in XLOG_FROM_STREAM state.
246
 */
247
static XLogSource currentSource = XLOG_FROM_ANY;
248
static bool lastSourceFailed = false;
249
static bool pendingWalRcvRestart = false;
250
251
/*
252
 * These variables track when we last obtained some WAL data to process,
253
 * and where we got it from.  (XLogReceiptSource is initially the same as
254
 * readSource, but readSource gets reset to zero when we don't have data
255
 * to process right now.  It is also different from currentSource, which
256
 * also changes when we try to read from a source and fail, while
257
 * XLogReceiptSource tracks where we last successfully read some WAL.)
258
 */
259
static TimestampTz XLogReceiptTime = 0;
260
static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
261
262
/* Local copy of WalRcv->flushedUpto */
263
static XLogRecPtr flushedUpto = 0;
264
static TimeLineID receiveTLI = 0;
265
266
/*
267
 * Copy of minRecoveryPoint and backupEndPoint from the control file.
268
 *
269
 * In order to reach consistency, we must replay the WAL up to
270
 * minRecoveryPoint.  If backupEndRequired is true, we must also reach
271
 * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
272
 * to backupStartPoint.
273
 *
274
 * Note: In archive recovery, after consistency has been reached, the
275
 * functions in xlog.c will start updating minRecoveryPoint in the control
276
 * file.  But this copy of minRecoveryPoint variable reflects the value at the
277
 * beginning of recovery, and is *not* updated after consistency is reached.
278
 */
279
static XLogRecPtr minRecoveryPoint;
280
static TimeLineID minRecoveryPointTLI;
281
282
static XLogRecPtr backupStartPoint;
283
static XLogRecPtr backupEndPoint;
284
static bool backupEndRequired = false;
285
286
/*
287
 * Have we reached a consistent database state?  In crash recovery, we have
288
 * to replay all the WAL, so reachedConsistency is never set.  During archive
289
 * recovery, the database is consistent once minRecoveryPoint is reached.
290
 *
291
 * Consistent state means that the system is internally consistent, all
292
 * the WAL has been replayed up to a certain point, and importantly, there
293
 * is no trace of later actions on disk.
294
 *
295
 * This flag is used only by the startup process and postmaster. When
296
 * minRecoveryPoint is reached, the startup process sets it to true and
297
 * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster,
298
 * which then sets it to true upon receiving the signal.
299
 */
300
bool    reachedConsistency = false;
301
302
/* Buffers dedicated to consistency checks of size BLCKSZ */
303
static char *replay_image_masked = NULL;
304
static char *primary_image_masked = NULL;
305
306
307
/*
308
 * Shared-memory state for WAL recovery.
309
 */
310
typedef struct XLogRecoveryCtlData
311
{
312
  /*
313
   * SharedHotStandbyActive indicates if we allow hot standby queries to be
314
   * run.  Protected by info_lck.
315
   */
316
  bool    SharedHotStandbyActive;
317
318
  /*
319
   * SharedPromoteIsTriggered indicates if a standby promotion has been
320
   * triggered.  Protected by info_lck.
321
   */
322
  bool    SharedPromoteIsTriggered;
323
324
  /*
325
   * recoveryWakeupLatch is used to wake up the startup process to continue
326
   * WAL replay, if it is waiting for WAL to arrive or promotion to be
327
   * requested.
328
   *
329
   * Note that the startup process also uses another latch, its procLatch,
330
   * to wait for recovery conflict. We could get rid of recoveryWakeupLatch
331
   * for signaling the startup process in favor of using its procLatch,
332
   * which comports better with possible generic signal handlers using that
333
   * latch.  But we should not do that because the startup process doesn't
334
   * assume that it's woken up by the walreceiver process or SIGHUP signal
335
   * handler while it's waiting for recovery conflict. The separate latches,
336
   * recoveryWakeupLatch and procLatch, should be used for inter-process
337
   * communication for WAL replay and recovery conflict, respectively.
338
   */
339
  Latch   recoveryWakeupLatch;
340
341
  /*
342
   * Last record successfully replayed.
343
   */
344
  XLogRecPtr  lastReplayedReadRecPtr; /* start position */
345
  XLogRecPtr  lastReplayedEndRecPtr;  /* end+1 position */
346
  TimeLineID  lastReplayedTLI;  /* timeline */
347
348
  /*
349
   * When we're currently replaying a record, ie. in a redo function,
350
   * replayEndRecPtr points to the end+1 of the record being replayed,
351
   * otherwise it's equal to lastReplayedEndRecPtr.
352
   */
353
  XLogRecPtr  replayEndRecPtr;
354
  TimeLineID  replayEndTLI;
355
  /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
356
  TimestampTz recoveryLastXTime;
357
358
  /*
359
   * timestamp of when we started replaying the current chunk of WAL data,
360
   * only relevant for replication or archive recovery
361
   */
362
  TimestampTz currentChunkStartTime;
363
  /* Recovery pause state */
364
  RecoveryPauseState recoveryPauseState;
365
  ConditionVariable recoveryNotPausedCV;
366
367
  slock_t   info_lck;   /* locks shared variables shown above */
368
} XLogRecoveryCtlData;
369
370
static XLogRecoveryCtlData *XLogRecoveryCtl = NULL;
371
372
/*
373
 * abortedRecPtr is the start pointer of a broken record at end of WAL when
374
 * recovery completes; missingContrecPtr is the location of the first
375
 * contrecord that went missing.  See CreateOverwriteContrecordRecord for
376
 * details.
377
 */
378
static XLogRecPtr abortedRecPtr;
379
static XLogRecPtr missingContrecPtr;
380
381
/*
382
 * if recoveryStopsBefore/After returns true, it saves information of the stop
383
 * point here
384
 */
385
static TransactionId recoveryStopXid;
386
static TimestampTz recoveryStopTime;
387
static XLogRecPtr recoveryStopLSN;
388
static char recoveryStopName[MAXFNAMELEN];
389
static bool recoveryStopAfter;
390
391
/* prototypes for local functions */
392
static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI);
393
394
static void EnableStandbyMode(void);
395
static void readRecoverySignalFile(void);
396
static void validateRecoveryParameters(void);
397
static bool read_backup_label(XLogRecPtr *checkPointLoc,
398
                TimeLineID *backupLabelTLI,
399
                bool *backupEndRequired, bool *backupFromStandby);
400
static bool read_tablespace_map(List **tablespaces);
401
402
static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI);
403
static void CheckRecoveryConsistency(void);
404
static void rm_redo_error_callback(void *arg);
405
#ifdef WAL_DEBUG
406
static void xlog_outrec(StringInfo buf, XLogReaderState *record);
407
#endif
408
static void xlog_block_info(StringInfo buf, XLogReaderState *record);
409
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
410
                TimeLineID prevTLI, TimeLineID replayTLI);
411
static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
412
static void verifyBackupPageConsistency(XLogReaderState *record);
413
414
static bool recoveryStopsBefore(XLogReaderState *record);
415
static bool recoveryStopsAfter(XLogReaderState *record);
416
static char *getRecoveryStopReason(void);
417
static void recoveryPausesHere(bool endOfRecovery);
418
static bool recoveryApplyDelay(XLogReaderState *record);
419
static void ConfirmRecoveryPaused(void);
420
421
static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher,
422
                int emode, bool fetching_ckpt,
423
                TimeLineID replayTLI);
424
425
static int  XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
426
             int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
427
static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
428
                            bool randAccess,
429
                            bool fetching_ckpt,
430
                            XLogRecPtr tliRecPtr,
431
                            TimeLineID replayTLI,
432
                            XLogRecPtr replayLSN,
433
                            bool nonblocking);
434
static int  emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
435
static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
436
                    XLogRecPtr RecPtr, TimeLineID replayTLI);
437
static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
438
static int  XLogFileRead(XLogSegNo segno, TimeLineID tli,
439
             XLogSource source, bool notfoundOk);
440
static int  XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source);
441
442
static bool CheckForStandbyTrigger(void);
443
static void SetPromoteIsTriggered(void);
444
static bool HotStandbyActiveInReplay(void);
445
446
static void SetCurrentChunkStartTime(TimestampTz xtime);
447
static void SetLatestXTime(TimestampTz xtime);
448
449
/*
450
 * Initialization of shared memory for WAL recovery
451
 */
452
Size
453
XLogRecoveryShmemSize(void)
454
0
{
455
0
  Size    size;
456
457
  /* XLogRecoveryCtl */
458
0
  size = sizeof(XLogRecoveryCtlData);
459
460
0
  return size;
461
0
}
462
463
void
464
XLogRecoveryShmemInit(void)
465
0
{
466
0
  bool    found;
467
468
0
  XLogRecoveryCtl = (XLogRecoveryCtlData *)
469
0
    ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
470
0
  if (found)
471
0
    return;
472
0
  memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData));
473
474
0
  SpinLockInit(&XLogRecoveryCtl->info_lck);
475
0
  InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
476
0
  ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
477
0
}
478
479
/*
480
 * A thin wrapper to enable StandbyMode and do other preparatory work as
481
 * needed.
482
 */
483
static void
484
EnableStandbyMode(void)
485
0
{
486
0
  StandbyMode = true;
487
488
  /*
489
   * To avoid server log bloat, we don't report recovery progress in a
490
   * standby as it will always be in recovery unless promoted. We disable
491
   * startup progress timeout in standby mode to avoid calling
492
   * startup_progress_timeout_handler() unnecessarily.
493
   */
494
0
  disable_startup_progress_timeout();
495
0
}
496
497
/*
498
 * Prepare the system for WAL recovery, if needed.
499
 *
500
 * This is called by StartupXLOG() which coordinates the server startup
501
 * sequence.  This function analyzes the control file and the backup label
502
 * file, if any, and figures out whether we need to perform crash recovery or
503
 * archive recovery, and how far we need to replay the WAL to reach a
504
 * consistent state.
505
 *
506
 * This doesn't yet change the on-disk state, except for creating the symlinks
507
 * from table space map file if any, and for fetching WAL files needed to find
508
 * the checkpoint record.  On entry, the caller has already read the control
509
 * file into memory, and passes it as argument.  This function updates it to
510
 * reflect the recovery state, and the caller is expected to write it back to
511
 * disk after initializing other subsystems, but before calling
512
 * PerformWalRecovery().
513
 *
514
 * This initializes some global variables, such as ArchiveRecoveryRequested,
515
 * StandbyModeRequested and InRecovery.
516
 */
517
void
518
InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
519
        bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
520
0
{
521
0
  XLogPageReadPrivate *private;
522
0
  struct stat st;
523
0
  bool    wasShutdown;
524
0
  XLogRecord *record;
525
0
  DBState   dbstate_at_startup;
526
0
  bool    haveTblspcMap = false;
527
0
  bool    haveBackupLabel = false;
528
0
  CheckPoint  checkPoint;
529
0
  bool    backupFromStandby = false;
530
531
0
  dbstate_at_startup = ControlFile->state;
532
533
  /*
534
   * Initialize on the assumption we want to recover to the latest timeline
535
   * that's active according to pg_control.
536
   */
537
0
  if (ControlFile->minRecoveryPointTLI >
538
0
    ControlFile->checkPointCopy.ThisTimeLineID)
539
0
    recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
540
0
  else
541
0
    recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
542
543
  /*
544
   * Check for signal files, and if so set up state for offline recovery
545
   */
546
0
  readRecoverySignalFile();
547
0
  validateRecoveryParameters();
548
549
  /*
550
   * Take ownership of the wakeup latch if we're going to sleep during
551
   * recovery, if required.
552
   */
553
0
  if (ArchiveRecoveryRequested)
554
0
    OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
555
556
  /*
557
   * Set the WAL reading processor now, as it will be needed when reading
558
   * the checkpoint record required (backup_label or not).
559
   */
560
0
  private = palloc0(sizeof(XLogPageReadPrivate));
561
0
  xlogreader =
562
0
    XLogReaderAllocate(wal_segment_size, NULL,
563
0
               XL_ROUTINE(.page_read = &XLogPageRead,
564
0
                    .segment_open = NULL,
565
0
                    .segment_close = wal_segment_close),
566
0
               private);
567
0
  if (!xlogreader)
568
0
    ereport(ERROR,
569
0
        (errcode(ERRCODE_OUT_OF_MEMORY),
570
0
         errmsg("out of memory"),
571
0
         errdetail("Failed while allocating a WAL reading processor.")));
572
0
  xlogreader->system_identifier = ControlFile->system_identifier;
573
574
  /*
575
   * Set the WAL decode buffer size.  This limits how far ahead we can read
576
   * in the WAL.
577
   */
578
0
  XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size);
579
580
  /* Create a WAL prefetcher. */
581
0
  xlogprefetcher = XLogPrefetcherAllocate(xlogreader);
582
583
  /*
584
   * Allocate two page buffers dedicated to WAL consistency checks.  We do
585
   * it this way, rather than just making static arrays, for two reasons:
586
   * (1) no need to waste the storage in most instantiations of the backend;
587
   * (2) a static char array isn't guaranteed to have any particular
588
   * alignment, whereas palloc() will provide MAXALIGN'd storage.
589
   */
590
0
  replay_image_masked = (char *) palloc(BLCKSZ);
591
0
  primary_image_masked = (char *) palloc(BLCKSZ);
592
593
  /*
594
   * Read the backup_label file.  We want to run this part of the recovery
595
   * process after checking for signal files and after performing validation
596
   * of the recovery parameters.
597
   */
598
0
  if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired,
599
0
              &backupFromStandby))
600
0
  {
601
0
    List     *tablespaces = NIL;
602
603
    /*
604
     * Archive recovery was requested, and thanks to the backup label
605
     * file, we know how far we need to replay to reach consistency. Enter
606
     * archive recovery directly.
607
     */
608
0
    InArchiveRecovery = true;
609
0
    if (StandbyModeRequested)
610
0
      EnableStandbyMode();
611
612
    /*
613
     * Omitting backup_label when creating a new replica, PITR node etc.
614
     * unfortunately is a common cause of corruption.  Logging that
615
     * backup_label was used makes it a bit easier to exclude that as the
616
     * cause of observed corruption.
617
     *
618
     * Do so before we try to read the checkpoint record (which can fail),
619
     * as otherwise it can be hard to understand why a checkpoint other
620
     * than ControlFile->checkPoint is used.
621
     */
622
0
    ereport(LOG,
623
0
        errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u",
624
0
             LSN_FORMAT_ARGS(RedoStartLSN),
625
0
             LSN_FORMAT_ARGS(CheckPointLoc),
626
0
             CheckPointTLI));
627
628
    /*
629
     * When a backup_label file is present, we want to roll forward from
630
     * the checkpoint it identifies, rather than using pg_control.
631
     */
632
0
    record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
633
0
                    CheckPointTLI);
634
0
    if (record != NULL)
635
0
    {
636
0
      memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
637
0
      wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
638
0
      ereport(DEBUG1,
639
0
          errmsg_internal("checkpoint record is at %X/%08X",
640
0
                  LSN_FORMAT_ARGS(CheckPointLoc)));
641
0
      InRecovery = true;  /* force recovery even if SHUTDOWNED */
642
643
      /*
644
       * Make sure that REDO location exists. This may not be the case
645
       * if there was a crash during an online backup, which left a
646
       * backup_label around that references a WAL segment that's
647
       * already been archived.
648
       */
649
0
      if (checkPoint.redo < CheckPointLoc)
650
0
      {
651
0
        XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
652
0
        if (!ReadRecord(xlogprefetcher, LOG, false,
653
0
                checkPoint.ThisTimeLineID))
654
0
          ereport(FATAL,
655
0
              errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
656
0
                   LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)),
657
0
              errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
658
0
                  "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
659
0
                  "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
660
0
                  DataDir, DataDir, DataDir, DataDir));
661
0
      }
662
0
    }
663
0
    else
664
0
    {
665
0
      ereport(FATAL,
666
0
          errmsg("could not locate required checkpoint record at %X/%08X",
667
0
               LSN_FORMAT_ARGS(CheckPointLoc)),
668
0
          errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n"
669
0
              "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
670
0
              "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
671
0
              DataDir, DataDir, DataDir, DataDir));
672
0
      wasShutdown = false;  /* keep compiler quiet */
673
0
    }
674
675
    /* Read the tablespace_map file if present and create symlinks. */
676
0
    if (read_tablespace_map(&tablespaces))
677
0
    {
678
0
      ListCell   *lc;
679
680
0
      foreach(lc, tablespaces)
681
0
      {
682
0
        tablespaceinfo *ti = lfirst(lc);
683
0
        char     *linkloc;
684
685
0
        linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid);
686
687
        /*
688
         * Remove the existing symlink if any and Create the symlink
689
         * under PGDATA.
690
         */
691
0
        remove_tablespace_symlink(linkloc);
692
693
0
        if (symlink(ti->path, linkloc) < 0)
694
0
          ereport(ERROR,
695
0
              (errcode_for_file_access(),
696
0
               errmsg("could not create symbolic link \"%s\": %m",
697
0
                  linkloc)));
698
699
0
        pfree(ti->path);
700
0
        pfree(ti);
701
0
      }
702
703
      /* tell the caller to delete it later */
704
0
      haveTblspcMap = true;
705
0
    }
706
707
    /* tell the caller to delete it later */
708
0
    haveBackupLabel = true;
709
0
  }
710
0
  else
711
0
  {
712
    /* No backup_label file has been found if we are here. */
713
714
    /*
715
     * If tablespace_map file is present without backup_label file, there
716
     * is no use of such file.  There is no harm in retaining it, but it
717
     * is better to get rid of the map file so that we don't have any
718
     * redundant file in data directory and it will avoid any sort of
719
     * confusion.  It seems prudent though to just rename the file out of
720
     * the way rather than delete it completely, also we ignore any error
721
     * that occurs in rename operation as even if map file is present
722
     * without backup_label file, it is harmless.
723
     */
724
0
    if (stat(TABLESPACE_MAP, &st) == 0)
725
0
    {
726
0
      unlink(TABLESPACE_MAP_OLD);
727
0
      if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
728
0
        ereport(LOG,
729
0
            (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
730
0
                TABLESPACE_MAP, BACKUP_LABEL_FILE),
731
0
             errdetail("File \"%s\" was renamed to \"%s\".",
732
0
                   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
733
0
      else
734
0
        ereport(LOG,
735
0
            (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
736
0
                TABLESPACE_MAP, BACKUP_LABEL_FILE),
737
0
             errdetail("Could not rename file \"%s\" to \"%s\": %m.",
738
0
                   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
739
0
    }
740
741
    /*
742
     * It's possible that archive recovery was requested, but we don't
743
     * know how far we need to replay the WAL before we reach consistency.
744
     * This can happen for example if a base backup is taken from a
745
     * running server using an atomic filesystem snapshot, without calling
746
     * pg_backup_start/stop. Or if you just kill a running primary server
747
     * and put it into archive recovery by creating a recovery signal
748
     * file.
749
     *
750
     * Our strategy in that case is to perform crash recovery first,
751
     * replaying all the WAL present in pg_wal, and only enter archive
752
     * recovery after that.
753
     *
754
     * But usually we already know how far we need to replay the WAL (up
755
     * to minRecoveryPoint, up to backupEndPoint, or until we see an
756
     * end-of-backup record), and we can enter archive recovery directly.
757
     */
758
0
    if (ArchiveRecoveryRequested &&
759
0
      (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
760
0
       ControlFile->backupEndRequired ||
761
0
       ControlFile->backupEndPoint != InvalidXLogRecPtr ||
762
0
       ControlFile->state == DB_SHUTDOWNED))
763
0
    {
764
0
      InArchiveRecovery = true;
765
0
      if (StandbyModeRequested)
766
0
        EnableStandbyMode();
767
0
    }
768
769
    /*
770
     * For the same reason as when starting up with backup_label present,
771
     * emit a log message when we continue initializing from a base
772
     * backup.
773
     */
774
0
    if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
775
0
      ereport(LOG,
776
0
          errmsg("restarting backup recovery with redo LSN %X/%08X",
777
0
               LSN_FORMAT_ARGS(ControlFile->backupStartPoint)));
778
779
    /* Get the last valid checkpoint record. */
780
0
    CheckPointLoc = ControlFile->checkPoint;
781
0
    CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
782
0
    RedoStartLSN = ControlFile->checkPointCopy.redo;
783
0
    RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
784
0
    record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
785
0
                    CheckPointTLI);
786
0
    if (record != NULL)
787
0
    {
788
0
      ereport(DEBUG1,
789
0
          errmsg_internal("checkpoint record is at %X/%08X",
790
0
                  LSN_FORMAT_ARGS(CheckPointLoc)));
791
0
    }
792
0
    else
793
0
    {
794
      /*
795
       * We used to attempt to go back to a secondary checkpoint record
796
       * here, but only when not in standby mode. We now just fail if we
797
       * can't read the last checkpoint because this allows us to
798
       * simplify processing around checkpoints.
799
       */
800
0
      ereport(PANIC,
801
0
          errmsg("could not locate a valid checkpoint record at %X/%08X",
802
0
               LSN_FORMAT_ARGS(CheckPointLoc)));
803
0
    }
804
0
    memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
805
0
    wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
806
0
  }
807
808
0
  if (ArchiveRecoveryRequested)
809
0
  {
810
0
    if (StandbyModeRequested)
811
0
      ereport(LOG,
812
0
          (errmsg("entering standby mode")));
813
0
    else if (recoveryTarget == RECOVERY_TARGET_XID)
814
0
      ereport(LOG,
815
0
          (errmsg("starting point-in-time recovery to XID %u",
816
0
              recoveryTargetXid)));
817
0
    else if (recoveryTarget == RECOVERY_TARGET_TIME)
818
0
      ereport(LOG,
819
0
          (errmsg("starting point-in-time recovery to %s",
820
0
              timestamptz_to_str(recoveryTargetTime))));
821
0
    else if (recoveryTarget == RECOVERY_TARGET_NAME)
822
0
      ereport(LOG,
823
0
          (errmsg("starting point-in-time recovery to \"%s\"",
824
0
              recoveryTargetName)));
825
0
    else if (recoveryTarget == RECOVERY_TARGET_LSN)
826
0
      ereport(LOG,
827
0
          errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"",
828
0
               LSN_FORMAT_ARGS(recoveryTargetLSN)));
829
0
    else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
830
0
      ereport(LOG,
831
0
          (errmsg("starting point-in-time recovery to earliest consistent point")));
832
0
    else
833
0
      ereport(LOG,
834
0
          (errmsg("starting archive recovery")));
835
0
  }
836
837
  /*
838
   * If the location of the checkpoint record is not on the expected
839
   * timeline in the history of the requested timeline, we cannot proceed:
840
   * the backup is not part of the history of the requested timeline.
841
   */
842
0
  Assert(expectedTLEs);   /* was initialized by reading checkpoint
843
                 * record */
844
0
  if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
845
0
    CheckPointTLI)
846
0
  {
847
0
    XLogRecPtr  switchpoint;
848
849
    /*
850
     * tliSwitchPoint will throw an error if the checkpoint's timeline is
851
     * not in expectedTLEs at all.
852
     */
853
0
    switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL);
854
0
    ereport(FATAL,
855
0
        (errmsg("requested timeline %u is not a child of this server's history",
856
0
            recoveryTargetTLI),
857
    /* translator: %s is a backup_label file or a pg_control file */
858
0
         errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.",
859
0
               haveBackupLabel ? "backup_label" : "pg_control",
860
0
               LSN_FORMAT_ARGS(CheckPointLoc),
861
0
               CheckPointTLI,
862
0
               LSN_FORMAT_ARGS(switchpoint))));
863
0
  }
864
865
  /*
866
   * The min recovery point should be part of the requested timeline's
867
   * history, too.
868
   */
869
0
  if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
870
0
    tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
871
0
    ControlFile->minRecoveryPointTLI)
872
0
    ereport(FATAL,
873
0
        errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u",
874
0
             recoveryTargetTLI,
875
0
             LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
876
0
             ControlFile->minRecoveryPointTLI));
877
878
0
  ereport(DEBUG1,
879
0
      errmsg_internal("redo record is at %X/%08X; shutdown %s",
880
0
              LSN_FORMAT_ARGS(checkPoint.redo),
881
0
              wasShutdown ? "true" : "false"));
882
0
  ereport(DEBUG1,
883
0
      (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
884
0
               U64FromFullTransactionId(checkPoint.nextXid),
885
0
               checkPoint.nextOid)));
886
0
  ereport(DEBUG1,
887
0
      (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
888
0
               checkPoint.nextMulti, checkPoint.nextMultiOffset)));
889
0
  ereport(DEBUG1,
890
0
      (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
891
0
               checkPoint.oldestXid, checkPoint.oldestXidDB)));
892
0
  ereport(DEBUG1,
893
0
      (errmsg_internal("oldest MultiXactId: %u, in database %u",
894
0
               checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
895
0
  ereport(DEBUG1,
896
0
      (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
897
0
               checkPoint.oldestCommitTsXid,
898
0
               checkPoint.newestCommitTsXid)));
899
0
  if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
900
0
    ereport(PANIC,
901
0
        (errmsg("invalid next transaction ID")));
902
903
  /* sanity check */
904
0
  if (checkPoint.redo > CheckPointLoc)
905
0
    ereport(PANIC,
906
0
        (errmsg("invalid redo in checkpoint record")));
907
908
  /*
909
   * Check whether we need to force recovery from WAL.  If it appears to
910
   * have been a clean shutdown and we did not have a recovery signal file,
911
   * then assume no recovery needed.
912
   */
913
0
  if (checkPoint.redo < CheckPointLoc)
914
0
  {
915
0
    if (wasShutdown)
916
0
      ereport(PANIC,
917
0
          (errmsg("invalid redo record in shutdown checkpoint")));
918
0
    InRecovery = true;
919
0
  }
920
0
  else if (ControlFile->state != DB_SHUTDOWNED)
921
0
    InRecovery = true;
922
0
  else if (ArchiveRecoveryRequested)
923
0
  {
924
    /* force recovery due to presence of recovery signal file */
925
0
    InRecovery = true;
926
0
  }
927
928
  /*
929
   * If recovery is needed, update our in-memory copy of pg_control to show
930
   * that we are recovering and to show the selected checkpoint as the place
931
   * we are starting from. We also mark pg_control with any minimum recovery
932
   * stop point obtained from a backup history file.
933
   *
934
   * We don't write the changes to disk yet, though. Only do that after
935
   * initializing various subsystems.
936
   */
937
0
  if (InRecovery)
938
0
  {
939
0
    if (InArchiveRecovery)
940
0
    {
941
0
      ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
942
0
    }
943
0
    else
944
0
    {
945
0
      ereport(LOG,
946
0
          (errmsg("database system was not properly shut down; "
947
0
              "automatic recovery in progress")));
948
0
      if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
949
0
        ereport(LOG,
950
0
            (errmsg("crash recovery starts in timeline %u "
951
0
                "and has target timeline %u",
952
0
                ControlFile->checkPointCopy.ThisTimeLineID,
953
0
                recoveryTargetTLI)));
954
0
      ControlFile->state = DB_IN_CRASH_RECOVERY;
955
0
    }
956
0
    ControlFile->checkPoint = CheckPointLoc;
957
0
    ControlFile->checkPointCopy = checkPoint;
958
0
    if (InArchiveRecovery)
959
0
    {
960
      /* initialize minRecoveryPoint if not set yet */
961
0
      if (ControlFile->minRecoveryPoint < checkPoint.redo)
962
0
      {
963
0
        ControlFile->minRecoveryPoint = checkPoint.redo;
964
0
        ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
965
0
      }
966
0
    }
967
968
    /*
969
     * Set backupStartPoint if we're starting recovery from a base backup.
970
     *
971
     * Also set backupEndPoint and use minRecoveryPoint as the backup end
972
     * location if we're starting recovery from a base backup which was
973
     * taken from a standby. In this case, the database system status in
974
     * pg_control must indicate that the database was already in recovery.
975
     * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
976
     * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
977
     * before reaching this point; e.g. because restore_command or
978
     * primary_conninfo were faulty.
979
     *
980
     * Any other state indicates that the backup somehow became corrupted
981
     * and we can't sensibly continue with recovery.
982
     */
983
0
    if (haveBackupLabel)
984
0
    {
985
0
      ControlFile->backupStartPoint = checkPoint.redo;
986
0
      ControlFile->backupEndRequired = backupEndRequired;
987
988
0
      if (backupFromStandby)
989
0
      {
990
0
        if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
991
0
          dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
992
0
          ereport(FATAL,
993
0
              (errmsg("backup_label contains data inconsistent with control file"),
994
0
               errhint("This means that the backup is corrupted and you will "
995
0
                   "have to use another backup for recovery.")));
996
0
        ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
997
0
      }
998
0
    }
999
0
  }
1000
1001
  /* remember these, so that we know when we have reached consistency */
1002
0
  backupStartPoint = ControlFile->backupStartPoint;
1003
0
  backupEndRequired = ControlFile->backupEndRequired;
1004
0
  backupEndPoint = ControlFile->backupEndPoint;
1005
0
  if (InArchiveRecovery)
1006
0
  {
1007
0
    minRecoveryPoint = ControlFile->minRecoveryPoint;
1008
0
    minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
1009
0
  }
1010
0
  else
1011
0
  {
1012
0
    minRecoveryPoint = InvalidXLogRecPtr;
1013
0
    minRecoveryPointTLI = 0;
1014
0
  }
1015
1016
  /*
1017
   * Start recovery assuming that the final record isn't lost.
1018
   */
1019
0
  abortedRecPtr = InvalidXLogRecPtr;
1020
0
  missingContrecPtr = InvalidXLogRecPtr;
1021
1022
0
  *wasShutdown_ptr = wasShutdown;
1023
0
  *haveBackupLabel_ptr = haveBackupLabel;
1024
0
  *haveTblspcMap_ptr = haveTblspcMap;
1025
0
}
1026
1027
/*
1028
 * See if there are any recovery signal files and if so, set state for
1029
 * recovery.
1030
 *
1031
 * See if there is a recovery command file (recovery.conf), and if so
1032
 * throw an ERROR since as of PG12 we no longer recognize that.
1033
 */
1034
static void
1035
readRecoverySignalFile(void)
1036
0
{
1037
0
  struct stat stat_buf;
1038
1039
0
  if (IsBootstrapProcessingMode())
1040
0
    return;
1041
1042
  /*
1043
   * Check for old recovery API file: recovery.conf
1044
   */
1045
0
  if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
1046
0
    ereport(FATAL,
1047
0
        (errcode_for_file_access(),
1048
0
         errmsg("using recovery command file \"%s\" is not supported",
1049
0
            RECOVERY_COMMAND_FILE)));
1050
1051
  /*
1052
   * Remove unused .done file, if present. Ignore if absent.
1053
   */
1054
0
  unlink(RECOVERY_COMMAND_DONE);
1055
1056
  /*
1057
   * Check for recovery signal files and if found, fsync them since they
1058
   * represent server state information.  We don't sweat too much about the
1059
   * possibility of fsync failure, however.
1060
   *
1061
   * If present, standby signal file takes precedence. If neither is present
1062
   * then we won't enter archive recovery.
1063
   */
1064
0
  if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
1065
0
  {
1066
0
    int     fd;
1067
1068
0
    fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1069
0
                 S_IRUSR | S_IWUSR);
1070
0
    if (fd >= 0)
1071
0
    {
1072
0
      (void) pg_fsync(fd);
1073
0
      close(fd);
1074
0
    }
1075
0
    standby_signal_file_found = true;
1076
0
  }
1077
0
  else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
1078
0
  {
1079
0
    int     fd;
1080
1081
0
    fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY,
1082
0
                 S_IRUSR | S_IWUSR);
1083
0
    if (fd >= 0)
1084
0
    {
1085
0
      (void) pg_fsync(fd);
1086
0
      close(fd);
1087
0
    }
1088
0
    recovery_signal_file_found = true;
1089
0
  }
1090
1091
0
  StandbyModeRequested = false;
1092
0
  ArchiveRecoveryRequested = false;
1093
0
  if (standby_signal_file_found)
1094
0
  {
1095
0
    StandbyModeRequested = true;
1096
0
    ArchiveRecoveryRequested = true;
1097
0
  }
1098
0
  else if (recovery_signal_file_found)
1099
0
  {
1100
0
    StandbyModeRequested = false;
1101
0
    ArchiveRecoveryRequested = true;
1102
0
  }
1103
0
  else
1104
0
    return;
1105
1106
  /*
1107
   * We don't support standby mode in standalone backends; that requires
1108
   * other processes such as the WAL receiver to be alive.
1109
   */
1110
0
  if (StandbyModeRequested && !IsUnderPostmaster)
1111
0
    ereport(FATAL,
1112
0
        (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1113
0
         errmsg("standby mode is not supported by single-user servers")));
1114
0
}
1115
1116
static void
1117
validateRecoveryParameters(void)
1118
0
{
1119
0
  if (!ArchiveRecoveryRequested)
1120
0
    return;
1121
1122
  /*
1123
   * Check for compulsory parameters
1124
   */
1125
0
  if (StandbyModeRequested)
1126
0
  {
1127
0
    if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
1128
0
      (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
1129
0
      ereport(WARNING,
1130
0
          (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""),
1131
0
           errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
1132
0
  }
1133
0
  else
1134
0
  {
1135
0
    if (recoveryRestoreCommand == NULL ||
1136
0
      strcmp(recoveryRestoreCommand, "") == 0)
1137
0
      ereport(FATAL,
1138
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1139
0
           errmsg("must specify \"restore_command\" when standby mode is not enabled")));
1140
0
  }
1141
1142
  /*
1143
   * Override any inconsistent requests. Note that this is a change of
1144
   * behaviour in 9.5; prior to this we simply ignored a request to pause if
1145
   * hot_standby = off, which was surprising behaviour.
1146
   */
1147
0
  if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
1148
0
    !EnableHotStandby)
1149
0
    recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
1150
1151
  /*
1152
   * Final parsing of recovery_target_time string; see also
1153
   * check_recovery_target_time().
1154
   */
1155
0
  if (recoveryTarget == RECOVERY_TARGET_TIME)
1156
0
  {
1157
0
    recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
1158
0
                                   CStringGetDatum(recovery_target_time_string),
1159
0
                                   ObjectIdGetDatum(InvalidOid),
1160
0
                                   Int32GetDatum(-1)));
1161
0
  }
1162
1163
  /*
1164
   * If user specified recovery_target_timeline, validate it or compute the
1165
   * "latest" value.  We can't do this until after we've gotten the restore
1166
   * command and set InArchiveRecovery, because we need to fetch timeline
1167
   * history files from the archive.
1168
   */
1169
0
  if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
1170
0
  {
1171
0
    TimeLineID  rtli = recoveryTargetTLIRequested;
1172
1173
    /* Timeline 1 does not have a history file, all else should */
1174
0
    if (rtli != 1 && !existsTimeLineHistory(rtli))
1175
0
      ereport(FATAL,
1176
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1177
0
           errmsg("recovery target timeline %u does not exist",
1178
0
              rtli)));
1179
0
    recoveryTargetTLI = rtli;
1180
0
  }
1181
0
  else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
1182
0
  {
1183
    /* We start the "latest" search from pg_control's timeline */
1184
0
    recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
1185
0
  }
1186
0
  else
1187
0
  {
1188
    /*
1189
     * else we just use the recoveryTargetTLI as already read from
1190
     * ControlFile
1191
     */
1192
0
    Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
1193
0
  }
1194
0
}
1195
1196
/*
1197
 * read_backup_label: check to see if a backup_label file is present
1198
 *
1199
 * If we see a backup_label during recovery, we assume that we are recovering
1200
 * from a backup dump file, and we therefore roll forward from the checkpoint
1201
 * identified by the label file, NOT what pg_control says.  This avoids the
1202
 * problem that pg_control might have been archived one or more checkpoints
1203
 * later than the start of the dump, and so if we rely on it as the start
1204
 * point, we will fail to restore a consistent database state.
1205
 *
1206
 * Returns true if a backup_label was found (and fills the checkpoint
1207
 * location and TLI into *checkPointLoc and *backupLabelTLI, respectively);
1208
 * returns false if not. If this backup_label came from a streamed backup,
1209
 * *backupEndRequired is set to true. If this backup_label was created during
1210
 * recovery, *backupFromStandby is set to true.
1211
 *
1212
 * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN
1213
 * and TLI read from the backup file.
1214
 */
1215
static bool
1216
read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI,
1217
          bool *backupEndRequired, bool *backupFromStandby)
1218
0
{
1219
0
  char    startxlogfilename[MAXFNAMELEN];
1220
0
  TimeLineID  tli_from_walseg,
1221
0
        tli_from_file;
1222
0
  FILE     *lfp;
1223
0
  char    ch;
1224
0
  char    backuptype[20];
1225
0
  char    backupfrom[20];
1226
0
  char    backuplabel[MAXPGPATH];
1227
0
  char    backuptime[128];
1228
0
  uint32    hi,
1229
0
        lo;
1230
1231
  /* suppress possible uninitialized-variable warnings */
1232
0
  *checkPointLoc = InvalidXLogRecPtr;
1233
0
  *backupLabelTLI = 0;
1234
0
  *backupEndRequired = false;
1235
0
  *backupFromStandby = false;
1236
1237
  /*
1238
   * See if label file is present
1239
   */
1240
0
  lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
1241
0
  if (!lfp)
1242
0
  {
1243
0
    if (errno != ENOENT)
1244
0
      ereport(FATAL,
1245
0
          (errcode_for_file_access(),
1246
0
           errmsg("could not read file \"%s\": %m",
1247
0
              BACKUP_LABEL_FILE)));
1248
0
    return false;     /* it's not there, all is fine */
1249
0
  }
1250
1251
  /*
1252
   * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
1253
   * is pretty crude, but we are not expecting any variability in the file
1254
   * format).
1255
   */
1256
0
  if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c",
1257
0
         &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
1258
0
    ereport(FATAL,
1259
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1260
0
         errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1261
0
  RedoStartLSN = ((uint64) hi) << 32 | lo;
1262
0
  RedoStartTLI = tli_from_walseg;
1263
0
  if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c",
1264
0
         &hi, &lo, &ch) != 3 || ch != '\n')
1265
0
    ereport(FATAL,
1266
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1267
0
         errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
1268
0
  *checkPointLoc = ((uint64) hi) << 32 | lo;
1269
0
  *backupLabelTLI = tli_from_walseg;
1270
1271
  /*
1272
   * BACKUP METHOD lets us know if this was a typical backup ("streamed",
1273
   * which could mean either pg_basebackup or the pg_backup_start/stop
1274
   * method was used) or if this label came from somewhere else (the only
1275
   * other option today being from pg_rewind).  If this was a streamed
1276
   * backup then we know that we need to play through until we get to the
1277
   * end of the WAL which was generated during the backup (at which point we
1278
   * will have reached consistency and backupEndRequired will be reset to be
1279
   * false).
1280
   */
1281
0
  if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
1282
0
  {
1283
0
    if (strcmp(backuptype, "streamed") == 0)
1284
0
      *backupEndRequired = true;
1285
0
  }
1286
1287
  /*
1288
   * BACKUP FROM lets us know if this was from a primary or a standby.  If
1289
   * it was from a standby, we'll double-check that the control file state
1290
   * matches that of a standby.
1291
   */
1292
0
  if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
1293
0
  {
1294
0
    if (strcmp(backupfrom, "standby") == 0)
1295
0
      *backupFromStandby = true;
1296
0
  }
1297
1298
  /*
1299
   * Parse START TIME and LABEL. Those are not mandatory fields for recovery
1300
   * but checking for their presence is useful for debugging and the next
1301
   * sanity checks. Cope also with the fact that the result buffers have a
1302
   * pre-allocated size, hence if the backup_label file has been generated
1303
   * with strings longer than the maximum assumed here an incorrect parsing
1304
   * happens. That's fine as only minor consistency checks are done
1305
   * afterwards.
1306
   */
1307
0
  if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
1308
0
    ereport(DEBUG1,
1309
0
        (errmsg_internal("backup time %s in file \"%s\"",
1310
0
                 backuptime, BACKUP_LABEL_FILE)));
1311
1312
0
  if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
1313
0
    ereport(DEBUG1,
1314
0
        (errmsg_internal("backup label %s in file \"%s\"",
1315
0
                 backuplabel, BACKUP_LABEL_FILE)));
1316
1317
  /*
1318
   * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
1319
   * it as a sanity check if present.
1320
   */
1321
0
  if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
1322
0
  {
1323
0
    if (tli_from_walseg != tli_from_file)
1324
0
      ereport(FATAL,
1325
0
          (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1326
0
           errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
1327
0
           errdetail("Timeline ID parsed is %u, but expected %u.",
1328
0
                 tli_from_file, tli_from_walseg)));
1329
1330
0
    ereport(DEBUG1,
1331
0
        (errmsg_internal("backup timeline %u in file \"%s\"",
1332
0
                 tli_from_file, BACKUP_LABEL_FILE)));
1333
0
  }
1334
1335
0
  if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0)
1336
0
    ereport(FATAL,
1337
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1338
0
         errmsg("this is an incremental backup, not a data directory"),
1339
0
         errhint("Use pg_combinebackup to reconstruct a valid data directory.")));
1340
1341
0
  if (ferror(lfp) || FreeFile(lfp))
1342
0
    ereport(FATAL,
1343
0
        (errcode_for_file_access(),
1344
0
         errmsg("could not read file \"%s\": %m",
1345
0
            BACKUP_LABEL_FILE)));
1346
1347
0
  return true;
1348
0
}
1349
1350
/*
1351
 * read_tablespace_map: check to see if a tablespace_map file is present
1352
 *
1353
 * If we see a tablespace_map file during recovery, we assume that we are
1354
 * recovering from a backup dump file, and we therefore need to create symlinks
1355
 * as per the information present in tablespace_map file.
1356
 *
1357
 * Returns true if a tablespace_map file was found (and fills *tablespaces
1358
 * with a tablespaceinfo struct for each tablespace listed in the file);
1359
 * returns false if not.
1360
 */
1361
static bool
1362
read_tablespace_map(List **tablespaces)
1363
0
{
1364
0
  tablespaceinfo *ti;
1365
0
  FILE     *lfp;
1366
0
  char    str[MAXPGPATH];
1367
0
  int     ch,
1368
0
        i,
1369
0
        n;
1370
0
  bool    was_backslash;
1371
1372
  /*
1373
   * See if tablespace_map file is present
1374
   */
1375
0
  lfp = AllocateFile(TABLESPACE_MAP, "r");
1376
0
  if (!lfp)
1377
0
  {
1378
0
    if (errno != ENOENT)
1379
0
      ereport(FATAL,
1380
0
          (errcode_for_file_access(),
1381
0
           errmsg("could not read file \"%s\": %m",
1382
0
              TABLESPACE_MAP)));
1383
0
    return false;     /* it's not there, all is fine */
1384
0
  }
1385
1386
  /*
1387
   * Read and parse the link name and path lines from tablespace_map file
1388
   * (this code is pretty crude, but we are not expecting any variability in
1389
   * the file format).  De-escape any backslashes that were inserted.
1390
   */
1391
0
  i = 0;
1392
0
  was_backslash = false;
1393
0
  while ((ch = fgetc(lfp)) != EOF)
1394
0
  {
1395
0
    if (!was_backslash && (ch == '\n' || ch == '\r'))
1396
0
    {
1397
0
      char     *endp;
1398
1399
0
      if (i == 0)
1400
0
        continue;   /* \r immediately followed by \n */
1401
1402
      /*
1403
       * The de-escaped line should contain an OID followed by exactly
1404
       * one space followed by a path.  The path might start with
1405
       * spaces, so don't be too liberal about parsing.
1406
       */
1407
0
      str[i] = '\0';
1408
0
      n = 0;
1409
0
      while (str[n] && str[n] != ' ')
1410
0
        n++;
1411
0
      if (n < 1 || n >= i - 1)
1412
0
        ereport(FATAL,
1413
0
            (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1414
0
             errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1415
0
      str[n++] = '\0';
1416
1417
0
      ti = palloc0(sizeof(tablespaceinfo));
1418
0
      errno = 0;
1419
0
      ti->oid = strtoul(str, &endp, 10);
1420
0
      if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
1421
0
        ereport(FATAL,
1422
0
            (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1423
0
             errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1424
0
      ti->path = pstrdup(str + n);
1425
0
      *tablespaces = lappend(*tablespaces, ti);
1426
1427
0
      i = 0;
1428
0
      continue;
1429
0
    }
1430
0
    else if (!was_backslash && ch == '\\')
1431
0
      was_backslash = true;
1432
0
    else
1433
0
    {
1434
0
      if (i < sizeof(str) - 1)
1435
0
        str[i++] = ch;
1436
0
      was_backslash = false;
1437
0
    }
1438
0
  }
1439
1440
0
  if (i != 0 || was_backslash) /* last line not terminated? */
1441
0
    ereport(FATAL,
1442
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1443
0
         errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
1444
1445
0
  if (ferror(lfp) || FreeFile(lfp))
1446
0
    ereport(FATAL,
1447
0
        (errcode_for_file_access(),
1448
0
         errmsg("could not read file \"%s\": %m",
1449
0
            TABLESPACE_MAP)));
1450
1451
0
  return true;
1452
0
}
1453
1454
/*
1455
 * Finish WAL recovery.
1456
 *
1457
 * This does not close the 'xlogreader' yet, because in some cases the caller
1458
 * still wants to re-read the last checkpoint record by calling
1459
 * ReadCheckpointRecord().
1460
 *
1461
 * Returns the position of the last valid or applied record, after which new
1462
 * WAL should be appended, information about why recovery was ended, and some
1463
 * other things. See the EndOfWalRecoveryInfo struct for details.
1464
 */
1465
EndOfWalRecoveryInfo *
1466
FinishWalRecovery(void)
1467
0
{
1468
0
  EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo));
1469
0
  XLogRecPtr  lastRec;
1470
0
  TimeLineID  lastRecTLI;
1471
0
  XLogRecPtr  endOfLog;
1472
1473
  /*
1474
   * Kill WAL receiver, if it's still running, before we continue to write
1475
   * the startup checkpoint and aborted-contrecord records. It will trump
1476
   * over these records and subsequent ones if it's still alive when we
1477
   * start writing WAL.
1478
   */
1479
0
  XLogShutdownWalRcv();
1480
1481
  /*
1482
   * Shutdown the slot sync worker to drop any temporary slots acquired by
1483
   * it and to prevent it from keep trying to fetch the failover slots.
1484
   *
1485
   * We do not update the 'synced' column in 'pg_replication_slots' system
1486
   * view from true to false here, as any failed update could leave 'synced'
1487
   * column false for some slots. This could cause issues during slot sync
1488
   * after restarting the server as a standby. While updating the 'synced'
1489
   * column after switching to the new timeline is an option, it does not
1490
   * simplify the handling for the 'synced' column. Therefore, we retain the
1491
   * 'synced' column as true after promotion as it may provide useful
1492
   * information about the slot origin.
1493
   */
1494
0
  ShutDownSlotSync();
1495
1496
  /*
1497
   * We are now done reading the xlog from stream. Turn off streaming
1498
   * recovery to force fetching the files (which would be required at end of
1499
   * recovery, e.g., timeline history file) from archive or pg_wal.
1500
   *
1501
   * Note that standby mode must be turned off after killing WAL receiver,
1502
   * i.e., calling XLogShutdownWalRcv().
1503
   */
1504
0
  Assert(!WalRcvStreaming());
1505
0
  StandbyMode = false;
1506
1507
  /*
1508
   * Determine where to start writing WAL next.
1509
   *
1510
   * Re-fetch the last valid or last applied record, so we can identify the
1511
   * exact endpoint of what we consider the valid portion of WAL.  There may
1512
   * be an incomplete continuation record after that, in which case
1513
   * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will
1514
   * write a special OVERWRITE_CONTRECORD message to mark that the rest of
1515
   * it is intentionally missing.  See CreateOverwriteContrecordRecord().
1516
   *
1517
   * An important side-effect of this is to load the last page into
1518
   * xlogreader. The caller uses it to initialize the WAL for writing.
1519
   */
1520
0
  if (!InRecovery)
1521
0
  {
1522
0
    lastRec = CheckPointLoc;
1523
0
    lastRecTLI = CheckPointTLI;
1524
0
  }
1525
0
  else
1526
0
  {
1527
0
    lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr;
1528
0
    lastRecTLI = XLogRecoveryCtl->lastReplayedTLI;
1529
0
  }
1530
0
  XLogPrefetcherBeginRead(xlogprefetcher, lastRec);
1531
0
  (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI);
1532
0
  endOfLog = xlogreader->EndRecPtr;
1533
1534
  /*
1535
   * Remember the TLI in the filename of the XLOG segment containing the
1536
   * end-of-log.  It could be different from the timeline that endOfLog
1537
   * nominally belongs to, if there was a timeline switch in that segment,
1538
   * and we were reading the old WAL from a segment belonging to a higher
1539
   * timeline.
1540
   */
1541
0
  result->endOfLogTLI = xlogreader->seg.ws_tli;
1542
1543
0
  if (ArchiveRecoveryRequested)
1544
0
  {
1545
    /*
1546
     * We are no longer in archive recovery state.
1547
     *
1548
     * We are now done reading the old WAL.  Turn off archive fetching if
1549
     * it was active.
1550
     */
1551
0
    Assert(InArchiveRecovery);
1552
0
    InArchiveRecovery = false;
1553
1554
    /*
1555
     * If the ending log segment is still open, close it (to avoid
1556
     * problems on Windows with trying to rename or delete an open file).
1557
     */
1558
0
    if (readFile >= 0)
1559
0
    {
1560
0
      close(readFile);
1561
0
      readFile = -1;
1562
0
    }
1563
0
  }
1564
1565
  /*
1566
   * Copy the last partial block to the caller, for initializing the WAL
1567
   * buffer for appending new WAL.
1568
   */
1569
0
  if (endOfLog % XLOG_BLCKSZ != 0)
1570
0
  {
1571
0
    char     *page;
1572
0
    int     len;
1573
0
    XLogRecPtr  pageBeginPtr;
1574
1575
0
    pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ);
1576
0
    Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
1577
1578
    /* Copy the valid part of the last block */
1579
0
    len = endOfLog % XLOG_BLCKSZ;
1580
0
    page = palloc(len);
1581
0
    memcpy(page, xlogreader->readBuf, len);
1582
1583
0
    result->lastPageBeginPtr = pageBeginPtr;
1584
0
    result->lastPage = page;
1585
0
  }
1586
0
  else
1587
0
  {
1588
    /* There is no partial block to copy. */
1589
0
    result->lastPageBeginPtr = endOfLog;
1590
0
    result->lastPage = NULL;
1591
0
  }
1592
1593
  /*
1594
   * Create a comment for the history file to explain why and where timeline
1595
   * changed.
1596
   */
1597
0
  result->recoveryStopReason = getRecoveryStopReason();
1598
1599
0
  result->lastRec = lastRec;
1600
0
  result->lastRecTLI = lastRecTLI;
1601
0
  result->endOfLog = endOfLog;
1602
1603
0
  result->abortedRecPtr = abortedRecPtr;
1604
0
  result->missingContrecPtr = missingContrecPtr;
1605
1606
0
  result->standby_signal_file_found = standby_signal_file_found;
1607
0
  result->recovery_signal_file_found = recovery_signal_file_found;
1608
1609
0
  return result;
1610
0
}
1611
1612
/*
1613
 * Clean up the WAL reader and leftovers from restoring WAL from archive
1614
 */
1615
void
1616
ShutdownWalRecovery(void)
1617
0
{
1618
0
  char    recoveryPath[MAXPGPATH];
1619
1620
  /* Final update of pg_stat_recovery_prefetch. */
1621
0
  XLogPrefetcherComputeStats(xlogprefetcher);
1622
1623
  /* Shut down xlogreader */
1624
0
  if (readFile >= 0)
1625
0
  {
1626
0
    close(readFile);
1627
0
    readFile = -1;
1628
0
  }
1629
0
  pfree(xlogreader->private_data);
1630
0
  XLogReaderFree(xlogreader);
1631
0
  XLogPrefetcherFree(xlogprefetcher);
1632
1633
0
  if (ArchiveRecoveryRequested)
1634
0
  {
1635
    /*
1636
     * Since there might be a partial WAL segment named RECOVERYXLOG, get
1637
     * rid of it.
1638
     */
1639
0
    snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
1640
0
    unlink(recoveryPath); /* ignore any error */
1641
1642
    /* Get rid of any remaining recovered timeline-history file, too */
1643
0
    snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
1644
0
    unlink(recoveryPath); /* ignore any error */
1645
0
  }
1646
1647
  /*
1648
   * We don't need the latch anymore. It's not strictly necessary to disown
1649
   * it, but let's do it for the sake of tidiness.
1650
   */
1651
0
  if (ArchiveRecoveryRequested)
1652
0
    DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
1653
0
}
1654
1655
/*
1656
 * Perform WAL recovery.
1657
 *
1658
 * If the system was shut down cleanly, this is never called.
1659
 */
1660
void
1661
PerformWalRecovery(void)
1662
0
{
1663
0
  XLogRecord *record;
1664
0
  bool    reachedRecoveryTarget = false;
1665
0
  TimeLineID  replayTLI;
1666
1667
  /*
1668
   * Initialize shared variables for tracking progress of WAL replay, as if
1669
   * we had just replayed the record before the REDO location (or the
1670
   * checkpoint record itself, if it's a shutdown checkpoint).
1671
   */
1672
0
  SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1673
0
  if (RedoStartLSN < CheckPointLoc)
1674
0
  {
1675
0
    XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr;
1676
0
    XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN;
1677
0
    XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI;
1678
0
  }
1679
0
  else
1680
0
  {
1681
0
    XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
1682
0
    XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
1683
0
    XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI;
1684
0
  }
1685
0
  XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
1686
0
  XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI;
1687
0
  XLogRecoveryCtl->recoveryLastXTime = 0;
1688
0
  XLogRecoveryCtl->currentChunkStartTime = 0;
1689
0
  XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
1690
0
  SpinLockRelease(&XLogRecoveryCtl->info_lck);
1691
1692
  /* Also ensure XLogReceiptTime has a sane value */
1693
0
  XLogReceiptTime = GetCurrentTimestamp();
1694
1695
  /*
1696
   * Let postmaster know we've started redo now, so that it can launch the
1697
   * archiver if necessary.
1698
   */
1699
0
  if (IsUnderPostmaster)
1700
0
    SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
1701
1702
  /*
1703
   * Allow read-only connections immediately if we're consistent already.
1704
   */
1705
0
  CheckRecoveryConsistency();
1706
1707
  /*
1708
   * Find the first record that logically follows the checkpoint --- it
1709
   * might physically precede it, though.
1710
   */
1711
0
  if (RedoStartLSN < CheckPointLoc)
1712
0
  {
1713
    /* back up to find the record */
1714
0
    replayTLI = RedoStartTLI;
1715
0
    XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN);
1716
0
    record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI);
1717
1718
    /*
1719
     * If a checkpoint record's redo pointer points back to an earlier
1720
     * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO
1721
     * record.
1722
     */
1723
0
    if (record->xl_rmid != RM_XLOG_ID ||
1724
0
      (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO)
1725
0
      ereport(FATAL,
1726
0
          errmsg("unexpected record type found at redo point %X/%08X",
1727
0
               LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
1728
0
  }
1729
0
  else
1730
0
  {
1731
    /* just have to read next record after CheckPoint */
1732
0
    Assert(xlogreader->ReadRecPtr == CheckPointLoc);
1733
0
    replayTLI = CheckPointTLI;
1734
0
    record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1735
0
  }
1736
1737
0
  if (record != NULL)
1738
0
  {
1739
0
    TimestampTz xtime;
1740
0
    PGRUsage  ru0;
1741
1742
0
    pg_rusage_init(&ru0);
1743
1744
0
    InRedo = true;
1745
1746
0
    RmgrStartup();
1747
1748
0
    ereport(LOG,
1749
0
        errmsg("redo starts at %X/%08X",
1750
0
             LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)));
1751
1752
    /* Prepare to report progress of the redo phase. */
1753
0
    if (!StandbyMode)
1754
0
      begin_startup_progress_phase();
1755
1756
    /*
1757
     * main redo apply loop
1758
     */
1759
0
    do
1760
0
    {
1761
0
      if (!StandbyMode)
1762
0
        ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X",
1763
0
                     LSN_FORMAT_ARGS(xlogreader->ReadRecPtr));
1764
1765
#ifdef WAL_DEBUG
1766
      if (XLOG_DEBUG)
1767
      {
1768
        StringInfoData buf;
1769
1770
        initStringInfo(&buf);
1771
        appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ",
1772
                 LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1773
                 LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
1774
        xlog_outrec(&buf, xlogreader);
1775
        appendStringInfoString(&buf, " - ");
1776
        xlog_outdesc(&buf, xlogreader);
1777
        elog(LOG, "%s", buf.data);
1778
        pfree(buf.data);
1779
      }
1780
#endif
1781
1782
      /* Handle interrupt signals of startup process */
1783
0
      ProcessStartupProcInterrupts();
1784
1785
      /*
1786
       * Pause WAL replay, if requested by a hot-standby session via
1787
       * SetRecoveryPause().
1788
       *
1789
       * Note that we intentionally don't take the info_lck spinlock
1790
       * here.  We might therefore read a slightly stale value of the
1791
       * recoveryPause flag, but it can't be very stale (no worse than
1792
       * the last spinlock we did acquire).  Since a pause request is a
1793
       * pretty asynchronous thing anyway, possibly responding to it one
1794
       * WAL record later than we otherwise would is a minor issue, so
1795
       * it doesn't seem worth adding another spinlock cycle to prevent
1796
       * that.
1797
       */
1798
0
      if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1799
0
        RECOVERY_NOT_PAUSED)
1800
0
        recoveryPausesHere(false);
1801
1802
      /*
1803
       * Have we reached our recovery target?
1804
       */
1805
0
      if (recoveryStopsBefore(xlogreader))
1806
0
      {
1807
0
        reachedRecoveryTarget = true;
1808
0
        break;
1809
0
      }
1810
1811
      /*
1812
       * If we've been asked to lag the primary, wait on latch until
1813
       * enough time has passed.
1814
       */
1815
0
      if (recoveryApplyDelay(xlogreader))
1816
0
      {
1817
        /*
1818
         * We test for paused recovery again here. If user sets
1819
         * delayed apply, it may be because they expect to pause
1820
         * recovery in case of problems, so we must test again here
1821
         * otherwise pausing during the delay-wait wouldn't work.
1822
         */
1823
0
        if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
1824
0
          RECOVERY_NOT_PAUSED)
1825
0
          recoveryPausesHere(false);
1826
0
      }
1827
1828
      /*
1829
       * Apply the record
1830
       */
1831
0
      ApplyWalRecord(xlogreader, record, &replayTLI);
1832
1833
      /* Exit loop if we reached inclusive recovery target */
1834
0
      if (recoveryStopsAfter(xlogreader))
1835
0
      {
1836
0
        reachedRecoveryTarget = true;
1837
0
        break;
1838
0
      }
1839
1840
      /* Else, try to fetch the next WAL record */
1841
0
      record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
1842
0
    } while (record != NULL);
1843
1844
    /*
1845
     * end of main redo apply loop
1846
     */
1847
1848
0
    if (reachedRecoveryTarget)
1849
0
    {
1850
0
      if (!reachedConsistency)
1851
0
        ereport(FATAL,
1852
0
            (errmsg("requested recovery stop point is before consistent recovery point")));
1853
1854
      /*
1855
       * This is the last point where we can restart recovery with a new
1856
       * recovery target, if we shutdown and begin again. After this,
1857
       * Resource Managers may choose to do permanent corrective actions
1858
       * at end of recovery.
1859
       */
1860
0
      switch (recoveryTargetAction)
1861
0
      {
1862
0
        case RECOVERY_TARGET_ACTION_SHUTDOWN:
1863
1864
          /*
1865
           * exit with special return code to request shutdown of
1866
           * postmaster.  Log messages issued from postmaster.
1867
           */
1868
0
          proc_exit(3);
1869
1870
0
        case RECOVERY_TARGET_ACTION_PAUSE:
1871
0
          SetRecoveryPause(true);
1872
0
          recoveryPausesHere(true);
1873
1874
          /* drop into promote */
1875
1876
0
        case RECOVERY_TARGET_ACTION_PROMOTE:
1877
0
          break;
1878
0
      }
1879
0
    }
1880
1881
0
    RmgrCleanup();
1882
1883
0
    ereport(LOG,
1884
0
        errmsg("redo done at %X/%08X system usage: %s",
1885
0
             LSN_FORMAT_ARGS(xlogreader->ReadRecPtr),
1886
0
             pg_rusage_show(&ru0)));
1887
0
    xtime = GetLatestXTime();
1888
0
    if (xtime)
1889
0
      ereport(LOG,
1890
0
          (errmsg("last completed transaction was at log time %s",
1891
0
              timestamptz_to_str(xtime))));
1892
1893
0
    InRedo = false;
1894
0
  }
1895
0
  else
1896
0
  {
1897
    /* there are no WAL records following the checkpoint */
1898
0
    ereport(LOG,
1899
0
        (errmsg("redo is not required")));
1900
0
  }
1901
1902
  /*
1903
   * This check is intentionally after the above log messages that indicate
1904
   * how far recovery went.
1905
   */
1906
0
  if (ArchiveRecoveryRequested &&
1907
0
    recoveryTarget != RECOVERY_TARGET_UNSET &&
1908
0
    !reachedRecoveryTarget)
1909
0
    ereport(FATAL,
1910
0
        (errcode(ERRCODE_CONFIG_FILE_ERROR),
1911
0
         errmsg("recovery ended before configured recovery target was reached")));
1912
0
}
1913
1914
/*
1915
 * Subroutine of PerformWalRecovery, to apply one WAL record.
1916
 */
1917
static void
1918
ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI)
1919
0
{
1920
0
  ErrorContextCallback errcallback;
1921
0
  bool    switchedTLI = false;
1922
1923
  /* Setup error traceback support for ereport() */
1924
0
  errcallback.callback = rm_redo_error_callback;
1925
0
  errcallback.arg = xlogreader;
1926
0
  errcallback.previous = error_context_stack;
1927
0
  error_context_stack = &errcallback;
1928
1929
  /*
1930
   * TransamVariables->nextXid must be beyond record's xid.
1931
   */
1932
0
  AdvanceNextFullTransactionIdPastXid(record->xl_xid);
1933
1934
  /*
1935
   * Before replaying this record, check if this record causes the current
1936
   * timeline to change. The record is already considered to be part of the
1937
   * new timeline, so we update replayTLI before replaying it. That's
1938
   * important so that replayEndTLI, which is recorded as the minimum
1939
   * recovery point's TLI if recovery stops after this record, is set
1940
   * correctly.
1941
   */
1942
0
  if (record->xl_rmid == RM_XLOG_ID)
1943
0
  {
1944
0
    TimeLineID  newReplayTLI = *replayTLI;
1945
0
    TimeLineID  prevReplayTLI = *replayTLI;
1946
0
    uint8   info = record->xl_info & ~XLR_INFO_MASK;
1947
1948
0
    if (info == XLOG_CHECKPOINT_SHUTDOWN)
1949
0
    {
1950
0
      CheckPoint  checkPoint;
1951
1952
0
      memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1953
0
      newReplayTLI = checkPoint.ThisTimeLineID;
1954
0
      prevReplayTLI = checkPoint.PrevTimeLineID;
1955
0
    }
1956
0
    else if (info == XLOG_END_OF_RECOVERY)
1957
0
    {
1958
0
      xl_end_of_recovery xlrec;
1959
1960
0
      memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1961
0
      newReplayTLI = xlrec.ThisTimeLineID;
1962
0
      prevReplayTLI = xlrec.PrevTimeLineID;
1963
0
    }
1964
1965
0
    if (newReplayTLI != *replayTLI)
1966
0
    {
1967
      /* Check that it's OK to switch to this TLI */
1968
0
      checkTimeLineSwitch(xlogreader->EndRecPtr,
1969
0
                newReplayTLI, prevReplayTLI, *replayTLI);
1970
1971
      /* Following WAL records should be run with new TLI */
1972
0
      *replayTLI = newReplayTLI;
1973
0
      switchedTLI = true;
1974
0
    }
1975
0
  }
1976
1977
  /*
1978
   * Update shared replayEndRecPtr before replaying this record, so that
1979
   * XLogFlush will update minRecoveryPoint correctly.
1980
   */
1981
0
  SpinLockAcquire(&XLogRecoveryCtl->info_lck);
1982
0
  XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr;
1983
0
  XLogRecoveryCtl->replayEndTLI = *replayTLI;
1984
0
  SpinLockRelease(&XLogRecoveryCtl->info_lck);
1985
1986
  /*
1987
   * If we are attempting to enter Hot Standby mode, process XIDs we see
1988
   */
1989
0
  if (standbyState >= STANDBY_INITIALIZED &&
1990
0
    TransactionIdIsValid(record->xl_xid))
1991
0
    RecordKnownAssignedTransactionIds(record->xl_xid);
1992
1993
  /*
1994
   * Some XLOG record types that are related to recovery are processed
1995
   * directly here, rather than in xlog_redo()
1996
   */
1997
0
  if (record->xl_rmid == RM_XLOG_ID)
1998
0
    xlogrecovery_redo(xlogreader, *replayTLI);
1999
2000
  /* Now apply the WAL record itself */
2001
0
  GetRmgr(record->xl_rmid).rm_redo(xlogreader);
2002
2003
  /*
2004
   * After redo, check whether the backup pages associated with the WAL
2005
   * record are consistent with the existing pages. This check is done only
2006
   * if consistency check is enabled for this record.
2007
   */
2008
0
  if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
2009
0
    verifyBackupPageConsistency(xlogreader);
2010
2011
  /* Pop the error context stack */
2012
0
  error_context_stack = errcallback.previous;
2013
2014
  /*
2015
   * Update lastReplayedEndRecPtr after this record has been successfully
2016
   * replayed.
2017
   */
2018
0
  SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2019
0
  XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
2020
0
  XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
2021
0
  XLogRecoveryCtl->lastReplayedTLI = *replayTLI;
2022
0
  SpinLockRelease(&XLogRecoveryCtl->info_lck);
2023
2024
  /* ------
2025
   * Wakeup walsenders:
2026
   *
2027
   * On the standby, the WAL is flushed first (which will only wake up
2028
   * physical walsenders) and then applied, which will only wake up logical
2029
   * walsenders.
2030
   *
2031
   * Indeed, logical walsenders on standby can't decode and send data until
2032
   * it's been applied.
2033
   *
2034
   * Physical walsenders don't need to be woken up during replay unless
2035
   * cascading replication is allowed and time line change occurred (so that
2036
   * they can notice that they are on a new time line).
2037
   *
2038
   * That's why the wake up conditions are for:
2039
   *
2040
   *  - physical walsenders in case of new time line and cascade
2041
   *    replication is allowed
2042
   *  - logical walsenders in case cascade replication is allowed (could not
2043
   *    be created otherwise)
2044
   * ------
2045
   */
2046
0
  if (AllowCascadeReplication())
2047
0
    WalSndWakeup(switchedTLI, true);
2048
2049
  /*
2050
   * If rm_redo called XLogRequestWalReceiverReply, then we wake up the
2051
   * receiver so that it notices the updated lastReplayedEndRecPtr and sends
2052
   * a reply to the primary.
2053
   */
2054
0
  if (doRequestWalReceiverReply)
2055
0
  {
2056
0
    doRequestWalReceiverReply = false;
2057
0
    WalRcvForceReply();
2058
0
  }
2059
2060
  /* Allow read-only connections if we're consistent now */
2061
0
  CheckRecoveryConsistency();
2062
2063
  /* Is this a timeline switch? */
2064
0
  if (switchedTLI)
2065
0
  {
2066
    /*
2067
     * Before we continue on the new timeline, clean up any (possibly
2068
     * bogus) future WAL segments on the old timeline.
2069
     */
2070
0
    RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI);
2071
2072
    /* Reset the prefetcher. */
2073
0
    XLogPrefetchReconfigure();
2074
0
  }
2075
0
}
2076
2077
/*
2078
 * Some XLOG RM record types that are directly related to WAL recovery are
2079
 * handled here rather than in the xlog_redo()
2080
 */
2081
static void
2082
xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
2083
0
{
2084
0
  uint8   info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2085
0
  XLogRecPtr  lsn = record->EndRecPtr;
2086
2087
0
  Assert(XLogRecGetRmid(record) == RM_XLOG_ID);
2088
2089
0
  if (info == XLOG_OVERWRITE_CONTRECORD)
2090
0
  {
2091
    /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */
2092
0
    xl_overwrite_contrecord xlrec;
2093
2094
0
    memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
2095
0
    if (xlrec.overwritten_lsn != record->overwrittenRecPtr)
2096
0
      elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X",
2097
0
         LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2098
0
         LSN_FORMAT_ARGS(record->overwrittenRecPtr));
2099
2100
    /* We have safely skipped the aborted record */
2101
0
    abortedRecPtr = InvalidXLogRecPtr;
2102
0
    missingContrecPtr = InvalidXLogRecPtr;
2103
2104
0
    ereport(LOG,
2105
0
        errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s",
2106
0
             LSN_FORMAT_ARGS(xlrec.overwritten_lsn),
2107
0
             timestamptz_to_str(xlrec.overwrite_time)));
2108
2109
    /* Verifying the record should only happen once */
2110
0
    record->overwrittenRecPtr = InvalidXLogRecPtr;
2111
0
  }
2112
0
  else if (info == XLOG_BACKUP_END)
2113
0
  {
2114
0
    XLogRecPtr  startpoint;
2115
2116
0
    memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
2117
2118
0
    if (backupStartPoint == startpoint)
2119
0
    {
2120
      /*
2121
       * We have reached the end of base backup, the point where
2122
       * pg_backup_stop() was done.  The data on disk is now consistent
2123
       * (assuming we have also reached minRecoveryPoint).  Set
2124
       * backupEndPoint to the current LSN, so that the next call to
2125
       * CheckRecoveryConsistency() will notice it and do the
2126
       * end-of-backup processing.
2127
       */
2128
0
      elog(DEBUG1, "end of backup record reached");
2129
2130
0
      backupEndPoint = lsn;
2131
0
    }
2132
0
    else
2133
0
      elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X",
2134
0
         LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
2135
0
  }
2136
0
}
2137
2138
/*
2139
 * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2140
 * directories.
2141
 *
2142
 * Replay of database creation XLOG records for databases that were later
2143
 * dropped can create fake directories in pg_tblspc.  By the time consistency
2144
 * is reached these directories should have been removed; here we verify
2145
 * that this did indeed happen.  This is to be called at the point where
2146
 * consistent state is reached.
2147
 *
2148
 * allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2149
 * useful for testing purposes, and also allows for an escape hatch in case
2150
 * things go south.
2151
 */
2152
static void
2153
CheckTablespaceDirectory(void)
2154
0
{
2155
0
  DIR      *dir;
2156
0
  struct dirent *de;
2157
2158
0
  dir = AllocateDir(PG_TBLSPC_DIR);
2159
0
  while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL)
2160
0
  {
2161
0
    char    path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
2162
2163
    /* Skip entries of non-oid names */
2164
0
    if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2165
0
      continue;
2166
2167
0
    snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name);
2168
2169
0
    if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2170
0
      ereport(allow_in_place_tablespaces ? WARNING : PANIC,
2171
0
          (errcode(ERRCODE_DATA_CORRUPTED),
2172
0
           errmsg("unexpected directory entry \"%s\" found in %s",
2173
0
              de->d_name, PG_TBLSPC_DIR),
2174
0
           errdetail("All directory entries in %s/ should be symbolic links.",
2175
0
                 PG_TBLSPC_DIR),
2176
0
           errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete.")));
2177
0
  }
2178
0
}
2179
2180
/*
2181
 * Checks if recovery has reached a consistent state. When consistency is
2182
 * reached and we have a valid starting standby snapshot, tell postmaster
2183
 * that it can start accepting read-only connections.
2184
 */
2185
static void
2186
CheckRecoveryConsistency(void)
2187
0
{
2188
0
  XLogRecPtr  lastReplayedEndRecPtr;
2189
0
  TimeLineID  lastReplayedTLI;
2190
2191
  /*
2192
   * During crash recovery, we don't reach a consistent state until we've
2193
   * replayed all the WAL.
2194
   */
2195
0
  if (XLogRecPtrIsInvalid(minRecoveryPoint))
2196
0
    return;
2197
2198
0
  Assert(InArchiveRecovery);
2199
2200
  /*
2201
   * assume that we are called in the startup process, and hence don't need
2202
   * a lock to read lastReplayedEndRecPtr
2203
   */
2204
0
  lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr;
2205
0
  lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI;
2206
2207
  /*
2208
   * Have we reached the point where our base backup was completed?
2209
   */
2210
0
  if (!XLogRecPtrIsInvalid(backupEndPoint) &&
2211
0
    backupEndPoint <= lastReplayedEndRecPtr)
2212
0
  {
2213
0
    XLogRecPtr  saveBackupStartPoint = backupStartPoint;
2214
0
    XLogRecPtr  saveBackupEndPoint = backupEndPoint;
2215
2216
0
    elog(DEBUG1, "end of backup reached");
2217
2218
    /*
2219
     * We have reached the end of base backup, as indicated by pg_control.
2220
     * Update the control file accordingly.
2221
     */
2222
0
    ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
2223
0
    backupStartPoint = InvalidXLogRecPtr;
2224
0
    backupEndPoint = InvalidXLogRecPtr;
2225
0
    backupEndRequired = false;
2226
2227
0
    ereport(LOG,
2228
0
        errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X",
2229
0
             LSN_FORMAT_ARGS(saveBackupStartPoint),
2230
0
             LSN_FORMAT_ARGS(saveBackupEndPoint)));
2231
0
  }
2232
2233
  /*
2234
   * Have we passed our safe starting point? Note that minRecoveryPoint is
2235
   * known to be incorrectly set if recovering from a backup, until the
2236
   * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint.
2237
   * All we know prior to that is that we're not consistent yet.
2238
   */
2239
0
  if (!reachedConsistency && !backupEndRequired &&
2240
0
    minRecoveryPoint <= lastReplayedEndRecPtr)
2241
0
  {
2242
    /*
2243
     * Check to see if the XLOG sequence contained any unresolved
2244
     * references to uninitialized pages.
2245
     */
2246
0
    XLogCheckInvalidPages();
2247
2248
    /*
2249
     * Check that pg_tblspc doesn't contain any real directories. Replay
2250
     * of Database/CREATE_* records may have created fictitious tablespace
2251
     * directories that should have been removed by the time consistency
2252
     * was reached.
2253
     */
2254
0
    CheckTablespaceDirectory();
2255
2256
0
    reachedConsistency = true;
2257
0
    SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
2258
0
    ereport(LOG,
2259
0
        errmsg("consistent recovery state reached at %X/%08X",
2260
0
             LSN_FORMAT_ARGS(lastReplayedEndRecPtr)));
2261
0
  }
2262
2263
  /*
2264
   * Have we got a valid starting snapshot that will allow queries to be
2265
   * run? If so, we can tell postmaster that the database is consistent now,
2266
   * enabling connections.
2267
   */
2268
0
  if (standbyState == STANDBY_SNAPSHOT_READY &&
2269
0
    !LocalHotStandbyActive &&
2270
0
    reachedConsistency &&
2271
0
    IsUnderPostmaster)
2272
0
  {
2273
0
    SpinLockAcquire(&XLogRecoveryCtl->info_lck);
2274
0
    XLogRecoveryCtl->SharedHotStandbyActive = true;
2275
0
    SpinLockRelease(&XLogRecoveryCtl->info_lck);
2276
2277
0
    LocalHotStandbyActive = true;
2278
2279
0
    SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
2280
0
  }
2281
0
}
2282
2283
/*
2284
 * Error context callback for errors occurring during rm_redo().
2285
 */
2286
static void
2287
rm_redo_error_callback(void *arg)
2288
0
{
2289
0
  XLogReaderState *record = (XLogReaderState *) arg;
2290
0
  StringInfoData buf;
2291
2292
0
  initStringInfo(&buf);
2293
0
  xlog_outdesc(&buf, record);
2294
0
  xlog_block_info(&buf, record);
2295
2296
  /* translator: %s is a WAL record description */
2297
0
  errcontext("WAL redo at %X/%08X for %s",
2298
0
         LSN_FORMAT_ARGS(record->ReadRecPtr),
2299
0
         buf.data);
2300
2301
0
  pfree(buf.data);
2302
0
}
2303
2304
/*
2305
 * Returns a string describing an XLogRecord, consisting of its identity
2306
 * optionally followed by a colon, a space, and a further description.
2307
 */
2308
void
2309
xlog_outdesc(StringInfo buf, XLogReaderState *record)
2310
0
{
2311
0
  RmgrData  rmgr = GetRmgr(XLogRecGetRmid(record));
2312
0
  uint8   info = XLogRecGetInfo(record);
2313
0
  const char *id;
2314
2315
0
  appendStringInfoString(buf, rmgr.rm_name);
2316
0
  appendStringInfoChar(buf, '/');
2317
2318
0
  id = rmgr.rm_identify(info);
2319
0
  if (id == NULL)
2320
0
    appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
2321
0
  else
2322
0
    appendStringInfo(buf, "%s: ", id);
2323
2324
0
  rmgr.rm_desc(buf, record);
2325
0
}
2326
#ifdef WAL_DEBUG

/*
 * xlog_outrec
 *		Append header details of a WAL record to 'buf' (WAL_DEBUG builds
 *		only): previous-record LSN, xid, data length and block references.
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	/* link to the previous record, and the transaction id */
	appendStringInfo(buf, "prev %X/%08X; xid %u",
					 LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
					 XLogRecGetXid(record));

	/* length of the record's main data */
	appendStringInfo(buf, "; len %u", XLogRecGetDataLen(record));

	/* one entry per referenced block */
	xlog_block_info(buf, record);
}
#endif							/* WAL_DEBUG */
2342
2343
/*
2344
 * Returns a string giving information about all the blocks in an
2345
 * XLogRecord.
2346
 */
2347
static void
2348
xlog_block_info(StringInfo buf, XLogReaderState *record)
2349
0
{
2350
0
  int     block_id;
2351
2352
  /* decode block references */
2353
0
  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2354
0
  {
2355
0
    RelFileLocator rlocator;
2356
0
    ForkNumber  forknum;
2357
0
    BlockNumber blk;
2358
2359
0
    if (!XLogRecGetBlockTagExtended(record, block_id,
2360
0
                    &rlocator, &forknum, &blk, NULL))
2361
0
      continue;
2362
2363
0
    if (forknum != MAIN_FORKNUM)
2364
0
      appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u",
2365
0
               block_id,
2366
0
               rlocator.spcOid, rlocator.dbOid,
2367
0
               rlocator.relNumber,
2368
0
               forknum,
2369
0
               blk);
2370
0
    else
2371
0
      appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u",
2372
0
               block_id,
2373
0
               rlocator.spcOid, rlocator.dbOid,
2374
0
               rlocator.relNumber,
2375
0
               blk);
2376
0
    if (XLogRecHasBlockImage(record, block_id))
2377
0
      appendStringInfoString(buf, " FPW");
2378
0
  }
2379
0
}
2380
2381
2382
/*
2383
 * Check that it's OK to switch to new timeline during recovery.
2384
 *
2385
 * 'lsn' is the address of the shutdown checkpoint record we're about to
2386
 * replay. (Currently, timeline can only change at a shutdown checkpoint).
2387
 */
2388
static void
2389
checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI,
2390
          TimeLineID replayTLI)
2391
0
{
2392
  /* Check that the record agrees on what the current (old) timeline is */
2393
0
  if (prevTLI != replayTLI)
2394
0
    ereport(PANIC,
2395
0
        (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
2396
0
            prevTLI, replayTLI)));
2397
2398
  /*
2399
   * The new timeline better be in the list of timelines we expect to see,
2400
   * according to the timeline history. It should also not decrease.
2401
   */
2402
0
  if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs))
2403
0
    ereport(PANIC,
2404
0
        (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
2405
0
            newTLI, replayTLI)));
2406
2407
  /*
2408
   * If we have not yet reached min recovery point, and we're about to
2409
   * switch to a timeline greater than the timeline of the min recovery
2410
   * point: trouble. After switching to the new timeline, we could not
2411
   * possibly visit the min recovery point on the correct timeline anymore.
2412
   * This can happen if there is a newer timeline in the archive that
2413
   * branched before the timeline the min recovery point is on, and you
2414
   * attempt to do PITR to the new timeline.
2415
   */
2416
0
  if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
2417
0
    lsn < minRecoveryPoint &&
2418
0
    newTLI > minRecoveryPointTLI)
2419
0
    ereport(PANIC,
2420
0
        errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u",
2421
0
             newTLI,
2422
0
             LSN_FORMAT_ARGS(minRecoveryPoint),
2423
0
             minRecoveryPointTLI));
2424
2425
  /* Looks good */
2426
0
}
2427
2428
2429
/*
2430
 * Extract timestamp from WAL record.
2431
 *
2432
 * If the record contains a timestamp, returns true, and saves the timestamp
2433
 * in *recordXtime. If the record type has no timestamp, returns false.
2434
 * Currently, only transaction commit/abort records and restore points contain
2435
 * timestamps.
2436
 */
2437
static bool
2438
getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
2439
0
{
2440
0
  uint8   info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2441
0
  uint8   xact_info = info & XLOG_XACT_OPMASK;
2442
0
  uint8   rmid = XLogRecGetRmid(record);
2443
2444
0
  if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2445
0
  {
2446
0
    *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
2447
0
    return true;
2448
0
  }
2449
0
  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
2450
0
                 xact_info == XLOG_XACT_COMMIT_PREPARED))
2451
0
  {
2452
0
    *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
2453
0
    return true;
2454
0
  }
2455
0
  if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
2456
0
                 xact_info == XLOG_XACT_ABORT_PREPARED))
2457
0
  {
2458
0
    *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
2459
0
    return true;
2460
0
  }
2461
0
  return false;
2462
0
}
2463
2464
/*
2465
 * Checks whether the current buffer page and backup page stored in the
2466
 * WAL record are consistent or not. Before comparing the two pages, a
2467
 * masking can be applied to the pages to ignore certain areas like hint bits,
2468
 * unused space between pd_lower and pd_upper among other things. This
2469
 * function should be called once WAL replay has been completed for a
2470
 * given record.
2471
 */
2472
static void
2473
verifyBackupPageConsistency(XLogReaderState *record)
2474
0
{
2475
0
  RmgrData  rmgr = GetRmgr(XLogRecGetRmid(record));
2476
0
  RelFileLocator rlocator;
2477
0
  ForkNumber  forknum;
2478
0
  BlockNumber blkno;
2479
0
  int     block_id;
2480
2481
  /* Records with no backup blocks have no need for consistency checks. */
2482
0
  if (!XLogRecHasAnyBlockRefs(record))
2483
0
    return;
2484
2485
0
  Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
2486
2487
0
  for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
2488
0
  {
2489
0
    Buffer    buf;
2490
0
    Page    page;
2491
2492
0
    if (!XLogRecGetBlockTagExtended(record, block_id,
2493
0
                    &rlocator, &forknum, &blkno, NULL))
2494
0
    {
2495
      /*
2496
       * WAL record doesn't contain a block reference with the given id.
2497
       * Do nothing.
2498
       */
2499
0
      continue;
2500
0
    }
2501
2502
0
    Assert(XLogRecHasBlockImage(record, block_id));
2503
2504
0
    if (XLogRecBlockImageApply(record, block_id))
2505
0
    {
2506
      /*
2507
       * WAL record has already applied the page, so bypass the
2508
       * consistency check as that would result in comparing the full
2509
       * page stored in the record with itself.
2510
       */
2511
0
      continue;
2512
0
    }
2513
2514
    /*
2515
     * Read the contents from the current buffer and store it in a
2516
     * temporary page.
2517
     */
2518
0
    buf = XLogReadBufferExtended(rlocator, forknum, blkno,
2519
0
                   RBM_NORMAL_NO_LOG,
2520
0
                   InvalidBuffer);
2521
0
    if (!BufferIsValid(buf))
2522
0
      continue;
2523
2524
0
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
2525
0
    page = BufferGetPage(buf);
2526
2527
    /*
2528
     * Take a copy of the local page where WAL has been applied to have a
2529
     * comparison base before masking it...
2530
     */
2531
0
    memcpy(replay_image_masked, page, BLCKSZ);
2532
2533
    /* No need for this page anymore now that a copy is in. */
2534
0
    UnlockReleaseBuffer(buf);
2535
2536
    /*
2537
     * If the block LSN is already ahead of this WAL record, we can't
2538
     * expect contents to match.  This can happen if recovery is
2539
     * restarted.
2540
     */
2541
0
    if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
2542
0
      continue;
2543
2544
    /*
2545
     * Read the contents from the backup copy, stored in WAL record and
2546
     * store it in a temporary page. There is no need to allocate a new
2547
     * page here, a local buffer is fine to hold its contents and a mask
2548
     * can be directly applied on it.
2549
     */
2550
0
    if (!RestoreBlockImage(record, block_id, primary_image_masked))
2551
0
      ereport(ERROR,
2552
0
          (errcode(ERRCODE_INTERNAL_ERROR),
2553
0
           errmsg_internal("%s", record->errormsg_buf)));
2554
2555
    /*
2556
     * If masking function is defined, mask both the primary and replay
2557
     * images
2558
     */
2559
0
    if (rmgr.rm_mask != NULL)
2560
0
    {
2561
0
      rmgr.rm_mask(replay_image_masked, blkno);
2562
0
      rmgr.rm_mask(primary_image_masked, blkno);
2563
0
    }
2564
2565
    /* Time to compare the primary and replay images. */
2566
0
    if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
2567
0
    {
2568
0
      elog(FATAL,
2569
0
         "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
2570
0
         rlocator.spcOid, rlocator.dbOid, rlocator.relNumber,
2571
0
         forknum, blkno);
2572
0
    }
2573
0
  }
2574
0
}
2575
2576
/*
2577
 * For point-in-time recovery, this function decides whether we want to
2578
 * stop applying the XLOG before the current record.
2579
 *
2580
 * Returns true if we are stopping, false otherwise. If stopping, some
2581
 * information is saved in recoveryStopXid et al for use in annotating the
2582
 * new timeline's history file.
2583
 */
2584
static bool
2585
recoveryStopsBefore(XLogReaderState *record)
2586
0
{
2587
0
  bool    stopsHere = false;
2588
0
  uint8   xact_info;
2589
0
  bool    isCommit;
2590
0
  TimestampTz recordXtime = 0;
2591
0
  TransactionId recordXid;
2592
2593
  /*
2594
   * Ignore recovery target settings when not in archive recovery (meaning
2595
   * we are in crash recovery).
2596
   */
2597
0
  if (!ArchiveRecoveryRequested)
2598
0
    return false;
2599
2600
  /* Check if we should stop as soon as reaching consistency */
2601
0
  if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2602
0
  {
2603
0
    ereport(LOG,
2604
0
        (errmsg("recovery stopping after reaching consistency")));
2605
2606
0
    recoveryStopAfter = false;
2607
0
    recoveryStopXid = InvalidTransactionId;
2608
0
    recoveryStopLSN = InvalidXLogRecPtr;
2609
0
    recoveryStopTime = 0;
2610
0
    recoveryStopName[0] = '\0';
2611
0
    return true;
2612
0
  }
2613
2614
  /* Check if target LSN has been reached */
2615
0
  if (recoveryTarget == RECOVERY_TARGET_LSN &&
2616
0
    !recoveryTargetInclusive &&
2617
0
    record->ReadRecPtr >= recoveryTargetLSN)
2618
0
  {
2619
0
    recoveryStopAfter = false;
2620
0
    recoveryStopXid = InvalidTransactionId;
2621
0
    recoveryStopLSN = record->ReadRecPtr;
2622
0
    recoveryStopTime = 0;
2623
0
    recoveryStopName[0] = '\0';
2624
0
    ereport(LOG,
2625
0
        errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"",
2626
0
             LSN_FORMAT_ARGS(recoveryStopLSN)));
2627
0
    return true;
2628
0
  }
2629
2630
  /* Otherwise we only consider stopping before COMMIT or ABORT records. */
2631
0
  if (XLogRecGetRmid(record) != RM_XACT_ID)
2632
0
    return false;
2633
2634
0
  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
2635
2636
0
  if (xact_info == XLOG_XACT_COMMIT)
2637
0
  {
2638
0
    isCommit = true;
2639
0
    recordXid = XLogRecGetXid(record);
2640
0
  }
2641
0
  else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2642
0
  {
2643
0
    xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2644
0
    xl_xact_parsed_commit parsed;
2645
2646
0
    isCommit = true;
2647
0
    ParseCommitRecord(XLogRecGetInfo(record),
2648
0
              xlrec,
2649
0
              &parsed);
2650
0
    recordXid = parsed.twophase_xid;
2651
0
  }
2652
0
  else if (xact_info == XLOG_XACT_ABORT)
2653
0
  {
2654
0
    isCommit = false;
2655
0
    recordXid = XLogRecGetXid(record);
2656
0
  }
2657
0
  else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2658
0
  {
2659
0
    xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2660
0
    xl_xact_parsed_abort parsed;
2661
2662
0
    isCommit = false;
2663
0
    ParseAbortRecord(XLogRecGetInfo(record),
2664
0
             xlrec,
2665
0
             &parsed);
2666
0
    recordXid = parsed.twophase_xid;
2667
0
  }
2668
0
  else
2669
0
    return false;
2670
2671
0
  if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
2672
0
  {
2673
    /*
2674
     * There can be only one transaction end record with this exact
2675
     * transactionid
2676
     *
2677
     * when testing for an xid, we MUST test for equality only, since
2678
     * transactions are numbered in the order they start, not the order
2679
     * they complete. A higher numbered xid will complete before you about
2680
     * 50% of the time...
2681
     */
2682
0
    stopsHere = (recordXid == recoveryTargetXid);
2683
0
  }
2684
2685
  /*
2686
   * Note: we must fetch recordXtime regardless of recoveryTarget setting.
2687
   * We don't expect getRecordTimestamp ever to fail, since we already know
2688
   * this is a commit or abort record; but test its result anyway.
2689
   */
2690
0
  if (getRecordTimestamp(record, &recordXtime) &&
2691
0
    recoveryTarget == RECOVERY_TARGET_TIME)
2692
0
  {
2693
    /*
2694
     * There can be many transactions that share the same commit time, so
2695
     * we stop after the last one, if we are inclusive, or stop at the
2696
     * first one if we are exclusive
2697
     */
2698
0
    if (recoveryTargetInclusive)
2699
0
      stopsHere = (recordXtime > recoveryTargetTime);
2700
0
    else
2701
0
      stopsHere = (recordXtime >= recoveryTargetTime);
2702
0
  }
2703
2704
0
  if (stopsHere)
2705
0
  {
2706
0
    recoveryStopAfter = false;
2707
0
    recoveryStopXid = recordXid;
2708
0
    recoveryStopTime = recordXtime;
2709
0
    recoveryStopLSN = InvalidXLogRecPtr;
2710
0
    recoveryStopName[0] = '\0';
2711
2712
0
    if (isCommit)
2713
0
    {
2714
0
      ereport(LOG,
2715
0
          (errmsg("recovery stopping before commit of transaction %u, time %s",
2716
0
              recoveryStopXid,
2717
0
              timestamptz_to_str(recoveryStopTime))));
2718
0
    }
2719
0
    else
2720
0
    {
2721
0
      ereport(LOG,
2722
0
          (errmsg("recovery stopping before abort of transaction %u, time %s",
2723
0
              recoveryStopXid,
2724
0
              timestamptz_to_str(recoveryStopTime))));
2725
0
    }
2726
0
  }
2727
2728
0
  return stopsHere;
2729
0
}
2730
2731
/*
2732
 * Same as recoveryStopsBefore, but called after applying the record.
2733
 *
2734
 * We also track the timestamp of the latest applied COMMIT/ABORT
2735
 * record in XLogRecoveryCtl->recoveryLastXTime.
2736
 */
2737
static bool
2738
recoveryStopsAfter(XLogReaderState *record)
2739
0
{
2740
0
  uint8   info;
2741
0
  uint8   xact_info;
2742
0
  uint8   rmid;
2743
0
  TimestampTz recordXtime = 0;
2744
2745
  /*
2746
   * Ignore recovery target settings when not in archive recovery (meaning
2747
   * we are in crash recovery).
2748
   */
2749
0
  if (!ArchiveRecoveryRequested)
2750
0
    return false;
2751
2752
0
  info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
2753
0
  rmid = XLogRecGetRmid(record);
2754
2755
  /*
2756
   * There can be many restore points that share the same name; we stop at
2757
   * the first one.
2758
   */
2759
0
  if (recoveryTarget == RECOVERY_TARGET_NAME &&
2760
0
    rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
2761
0
  {
2762
0
    xl_restore_point *recordRestorePointData;
2763
2764
0
    recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
2765
2766
0
    if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
2767
0
    {
2768
0
      recoveryStopAfter = true;
2769
0
      recoveryStopXid = InvalidTransactionId;
2770
0
      recoveryStopLSN = InvalidXLogRecPtr;
2771
0
      (void) getRecordTimestamp(record, &recoveryStopTime);
2772
0
      strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
2773
2774
0
      ereport(LOG,
2775
0
          (errmsg("recovery stopping at restore point \"%s\", time %s",
2776
0
              recoveryStopName,
2777
0
              timestamptz_to_str(recoveryStopTime))));
2778
0
      return true;
2779
0
    }
2780
0
  }
2781
2782
  /* Check if the target LSN has been reached */
2783
0
  if (recoveryTarget == RECOVERY_TARGET_LSN &&
2784
0
    recoveryTargetInclusive &&
2785
0
    record->ReadRecPtr >= recoveryTargetLSN)
2786
0
  {
2787
0
    recoveryStopAfter = true;
2788
0
    recoveryStopXid = InvalidTransactionId;
2789
0
    recoveryStopLSN = record->ReadRecPtr;
2790
0
    recoveryStopTime = 0;
2791
0
    recoveryStopName[0] = '\0';
2792
0
    ereport(LOG,
2793
0
        errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"",
2794
0
             LSN_FORMAT_ARGS(recoveryStopLSN)));
2795
0
    return true;
2796
0
  }
2797
2798
0
  if (rmid != RM_XACT_ID)
2799
0
    return false;
2800
2801
0
  xact_info = info & XLOG_XACT_OPMASK;
2802
2803
0
  if (xact_info == XLOG_XACT_COMMIT ||
2804
0
    xact_info == XLOG_XACT_COMMIT_PREPARED ||
2805
0
    xact_info == XLOG_XACT_ABORT ||
2806
0
    xact_info == XLOG_XACT_ABORT_PREPARED)
2807
0
  {
2808
0
    TransactionId recordXid;
2809
2810
    /* Update the last applied transaction timestamp */
2811
0
    if (getRecordTimestamp(record, &recordXtime))
2812
0
      SetLatestXTime(recordXtime);
2813
2814
    /* Extract the XID of the committed/aborted transaction */
2815
0
    if (xact_info == XLOG_XACT_COMMIT_PREPARED)
2816
0
    {
2817
0
      xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
2818
0
      xl_xact_parsed_commit parsed;
2819
2820
0
      ParseCommitRecord(XLogRecGetInfo(record),
2821
0
                xlrec,
2822
0
                &parsed);
2823
0
      recordXid = parsed.twophase_xid;
2824
0
    }
2825
0
    else if (xact_info == XLOG_XACT_ABORT_PREPARED)
2826
0
    {
2827
0
      xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
2828
0
      xl_xact_parsed_abort parsed;
2829
2830
0
      ParseAbortRecord(XLogRecGetInfo(record),
2831
0
               xlrec,
2832
0
               &parsed);
2833
0
      recordXid = parsed.twophase_xid;
2834
0
    }
2835
0
    else
2836
0
      recordXid = XLogRecGetXid(record);
2837
2838
    /*
2839
     * There can be only one transaction end record with this exact
2840
     * transactionid
2841
     *
2842
     * when testing for an xid, we MUST test for equality only, since
2843
     * transactions are numbered in the order they start, not the order
2844
     * they complete. A higher numbered xid will complete before you about
2845
     * 50% of the time...
2846
     */
2847
0
    if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
2848
0
      recordXid == recoveryTargetXid)
2849
0
    {
2850
0
      recoveryStopAfter = true;
2851
0
      recoveryStopXid = recordXid;
2852
0
      recoveryStopTime = recordXtime;
2853
0
      recoveryStopLSN = InvalidXLogRecPtr;
2854
0
      recoveryStopName[0] = '\0';
2855
2856
0
      if (xact_info == XLOG_XACT_COMMIT ||
2857
0
        xact_info == XLOG_XACT_COMMIT_PREPARED)
2858
0
      {
2859
0
        ereport(LOG,
2860
0
            (errmsg("recovery stopping after commit of transaction %u, time %s",
2861
0
                recoveryStopXid,
2862
0
                timestamptz_to_str(recoveryStopTime))));
2863
0
      }
2864
0
      else if (xact_info == XLOG_XACT_ABORT ||
2865
0
           xact_info == XLOG_XACT_ABORT_PREPARED)
2866
0
      {
2867
0
        ereport(LOG,
2868
0
            (errmsg("recovery stopping after abort of transaction %u, time %s",
2869
0
                recoveryStopXid,
2870
0
                timestamptz_to_str(recoveryStopTime))));
2871
0
      }
2872
0
      return true;
2873
0
    }
2874
0
  }
2875
2876
  /* Check if we should stop as soon as reaching consistency */
2877
0
  if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
2878
0
  {
2879
0
    ereport(LOG,
2880
0
        (errmsg("recovery stopping after reaching consistency")));
2881
2882
0
    recoveryStopAfter = true;
2883
0
    recoveryStopXid = InvalidTransactionId;
2884
0
    recoveryStopTime = 0;
2885
0
    recoveryStopLSN = InvalidXLogRecPtr;
2886
0
    recoveryStopName[0] = '\0';
2887
0
    return true;
2888
0
  }
2889
2890
0
  return false;
2891
0
}
2892
2893
/*
2894
 * Create a comment for the history file to explain why and where
2895
 * timeline changed.
2896
 */
2897
static char *
2898
getRecoveryStopReason(void)
2899
0
{
2900
0
  char    reason[200];
2901
2902
0
  if (recoveryTarget == RECOVERY_TARGET_XID)
2903
0
    snprintf(reason, sizeof(reason),
2904
0
         "%s transaction %u",
2905
0
         recoveryStopAfter ? "after" : "before",
2906
0
         recoveryStopXid);
2907
0
  else if (recoveryTarget == RECOVERY_TARGET_TIME)
2908
0
    snprintf(reason, sizeof(reason),
2909
0
         "%s %s\n",
2910
0
         recoveryStopAfter ? "after" : "before",
2911
0
         timestamptz_to_str(recoveryStopTime));
2912
0
  else if (recoveryTarget == RECOVERY_TARGET_LSN)
2913
0
    snprintf(reason, sizeof(reason),
2914
0
         "%s LSN %X/%08X\n",
2915
0
         recoveryStopAfter ? "after" : "before",
2916
0
         LSN_FORMAT_ARGS(recoveryStopLSN));
2917
0
  else if (recoveryTarget == RECOVERY_TARGET_NAME)
2918
0
    snprintf(reason, sizeof(reason),
2919
0
         "at restore point \"%s\"",
2920
0
         recoveryStopName);
2921
0
  else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
2922
0
    snprintf(reason, sizeof(reason), "reached consistency");
2923
0
  else
2924
0
    snprintf(reason, sizeof(reason), "no recovery target specified");
2925
2926
0
  return pstrdup(reason);
2927
0
}
2928
2929
/*
2930
 * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
2931
 *
2932
 * endOfRecovery is true if the recovery target is reached and
2933
 * the paused state starts at the end of recovery because of
2934
 * recovery_target_action=pause, and false otherwise.
2935
 */
2936
static void
2937
recoveryPausesHere(bool endOfRecovery)
2938
0
{
2939
  /* Don't pause unless users can connect! */
2940
0
  if (!LocalHotStandbyActive)
2941
0
    return;
2942
2943
  /* Don't pause after standby promotion has been triggered */
2944
0
  if (LocalPromoteIsTriggered)
2945
0
    return;
2946
2947
0
  if (endOfRecovery)
2948
0
    ereport(LOG,
2949
0
        (errmsg("pausing at the end of recovery"),
2950
0
         errhint("Execute pg_wal_replay_resume() to promote.")));
2951
0
  else
2952
0
    ereport(LOG,
2953
0
        (errmsg("recovery has paused"),
2954
0
         errhint("Execute pg_wal_replay_resume() to continue.")));
2955
2956
  /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
2957
0
  while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
2958
0
  {
2959
0
    ProcessStartupProcInterrupts();
2960
0
    if (CheckForStandbyTrigger())
2961
0
      return;
2962
2963
    /*
2964
     * If recovery pause is requested then set it paused.  While we are in
2965
     * the loop, user might resume and pause again so set this every time.
2966
     */
2967
0
    ConfirmRecoveryPaused();
2968
2969
    /*
2970
     * We wait on a condition variable that will wake us as soon as the
2971
     * pause ends, but we use a timeout so we can check the above exit
2972
     * condition periodically too.
2973
     */
2974
0
    ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
2975
0
                  WAIT_EVENT_RECOVERY_PAUSE);
2976
0
  }
2977
0
  ConditionVariableCancelSleep();
2978
0
}
2979
2980
/*
2981
 * When recovery_min_apply_delay is set, we wait long enough to make sure
2982
 * certain record types are applied at least that interval behind the primary.
2983
 *
2984
 * Returns true if we waited.
2985
 *
2986
 * Note that the delay is calculated between the WAL record log time and
2987
 * the current time on standby. We would prefer to keep track of when this
2988
 * standby received each WAL record, which would allow a more consistent
2989
 * approach and one not affected by time synchronisation issues, but that
2990
 * is significantly more effort and complexity for little actual gain in
2991
 * usability.
2992
 */
2993
static bool
2994
recoveryApplyDelay(XLogReaderState *record)
2995
0
{
2996
0
  uint8   xact_info;
2997
0
  TimestampTz xtime;
2998
0
  TimestampTz delayUntil;
2999
0
  long    msecs;
3000
3001
  /* nothing to do if no delay configured */
3002
0
  if (recovery_min_apply_delay <= 0)
3003
0
    return false;
3004
3005
  /* no delay is applied on a database not yet consistent */
3006
0
  if (!reachedConsistency)
3007
0
    return false;
3008
3009
  /* nothing to do if crash recovery is requested */
3010
0
  if (!ArchiveRecoveryRequested)
3011
0
    return false;
3012
3013
  /*
3014
   * Is it a COMMIT record?
3015
   *
3016
   * We deliberately choose not to delay aborts since they have no effect on
3017
   * MVCC. We already allow replay of records that don't have a timestamp,
3018
   * so there is already opportunity for issues caused by early conflicts on
3019
   * standbys.
3020
   */
3021
0
  if (XLogRecGetRmid(record) != RM_XACT_ID)
3022
0
    return false;
3023
3024
0
  xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
3025
3026
0
  if (xact_info != XLOG_XACT_COMMIT &&
3027
0
    xact_info != XLOG_XACT_COMMIT_PREPARED)
3028
0
    return false;
3029
3030
0
  if (!getRecordTimestamp(record, &xtime))
3031
0
    return false;
3032
3033
0
  delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3034
3035
  /*
3036
   * Exit without arming the latch if it's already past time to apply this
3037
   * record
3038
   */
3039
0
  msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
3040
0
  if (msecs <= 0)
3041
0
    return false;
3042
3043
0
  while (true)
3044
0
  {
3045
0
    ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3046
3047
    /* This might change recovery_min_apply_delay. */
3048
0
    ProcessStartupProcInterrupts();
3049
3050
0
    if (CheckForStandbyTrigger())
3051
0
      break;
3052
3053
    /*
3054
     * Recalculate delayUntil as recovery_min_apply_delay could have
3055
     * changed while waiting in this loop.
3056
     */
3057
0
    delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
3058
3059
    /*
3060
     * Wait for difference between GetCurrentTimestamp() and delayUntil.
3061
     */
3062
0
    msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
3063
0
                        delayUntil);
3064
3065
0
    if (msecs <= 0)
3066
0
      break;
3067
3068
0
    elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
3069
3070
0
    (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3071
0
             WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
3072
0
             msecs,
3073
0
             WAIT_EVENT_RECOVERY_APPLY_DELAY);
3074
0
  }
3075
0
  return true;
3076
0
}
3077
3078
/*
3079
 * Get the current state of the recovery pause request.
3080
 */
3081
RecoveryPauseState
3082
GetRecoveryPauseState(void)
3083
0
{
3084
0
  RecoveryPauseState state;
3085
3086
0
  SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3087
0
  state = XLogRecoveryCtl->recoveryPauseState;
3088
0
  SpinLockRelease(&XLogRecoveryCtl->info_lck);
3089
3090
0
  return state;
3091
0
}
3092
3093
/*
3094
 * Set the recovery pause state.
3095
 *
3096
 * If recovery pause is requested then sets the recovery pause state to
3097
 * 'pause requested' if it is not already 'paused'.  Otherwise, sets it
3098
 * to 'not paused' to resume the recovery.  The recovery pause will be
3099
 * confirmed by the ConfirmRecoveryPaused.
3100
 */
3101
void
3102
SetRecoveryPause(bool recoveryPause)
3103
0
{
3104
0
  SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3105
3106
0
  if (!recoveryPause)
3107
0
    XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
3108
0
  else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
3109
0
    XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
3110
3111
0
  SpinLockRelease(&XLogRecoveryCtl->info_lck);
3112
3113
0
  if (!recoveryPause)
3114
0
    ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV);
3115
0
}
3116
3117
/*
3118
 * Confirm the recovery pause by setting the recovery pause state to
3119
 * RECOVERY_PAUSED.
3120
 */
3121
static void
3122
ConfirmRecoveryPaused(void)
3123
0
{
3124
  /* If recovery pause is requested then set it paused */
3125
0
  SpinLockAcquire(&XLogRecoveryCtl->info_lck);
3126
0
  if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
3127
0
    XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED;
3128
0
  SpinLockRelease(&XLogRecoveryCtl->info_lck);
3129
0
}
3130
3131
3132
/*
3133
 * Attempt to read the next XLOG record.
3134
 *
3135
 * Before first call, the reader needs to be positioned to the first record
3136
 * by calling XLogPrefetcherBeginRead().
3137
 *
3138
 * If no valid record is available, returns NULL, or fails if emode is PANIC.
3139
 * (emode must be either PANIC, LOG). In standby mode, retries until a valid
3140
 * record is available.
3141
 */
3142
static XLogRecord *
3143
ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
3144
       bool fetching_ckpt, TimeLineID replayTLI)
3145
0
{
3146
0
  XLogRecord *record;
3147
0
  XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher);
3148
0
  XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
3149
3150
  /* Pass through parameters to XLogPageRead */
3151
0
  private->fetching_ckpt = fetching_ckpt;
3152
0
  private->emode = emode;
3153
0
  private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
3154
0
  private->replayTLI = replayTLI;
3155
3156
  /* This is the first attempt to read this page. */
3157
0
  lastSourceFailed = false;
3158
3159
0
  for (;;)
3160
0
  {
3161
0
    char     *errormsg;
3162
3163
0
    record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg);
3164
0
    if (record == NULL)
3165
0
    {
3166
      /*
3167
       * When we find that WAL ends in an incomplete record, keep track
3168
       * of that record.  After recovery is done, we'll write a record
3169
       * to indicate to downstream WAL readers that that portion is to
3170
       * be ignored.
3171
       *
3172
       * However, when ArchiveRecoveryRequested = true, we're going to
3173
       * switch to a new timeline at the end of recovery. We will only
3174
       * copy WAL over to the new timeline up to the end of the last
3175
       * complete record, so if we did this, we would later create an
3176
       * overwrite contrecord in the wrong place, breaking everything.
3177
       */
3178
0
      if (!ArchiveRecoveryRequested &&
3179
0
        !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
3180
0
      {
3181
0
        abortedRecPtr = xlogreader->abortedRecPtr;
3182
0
        missingContrecPtr = xlogreader->missingContrecPtr;
3183
0
      }
3184
3185
0
      if (readFile >= 0)
3186
0
      {
3187
0
        close(readFile);
3188
0
        readFile = -1;
3189
0
      }
3190
3191
      /*
3192
       * We only end up here without a message when XLogPageRead()
3193
       * failed - in that case we already logged something. In
3194
       * StandbyMode that only happens if we have been triggered, so we
3195
       * shouldn't loop anymore in that case.
3196
       */
3197
0
      if (errormsg)
3198
0
        ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3199
0
            (errmsg_internal("%s", errormsg) /* already translated */ ));
3200
0
    }
3201
3202
    /*
3203
     * Check page TLI is one of the expected values.
3204
     */
3205
0
    else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
3206
0
    {
3207
0
      char    fname[MAXFNAMELEN];
3208
0
      XLogSegNo segno;
3209
0
      int32   offset;
3210
3211
0
      XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
3212
0
      offset = XLogSegmentOffset(xlogreader->latestPagePtr,
3213
0
                     wal_segment_size);
3214
0
      XLogFileName(fname, xlogreader->seg.ws_tli, segno,
3215
0
             wal_segment_size);
3216
0
      ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3217
0
          errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u",
3218
0
               xlogreader->latestPageTLI,
3219
0
               fname,
3220
0
               LSN_FORMAT_ARGS(xlogreader->latestPagePtr),
3221
0
               offset));
3222
0
      record = NULL;
3223
0
    }
3224
3225
0
    if (record)
3226
0
    {
3227
      /* Great, got a record */
3228
0
      return record;
3229
0
    }
3230
0
    else
3231
0
    {
3232
      /* No valid record available from this source */
3233
0
      lastSourceFailed = true;
3234
3235
      /*
3236
       * If archive recovery was requested, but we were still doing
3237
       * crash recovery, switch to archive recovery and retry using the
3238
       * offline archive. We have now replayed all the valid WAL in
3239
       * pg_wal, so we are presumably now consistent.
3240
       *
3241
       * We require that there's at least some valid WAL present in
3242
       * pg_wal, however (!fetching_ckpt).  We could recover using the
3243
       * WAL from the archive, even if pg_wal is completely empty, but
3244
       * we'd have no idea how far we'd have to replay to reach
3245
       * consistency.  So err on the safe side and give up.
3246
       */
3247
0
      if (!InArchiveRecovery && ArchiveRecoveryRequested &&
3248
0
        !fetching_ckpt)
3249
0
      {
3250
0
        ereport(DEBUG1,
3251
0
            (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
3252
0
        InArchiveRecovery = true;
3253
0
        if (StandbyModeRequested)
3254
0
          EnableStandbyMode();
3255
3256
0
        SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI);
3257
0
        minRecoveryPoint = xlogreader->EndRecPtr;
3258
0
        minRecoveryPointTLI = replayTLI;
3259
3260
0
        CheckRecoveryConsistency();
3261
3262
        /*
3263
         * Before we retry, reset lastSourceFailed and currentSource
3264
         * so that we will check the archive next.
3265
         */
3266
0
        lastSourceFailed = false;
3267
0
        currentSource = XLOG_FROM_ANY;
3268
3269
0
        continue;
3270
0
      }
3271
3272
      /* In standby mode, loop back to retry. Otherwise, give up. */
3273
0
      if (StandbyMode && !CheckForStandbyTrigger())
3274
0
        continue;
3275
0
      else
3276
0
        return NULL;
3277
0
    }
3278
0
  }
3279
0
}
3280
3281
/*
3282
 * Read the XLOG page containing targetPagePtr into readBuf (if not read
3283
 * already).  Returns number of bytes read, if the page is read successfully,
3284
 * or XLREAD_FAIL in case of errors.  When errors occur, they are ereport'ed,
3285
 * but only if they have not been previously reported.
3286
 *
3287
 * See XLogReaderRoutine.page_read for more details.
3288
 *
3289
 * While prefetching, xlogreader->nonblocking may be set.  In that case,
3290
 * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL.
3291
 *
3292
 * This is responsible for restoring files from archive as needed, as well
3293
 * as for waiting for the requested WAL record to arrive in standby mode.
3294
 *
3295
 * xlogreader->private_data->emode specifies the log level used for reporting
3296
 * "file not found" or "end of WAL" situations in archive recovery, or in
3297
 * standby mode when promotion is triggered. If set to WARNING or below,
3298
 * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log
3299
 * levels the ereport() won't return.
3300
 *
3301
 * In standby mode, if after a successful return of XLogPageRead() the
3302
 * caller finds the record it's interested in to be broken, it should
3303
 * ereport the error with the level determined by
3304
 * emode_for_corrupt_record(), and then set lastSourceFailed
3305
 * and call XLogPageRead() again with the same arguments. This lets
3306
 * XLogPageRead() try fetching the record from another source, or to
3307
 * sleep and retry.
3308
 */
3309
static int
3310
XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
3311
       XLogRecPtr targetRecPtr, char *readBuf)
3312
0
{
3313
0
  XLogPageReadPrivate *private =
3314
0
    (XLogPageReadPrivate *) xlogreader->private_data;
3315
0
  int     emode = private->emode;
3316
0
  uint32    targetPageOff;
3317
0
  XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
3318
0
  int     r;
3319
0
  instr_time  io_start;
3320
3321
0
  XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
3322
0
  targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
3323
3324
  /*
3325
   * See if we need to switch to a new segment because the requested record
3326
   * is not in the currently open one.
3327
   */
3328
0
  if (readFile >= 0 &&
3329
0
    !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
3330
0
  {
3331
    /*
3332
     * Request a restartpoint if we've replayed too much xlog since the
3333
     * last one.
3334
     */
3335
0
    if (ArchiveRecoveryRequested && IsUnderPostmaster)
3336
0
    {
3337
0
      if (XLogCheckpointNeeded(readSegNo))
3338
0
      {
3339
0
        (void) GetRedoRecPtr();
3340
0
        if (XLogCheckpointNeeded(readSegNo))
3341
0
          RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
3342
0
      }
3343
0
    }
3344
3345
0
    close(readFile);
3346
0
    readFile = -1;
3347
0
    readSource = XLOG_FROM_ANY;
3348
0
  }
3349
3350
0
  XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
3351
3352
0
retry:
3353
  /* See if we need to retrieve more data */
3354
0
  if (readFile < 0 ||
3355
0
    (readSource == XLOG_FROM_STREAM &&
3356
0
     flushedUpto < targetPagePtr + reqLen))
3357
0
  {
3358
0
    if (readFile >= 0 &&
3359
0
      xlogreader->nonblocking &&
3360
0
      readSource == XLOG_FROM_STREAM &&
3361
0
      flushedUpto < targetPagePtr + reqLen)
3362
0
      return XLREAD_WOULDBLOCK;
3363
3364
0
    switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
3365
0
                      private->randAccess,
3366
0
                      private->fetching_ckpt,
3367
0
                      targetRecPtr,
3368
0
                      private->replayTLI,
3369
0
                      xlogreader->EndRecPtr,
3370
0
                      xlogreader->nonblocking))
3371
0
    {
3372
0
      case XLREAD_WOULDBLOCK:
3373
0
        return XLREAD_WOULDBLOCK;
3374
0
      case XLREAD_FAIL:
3375
0
        if (readFile >= 0)
3376
0
          close(readFile);
3377
0
        readFile = -1;
3378
0
        readLen = 0;
3379
0
        readSource = XLOG_FROM_ANY;
3380
0
        return XLREAD_FAIL;
3381
0
      case XLREAD_SUCCESS:
3382
0
        break;
3383
0
    }
3384
0
  }
3385
3386
  /*
3387
   * At this point, we have the right segment open and if we're streaming we
3388
   * know the requested record is in it.
3389
   */
3390
0
  Assert(readFile != -1);
3391
3392
  /*
3393
   * If the current segment is being streamed from the primary, calculate
3394
   * how much of the current page we have received already. We know the
3395
   * requested record has been received, but this is for the benefit of
3396
   * future calls, to allow quick exit at the top of this function.
3397
   */
3398
0
  if (readSource == XLOG_FROM_STREAM)
3399
0
  {
3400
0
    if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
3401
0
      readLen = XLOG_BLCKSZ;
3402
0
    else
3403
0
      readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
3404
0
        targetPageOff;
3405
0
  }
3406
0
  else
3407
0
    readLen = XLOG_BLCKSZ;
3408
3409
  /* Read the requested page */
3410
0
  readOff = targetPageOff;
3411
3412
  /* Measure I/O timing when reading segment */
3413
0
  io_start = pgstat_prepare_io_time(track_wal_io_timing);
3414
3415
0
  pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
3416
0
  r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
3417
0
  if (r != XLOG_BLCKSZ)
3418
0
  {
3419
0
    char    fname[MAXFNAMELEN];
3420
0
    int     save_errno = errno;
3421
3422
0
    pgstat_report_wait_end();
3423
3424
0
    pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3425
0
                io_start, 1, r);
3426
3427
0
    XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
3428
0
    if (r < 0)
3429
0
    {
3430
0
      errno = save_errno;
3431
0
      ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3432
0
          (errcode_for_file_access(),
3433
0
           errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m",
3434
0
              fname, LSN_FORMAT_ARGS(targetPagePtr),
3435
0
              readOff)));
3436
0
    }
3437
0
    else
3438
0
      ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
3439
0
          (errcode(ERRCODE_DATA_CORRUPTED),
3440
0
           errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu",
3441
0
              fname, LSN_FORMAT_ARGS(targetPagePtr),
3442
0
              readOff, r, (Size) XLOG_BLCKSZ)));
3443
0
    goto next_record_is_invalid;
3444
0
  }
3445
0
  pgstat_report_wait_end();
3446
3447
0
  pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ,
3448
0
              io_start, 1, r);
3449
3450
0
  Assert(targetSegNo == readSegNo);
3451
0
  Assert(targetPageOff == readOff);
3452
0
  Assert(reqLen <= readLen);
3453
3454
0
  xlogreader->seg.ws_tli = curFileTLI;
3455
3456
  /*
3457
   * Check the page header immediately, so that we can retry immediately if
3458
   * it's not valid. This may seem unnecessary, because ReadPageInternal()
3459
   * validates the page header anyway, and would propagate the failure up to
3460
   * ReadRecord(), which would retry. However, there's a corner case with
3461
   * continuation records, if a record is split across two pages such that
3462
   * we would need to read the two pages from different sources across two
3463
   * WAL segments.
3464
   *
3465
   * The first page is only available locally, in pg_wal, because it's
3466
   * already been recycled on the primary. The second page, however, is not
3467
   * present in pg_wal, and we should stream it from the primary. There is a
3468
   * recycled WAL segment present in pg_wal, with garbage contents, however.
3469
   * We would read the first page from the local WAL segment, but when
3470
   * reading the second page, we would read the bogus, recycled, WAL
3471
   * segment. If we didn't catch that case here, we would never recover,
3472
   * because ReadRecord() would retry reading the whole record from the
3473
   * beginning.
3474
   *
3475
   * Of course, this only catches errors in the page header, which is what
3476
   * happens in the case of a recycled WAL segment. Other kinds of errors or
3477
   * corruption still has the same problem. But this at least fixes the
3478
   * common case, which can happen as part of normal operation.
3479
   *
3480
   * Validating the page header is cheap enough that doing it twice
3481
   * shouldn't be a big deal from a performance point of view.
3482
   *
3483
   * When not in standby mode, an invalid page header should cause recovery
3484
   * to end, not retry reading the page, so we don't need to validate the
3485
   * page header here for the retry. Instead, ReadPageInternal() is
3486
   * responsible for the validation.
3487
   */
3488
0
  if (StandbyMode &&
3489
0
    (targetPagePtr % wal_segment_size) == 0 &&
3490
0
    !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
3491
0
  {
3492
    /*
3493
     * Emit this error right now then retry this page immediately. Use
3494
     * errmsg_internal() because the message was already translated.
3495
     */
3496
0
    if (xlogreader->errormsg_buf[0])
3497
0
      ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr),
3498
0
          (errmsg_internal("%s", xlogreader->errormsg_buf)));
3499
3500
    /* reset any error XLogReaderValidatePageHeader() might have set */
3501
0
    XLogReaderResetError(xlogreader);
3502
0
    goto next_record_is_invalid;
3503
0
  }
3504
3505
0
  return readLen;
3506
3507
0
next_record_is_invalid:
3508
3509
  /*
3510
   * If we're reading ahead, give up fast.  Retries and error reporting will
3511
   * be handled by a later read when recovery catches up to this point.
3512
   */
3513
0
  if (xlogreader->nonblocking)
3514
0
    return XLREAD_WOULDBLOCK;
3515
3516
0
  lastSourceFailed = true;
3517
3518
0
  if (readFile >= 0)
3519
0
    close(readFile);
3520
0
  readFile = -1;
3521
0
  readLen = 0;
3522
0
  readSource = XLOG_FROM_ANY;
3523
3524
  /* In standby-mode, keep trying */
3525
0
  if (StandbyMode)
3526
0
    goto retry;
3527
0
  else
3528
0
    return XLREAD_FAIL;
3529
0
}
3530
3531
/*
3532
 * Open the WAL segment containing WAL location 'RecPtr'.
3533
 *
3534
 * The segment can be fetched via restore_command, or via walreceiver having
3535
 * streamed the record, or it can already be present in pg_wal. Checking
3536
 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
3537
 * too, in case someone copies a new segment directly to pg_wal. That is not
3538
 * documented or recommended, though.
3539
 *
3540
 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
3541
 * prepare to read WAL starting from RedoStartLSN after this.
3542
 *
3543
 * 'RecPtr' might not point to the beginning of the record we're interested
3544
 * in, it might also point to the page or segment header. In that case,
3545
 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
3546
 * used to decide which timeline to stream the requested WAL from.
3547
 *
3548
 * 'replayLSN' is the current replay LSN, so that if we scan for new
3549
 * timelines, we can reject a switch to a timeline that branched off before
3550
 * this point.
3551
 *
3552
 * If the record is not immediately available, the function returns XLREAD_FAIL
3553
 * if we're not in standby mode. In standby mode, waits for it to become
3554
 * available.
3555
 *
3556
 * When the requested record becomes available, the function opens the file
3557
 * containing it (if not open already), and returns XLREAD_SUCCESS. When end
3558
 * of standby mode is triggered by the user, and there is no more WAL
3559
 * available, returns XLREAD_FAIL.
3560
 *
3561
 * If nonblocking is true, then give up immediately if we can't satisfy the
3562
 * request, returning XLREAD_WOULDBLOCK instead of waiting.
3563
 */
3564
static XLogPageReadResult
3565
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
3566
              bool fetching_ckpt, XLogRecPtr tliRecPtr,
3567
              TimeLineID replayTLI, XLogRecPtr replayLSN,
3568
              bool nonblocking)
3569
0
{
3570
0
  static TimestampTz last_fail_time = 0;
3571
0
  TimestampTz now;
3572
0
  bool    streaming_reply_sent = false;
3573
3574
  /*-------
3575
   * Standby mode is implemented by a state machine:
3576
   *
3577
   * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
3578
   *    pg_wal (XLOG_FROM_PG_WAL)
3579
   * 2. Check for promotion trigger request
3580
   * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
3581
   * 4. Rescan timelines
3582
   * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
3583
   *
3584
   * Failure to read from the current source advances the state machine to
3585
   * the next state.
3586
   *
3587
   * 'currentSource' indicates the current state. There are no currentSource
3588
   * values for "check trigger", "rescan timelines", and "sleep" states,
3589
   * those actions are taken when reading from the previous source fails, as
3590
   * part of advancing to the next state.
3591
   *
3592
   * If standby mode is turned off while reading WAL from stream, we move
3593
   * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
3594
   * the files (which would be required at end of recovery, e.g., timeline
3595
   * history file) from archive or pg_wal. We don't need to kill WAL receiver
3596
   * here because it's already stopped when standby mode is turned off at
3597
   * the end of recovery.
3598
   *-------
3599
   */
3600
0
  if (!InArchiveRecovery)
3601
0
    currentSource = XLOG_FROM_PG_WAL;
3602
0
  else if (currentSource == XLOG_FROM_ANY ||
3603
0
       (!StandbyMode && currentSource == XLOG_FROM_STREAM))
3604
0
  {
3605
0
    lastSourceFailed = false;
3606
0
    currentSource = XLOG_FROM_ARCHIVE;
3607
0
  }
3608
3609
0
  for (;;)
3610
0
  {
3611
0
    XLogSource  oldSource = currentSource;
3612
0
    bool    startWalReceiver = false;
3613
3614
    /*
3615
     * First check if we failed to read from the current source, and
3616
     * advance the state machine if so. The failure to read might've
3617
     * happened outside this function, e.g when a CRC check fails on a
3618
     * record, or within this loop.
3619
     */
3620
0
    if (lastSourceFailed)
3621
0
    {
3622
      /*
3623
       * Don't allow any retry loops to occur during nonblocking
3624
       * readahead.  Let the caller process everything that has been
3625
       * decoded already first.
3626
       */
3627
0
      if (nonblocking)
3628
0
        return XLREAD_WOULDBLOCK;
3629
3630
0
      switch (currentSource)
3631
0
      {
3632
0
        case XLOG_FROM_ARCHIVE:
3633
0
        case XLOG_FROM_PG_WAL:
3634
3635
          /*
3636
           * Check to see if promotion is requested. Note that we do
3637
           * this only after failure, so when you promote, we still
3638
           * finish replaying as much as we can from archive and
3639
           * pg_wal before failover.
3640
           */
3641
0
          if (StandbyMode && CheckForStandbyTrigger())
3642
0
          {
3643
0
            XLogShutdownWalRcv();
3644
0
            return XLREAD_FAIL;
3645
0
          }
3646
3647
          /*
3648
           * Not in standby mode, and we've now tried the archive
3649
           * and pg_wal.
3650
           */
3651
0
          if (!StandbyMode)
3652
0
            return XLREAD_FAIL;
3653
3654
          /*
3655
           * Move to XLOG_FROM_STREAM state, and set to start a
3656
           * walreceiver if necessary.
3657
           */
3658
0
          currentSource = XLOG_FROM_STREAM;
3659
0
          startWalReceiver = true;
3660
0
          break;
3661
3662
0
        case XLOG_FROM_STREAM:
3663
3664
          /*
3665
           * Failure while streaming. Most likely, we got here
3666
           * because streaming replication was terminated, or
3667
           * promotion was triggered. But we also get here if we
3668
           * find an invalid record in the WAL streamed from the
3669
           * primary, in which case something is seriously wrong.
3670
           * There's little chance that the problem will just go
3671
           * away, but PANIC is not good for availability either,
3672
           * especially in hot standby mode. So, we treat that the
3673
           * same as disconnection, and retry from archive/pg_wal
3674
           * again. The WAL in the archive should be identical to
3675
           * what was streamed, so it's unlikely that it helps, but
3676
           * one can hope...
3677
           */
3678
3679
          /*
3680
           * We should be able to move to XLOG_FROM_STREAM only in
3681
           * standby mode.
3682
           */
3683
0
          Assert(StandbyMode);
3684
3685
          /*
3686
           * Before we leave XLOG_FROM_STREAM state, make sure that
3687
           * walreceiver is not active, so that it won't overwrite
3688
           * WAL that we restore from archive.
3689
           */
3690
0
          XLogShutdownWalRcv();
3691
3692
          /*
3693
           * Before we sleep, re-scan for possible new timelines if
3694
           * we were requested to recover to the latest timeline.
3695
           */
3696
0
          if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
3697
0
          {
3698
0
            if (rescanLatestTimeLine(replayTLI, replayLSN))
3699
0
            {
3700
0
              currentSource = XLOG_FROM_ARCHIVE;
3701
0
              break;
3702
0
            }
3703
0
          }
3704
3705
          /*
3706
           * XLOG_FROM_STREAM is the last state in our state
3707
           * machine, so we've exhausted all the options for
3708
           * obtaining the requested WAL. We're going to loop back
3709
           * and retry from the archive, but if it hasn't been long
3710
           * since last attempt, sleep wal_retrieve_retry_interval
3711
           * milliseconds to avoid busy-waiting.
3712
           */
3713
0
          now = GetCurrentTimestamp();
3714
0
          if (!TimestampDifferenceExceeds(last_fail_time, now,
3715
0
                          wal_retrieve_retry_interval))
3716
0
          {
3717
0
            long    wait_time;
3718
3719
0
            wait_time = wal_retrieve_retry_interval -
3720
0
              TimestampDifferenceMilliseconds(last_fail_time, now);
3721
3722
0
            elog(LOG, "waiting for WAL to become available at %X/%08X",
3723
0
               LSN_FORMAT_ARGS(RecPtr));
3724
3725
            /* Do background tasks that might benefit us later. */
3726
0
            KnownAssignedTransactionIdsIdleMaintenance();
3727
3728
0
            (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
3729
0
                     WL_LATCH_SET | WL_TIMEOUT |
3730
0
                     WL_EXIT_ON_PM_DEATH,
3731
0
                     wait_time,
3732
0
                     WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
3733
0
            ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
3734
0
            now = GetCurrentTimestamp();
3735
3736
            /* Handle interrupt signals of startup process */
3737
0
            ProcessStartupProcInterrupts();
3738
0
          }
3739
0
          last_fail_time = now;
3740
0
          currentSource = XLOG_FROM_ARCHIVE;
3741
0
          break;
3742
3743
0
        default:
3744
0
          elog(ERROR, "unexpected WAL source %d", currentSource);
3745
0
      }
3746
0
    }
3747
0
    else if (currentSource == XLOG_FROM_PG_WAL)
3748
0
    {
3749
      /*
3750
       * We just successfully read a file in pg_wal. We prefer files in
3751
       * the archive over ones in pg_wal, so try the next file again
3752
       * from the archive first.
3753
       */
3754
0
      if (InArchiveRecovery)
3755
0
        currentSource = XLOG_FROM_ARCHIVE;
3756
0
    }
3757
3758
0
    if (currentSource != oldSource)
3759
0
      elog(DEBUG2, "switched WAL source from %s to %s after %s",
3760
0
         xlogSourceNames[oldSource], xlogSourceNames[currentSource],
3761
0
         lastSourceFailed ? "failure" : "success");
3762
3763
    /*
3764
     * We've now handled possible failure. Try to read from the chosen
3765
     * source.
3766
     */
3767
0
    lastSourceFailed = false;
3768
3769
0
    switch (currentSource)
3770
0
    {
3771
0
      case XLOG_FROM_ARCHIVE:
3772
0
      case XLOG_FROM_PG_WAL:
3773
3774
        /*
3775
         * WAL receiver must not be running when reading WAL from
3776
         * archive or pg_wal.
3777
         */
3778
0
        Assert(!WalRcvStreaming());
3779
3780
        /* Close any old file we might have open. */
3781
0
        if (readFile >= 0)
3782
0
        {
3783
0
          close(readFile);
3784
0
          readFile = -1;
3785
0
        }
3786
        /* Reset curFileTLI if random fetch. */
3787
0
        if (randAccess)
3788
0
          curFileTLI = 0;
3789
3790
        /*
3791
         * Try to restore the file from archive, or read an existing
3792
         * file from pg_wal.
3793
         */
3794
0
        readFile = XLogFileReadAnyTLI(readSegNo,
3795
0
                        currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
3796
0
                        currentSource);
3797
0
        if (readFile >= 0)
3798
0
          return XLREAD_SUCCESS; /* success! */
3799
3800
        /*
3801
         * Nope, not found in archive or pg_wal.
3802
         */
3803
0
        lastSourceFailed = true;
3804
0
        break;
3805
3806
0
      case XLOG_FROM_STREAM:
3807
0
        {
3808
0
          bool    havedata;
3809
3810
          /*
3811
           * We should be able to move to XLOG_FROM_STREAM only in
3812
           * standby mode.
3813
           */
3814
0
          Assert(StandbyMode);
3815
3816
          /*
3817
           * First, shutdown walreceiver if its restart has been
3818
           * requested -- but no point if we're already slated for
3819
           * starting it.
3820
           */
3821
0
          if (pendingWalRcvRestart && !startWalReceiver)
3822
0
          {
3823
0
            XLogShutdownWalRcv();
3824
3825
            /*
3826
             * Re-scan for possible new timelines if we were
3827
             * requested to recover to the latest timeline.
3828
             */
3829
0
            if (recoveryTargetTimeLineGoal ==
3830
0
              RECOVERY_TARGET_TIMELINE_LATEST)
3831
0
              rescanLatestTimeLine(replayTLI, replayLSN);
3832
3833
0
            startWalReceiver = true;
3834
0
          }
3835
0
          pendingWalRcvRestart = false;
3836
3837
          /*
3838
           * Launch walreceiver if needed.
3839
           *
3840
           * If fetching_ckpt is true, RecPtr points to the initial
3841
           * checkpoint location. In that case, we use RedoStartLSN
3842
           * as the streaming start position instead of RecPtr, so
3843
           * that when we later jump backwards to start redo at
3844
           * RedoStartLSN, we will have the logs streamed already.
3845
           */
3846
0
          if (startWalReceiver &&
3847
0
            PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
3848
0
          {
3849
0
            XLogRecPtr  ptr;
3850
0
            TimeLineID  tli;
3851
3852
0
            if (fetching_ckpt)
3853
0
            {
3854
0
              ptr = RedoStartLSN;
3855
0
              tli = RedoStartTLI;
3856
0
            }
3857
0
            else
3858
0
            {
3859
0
              ptr = RecPtr;
3860
3861
              /*
3862
               * Use the record begin position to determine the
3863
               * TLI, rather than the position we're reading.
3864
               */
3865
0
              tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
3866
3867
0
              if (curFileTLI > 0 && tli < curFileTLI)
3868
0
                elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
3869
0
                   LSN_FORMAT_ARGS(tliRecPtr),
3870
0
                   tli, curFileTLI);
3871
0
            }
3872
0
            curFileTLI = tli;
3873
0
            SetInstallXLogFileSegmentActive();
3874
0
            RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
3875
0
                       PrimarySlotName,
3876
0
                       wal_receiver_create_temp_slot);
3877
0
            flushedUpto = 0;
3878
0
          }
3879
3880
          /*
3881
           * Check if WAL receiver is active or wait to start up.
3882
           */
3883
0
          if (!WalRcvStreaming())
3884
0
          {
3885
0
            lastSourceFailed = true;
3886
0
            break;
3887
0
          }
3888
3889
          /*
3890
           * Walreceiver is active, so see if new data has arrived.
3891
           *
3892
           * We only advance XLogReceiptTime when we obtain fresh
3893
           * WAL from walreceiver and observe that we had already
3894
           * processed everything before the most recent "chunk"
3895
           * that it flushed to disk.  In steady state where we are
3896
           * keeping up with the incoming data, XLogReceiptTime will
3897
           * be updated on each cycle. When we are behind,
3898
           * XLogReceiptTime will not advance, so the grace time
3899
           * allotted to conflicting queries will decrease.
3900
           */
3901
0
          if (RecPtr < flushedUpto)
3902
0
            havedata = true;
3903
0
          else
3904
0
          {
3905
0
            XLogRecPtr  latestChunkStart;
3906
3907
0
            flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
3908
0
            if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
3909
0
            {
3910
0
              havedata = true;
3911
0
              if (latestChunkStart <= RecPtr)
3912
0
              {
3913
0
                XLogReceiptTime = GetCurrentTimestamp();
3914
0
                SetCurrentChunkStartTime(XLogReceiptTime);
3915
0
              }
3916
0
            }
3917
0
            else
3918
0
              havedata = false;
3919
0
          }
3920
0
          if (havedata)
3921
0
          {
3922
            /*
3923
             * Great, streamed far enough.  Open the file if it's
3924
             * not open already.  Also read the timeline history
3925
             * file if we haven't initialized timeline history
3926
             * yet; it should be streamed over and present in
3927
             * pg_wal by now.  Use XLOG_FROM_STREAM so that source
3928
             * info is set correctly and XLogReceiptTime isn't
3929
             * changed.
3930
             *
3931
             * NB: We must set readTimeLineHistory based on
3932
             * recoveryTargetTLI, not receiveTLI. Normally they'll
3933
             * be the same, but if recovery_target_timeline is
3934
             * 'latest' and archiving is configured, then it's
3935
             * possible that we managed to retrieve one or more
3936
             * new timeline history files from the archive,
3937
             * updating recoveryTargetTLI.
3938
             */
3939
0
            if (readFile < 0)
3940
0
            {
3941
0
              if (!expectedTLEs)
3942
0
                expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
3943
0
              readFile = XLogFileRead(readSegNo, receiveTLI,
3944
0
                          XLOG_FROM_STREAM, false);
3945
0
              Assert(readFile >= 0);
3946
0
            }
3947
0
            else
3948
0
            {
3949
              /* just make sure source info is correct... */
3950
0
              readSource = XLOG_FROM_STREAM;
3951
0
              XLogReceiptSource = XLOG_FROM_STREAM;
3952
0
              return XLREAD_SUCCESS;
3953
0
            }
3954
0
            break;
3955
0
          }
3956
3957
          /* In nonblocking mode, return rather than sleeping. */
3958
0
          if (nonblocking)
3959
0
            return XLREAD_WOULDBLOCK;
3960
3961
          /*
3962
           * Data not here yet. Check for trigger, then wait for
3963
           * walreceiver to wake us up when new WAL arrives.
3964
           */
3965
0
          if (CheckForStandbyTrigger())
3966
0
          {
3967
            /*
3968
             * Note that we don't return XLREAD_FAIL immediately
3969
             * here. After being triggered, we still want to
3970
             * replay all the WAL that was already streamed. It's
3971
             * in pg_wal now, so we just treat this as a failure,
3972
             * and the state machine will move on to replay the
3973
             * streamed WAL from pg_wal, and then recheck the
3974
             * trigger and exit replay.
3975
             */
3976
0
            lastSourceFailed = true;
3977
0
            break;
3978
0
          }
3979
3980
          /*
3981
           * Since we have replayed everything we have received so
3982
           * far and are about to start waiting for more WAL, let's
3983
           * tell the upstream server our replay location now so
3984
           * that pg_stat_replication doesn't show stale
3985
           * information.
3986
           */
3987
0
          if (!streaming_reply_sent)
3988
0
          {
3989
0
            WalRcvForceReply();
3990
0
            streaming_reply_sent = true;
3991
0
          }
3992
3993
          /* Do any background tasks that might benefit us later. */
3994
0
          KnownAssignedTransactionIdsIdleMaintenance();
3995
3996
          /* Update pg_stat_recovery_prefetch before sleeping. */
3997
0
          XLogPrefetcherComputeStats(xlogprefetcher);
3998
3999
          /*
4000
           * Wait for more WAL to arrive, when we will be woken
4001
           * immediately by the WAL receiver.
4002
           */
4003
0
          (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch,
4004
0
                   WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
4005
0
                   -1L,
4006
0
                   WAIT_EVENT_RECOVERY_WAL_STREAM);
4007
0
          ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4008
0
          break;
4009
0
        }
4010
4011
0
      default:
4012
0
        elog(ERROR, "unexpected WAL source %d", currentSource);
4013
0
    }
4014
4015
    /*
4016
     * Check for recovery pause here so that we can confirm more quickly
4017
     * that a requested pause has actually taken effect.
4018
     */
4019
0
    if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState !=
4020
0
      RECOVERY_NOT_PAUSED)
4021
0
      recoveryPausesHere(false);
4022
4023
    /*
4024
     * This possibly-long loop needs to handle interrupts of startup
4025
     * process.
4026
     */
4027
0
    ProcessStartupProcInterrupts();
4028
0
  }
4029
4030
0
  return XLREAD_FAIL;     /* not reached */
4031
0
}
4032
4033
4034
/*
4035
 * Determine what log level should be used to report a corrupt WAL record
4036
 * in the current WAL page, previously read by XLogPageRead().
4037
 *
4038
 * 'emode' is the error mode that would be used to report a file-not-found
4039
 * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
4040
 * we're retrying the exact same record that we've tried previously, only
4041
 * complain the first time to keep the noise down.  However, we only do so when
4042
 * reading from pg_wal, because we don't expect any invalid records in archive
4043
 * or in records streamed from the primary. Files in the archive should be complete,
4044
 * and we should never hit the end of WAL because we stop and wait for more WAL
4045
 * to arrive before replaying it.
4046
 *
4047
 * NOTE: This function remembers the RecPtr value it was last called with,
4048
 * to suppress repeated messages about the same record. Only call this when
4049
 * you are about to ereport(), or you might cause a later message to be
4050
 * erroneously suppressed.
4051
 */
4052
static int
4053
emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
4054
0
{
4055
0
  static XLogRecPtr lastComplaint = 0;
4056
4057
0
  if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
4058
0
  {
4059
0
    if (RecPtr == lastComplaint)
4060
0
      emode = DEBUG1;
4061
0
    else
4062
0
      lastComplaint = RecPtr;
4063
0
  }
4064
0
  return emode;
4065
0
}
4066
4067
4068
/*
4069
 * Subroutine to try to fetch and validate a prior checkpoint record.
4070
 */
4071
static XLogRecord *
4072
ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
4073
           TimeLineID replayTLI)
4074
0
{
4075
0
  XLogRecord *record;
4076
0
  uint8   info;
4077
4078
0
  Assert(xlogreader != NULL);
4079
4080
0
  if (!XRecOffIsValid(RecPtr))
4081
0
  {
4082
0
    ereport(LOG,
4083
0
        (errmsg("invalid checkpoint location")));
4084
0
    return NULL;
4085
0
  }
4086
4087
0
  XLogPrefetcherBeginRead(xlogprefetcher, RecPtr);
4088
0
  record = ReadRecord(xlogprefetcher, LOG, true, replayTLI);
4089
4090
0
  if (record == NULL)
4091
0
  {
4092
0
    ereport(LOG,
4093
0
        (errmsg("invalid checkpoint record")));
4094
0
    return NULL;
4095
0
  }
4096
0
  if (record->xl_rmid != RM_XLOG_ID)
4097
0
  {
4098
0
    ereport(LOG,
4099
0
        (errmsg("invalid resource manager ID in checkpoint record")));
4100
0
    return NULL;
4101
0
  }
4102
0
  info = record->xl_info & ~XLR_INFO_MASK;
4103
0
  if (info != XLOG_CHECKPOINT_SHUTDOWN &&
4104
0
    info != XLOG_CHECKPOINT_ONLINE)
4105
0
  {
4106
0
    ereport(LOG,
4107
0
        (errmsg("invalid xl_info in checkpoint record")));
4108
0
    return NULL;
4109
0
  }
4110
0
  if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
4111
0
  {
4112
0
    ereport(LOG,
4113
0
        (errmsg("invalid length of checkpoint record")));
4114
0
    return NULL;
4115
0
  }
4116
0
  return record;
4117
0
}
4118
4119
/*
4120
 * Scan for new timelines that might have appeared in the archive since we
4121
 * started recovery.
4122
 *
4123
 * If there are any, the function changes recovery target TLI to the latest
4124
 * one and returns 'true'.
4125
 */
4126
static bool
4127
rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
4128
0
{
4129
0
  List     *newExpectedTLEs;
4130
0
  bool    found;
4131
0
  ListCell   *cell;
4132
0
  TimeLineID  newtarget;
4133
0
  TimeLineID  oldtarget = recoveryTargetTLI;
4134
0
  TimeLineHistoryEntry *currentTle = NULL;
4135
4136
0
  newtarget = findNewestTimeLine(recoveryTargetTLI);
4137
0
  if (newtarget == recoveryTargetTLI)
4138
0
  {
4139
    /* No new timelines found */
4140
0
    return false;
4141
0
  }
4142
4143
  /*
4144
   * Determine the list of expected TLIs for the new TLI
4145
   */
4146
4147
0
  newExpectedTLEs = readTimeLineHistory(newtarget);
4148
4149
  /*
4150
   * If the current timeline is not part of the history of the new timeline,
4151
   * we cannot proceed to it.
4152
   */
4153
0
  found = false;
4154
0
  foreach(cell, newExpectedTLEs)
4155
0
  {
4156
0
    currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4157
4158
0
    if (currentTle->tli == recoveryTargetTLI)
4159
0
    {
4160
0
      found = true;
4161
0
      break;
4162
0
    }
4163
0
  }
4164
0
  if (!found)
4165
0
  {
4166
0
    ereport(LOG,
4167
0
        (errmsg("new timeline %u is not a child of database system timeline %u",
4168
0
            newtarget,
4169
0
            replayTLI)));
4170
0
    return false;
4171
0
  }
4172
4173
  /*
4174
   * The current timeline was found in the history file, but check that the
4175
   * next timeline was forked off from it *after* the current recovery
4176
   * location.
4177
   */
4178
0
  if (currentTle->end < replayLSN)
4179
0
  {
4180
0
    ereport(LOG,
4181
0
        errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X",
4182
0
             newtarget,
4183
0
             replayTLI,
4184
0
             LSN_FORMAT_ARGS(replayLSN)));
4185
0
    return false;
4186
0
  }
4187
4188
  /* The new timeline history seems valid. Switch target */
4189
0
  recoveryTargetTLI = newtarget;
4190
0
  list_free_deep(expectedTLEs);
4191
0
  expectedTLEs = newExpectedTLEs;
4192
4193
  /*
4194
   * As in StartupXLOG(), try to ensure we have all the history files
4195
   * between the old target and new target in pg_wal.
4196
   */
4197
0
  restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4198
4199
0
  ereport(LOG,
4200
0
      (errmsg("new target timeline is %u",
4201
0
          recoveryTargetTLI)));
4202
4203
0
  return true;
4204
0
}
4205
4206
4207
/*
4208
 * Open a logfile segment for reading (during recovery).
4209
 *
4210
 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
4211
 * Otherwise, it's assumed to be already available in pg_wal.
4212
 */
4213
static int
4214
XLogFileRead(XLogSegNo segno, TimeLineID tli,
4215
       XLogSource source, bool notfoundOk)
4216
0
{
4217
0
  char    xlogfname[MAXFNAMELEN];
4218
0
  char    activitymsg[MAXFNAMELEN + 16];
4219
0
  char    path[MAXPGPATH];
4220
0
  int     fd;
4221
4222
0
  XLogFileName(xlogfname, tli, segno, wal_segment_size);
4223
4224
0
  switch (source)
4225
0
  {
4226
0
    case XLOG_FROM_ARCHIVE:
4227
      /* Report recovery progress in PS display */
4228
0
      snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
4229
0
           xlogfname);
4230
0
      set_ps_display(activitymsg);
4231
4232
0
      if (!RestoreArchivedFile(path, xlogfname,
4233
0
                   "RECOVERYXLOG",
4234
0
                   wal_segment_size,
4235
0
                   InRedo))
4236
0
        return -1;
4237
0
      break;
4238
4239
0
    case XLOG_FROM_PG_WAL:
4240
0
    case XLOG_FROM_STREAM:
4241
0
      XLogFilePath(path, tli, segno, wal_segment_size);
4242
0
      break;
4243
4244
0
    default:
4245
0
      elog(ERROR, "invalid XLogFileRead source %d", source);
4246
0
  }
4247
4248
  /*
4249
   * If the segment was fetched from archival storage, replace the existing
4250
   * xlog segment (if any) with the archival version.
4251
   */
4252
0
  if (source == XLOG_FROM_ARCHIVE)
4253
0
  {
4254
0
    Assert(!IsInstallXLogFileSegmentActive());
4255
0
    KeepFileRestoredFromArchive(path, xlogfname);
4256
4257
    /*
4258
     * Set path to point at the new file in pg_wal.
4259
     */
4260
0
    snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
4261
0
  }
4262
4263
0
  fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
4264
0
  if (fd >= 0)
4265
0
  {
4266
    /* Success! */
4267
0
    curFileTLI = tli;
4268
4269
    /* Report recovery progress in PS display */
4270
0
    snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
4271
0
         xlogfname);
4272
0
    set_ps_display(activitymsg);
4273
4274
    /* Track source of data in assorted state variables */
4275
0
    readSource = source;
4276
0
    XLogReceiptSource = source;
4277
    /* In FROM_STREAM case, caller tracks receipt time, not me */
4278
0
    if (source != XLOG_FROM_STREAM)
4279
0
      XLogReceiptTime = GetCurrentTimestamp();
4280
4281
0
    return fd;
4282
0
  }
4283
0
  if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
4284
0
    ereport(PANIC,
4285
0
        (errcode_for_file_access(),
4286
0
         errmsg("could not open file \"%s\": %m", path)));
4287
0
  return -1;
4288
0
}
4289
4290
/*
4291
 * Open a logfile segment for reading (during recovery).
4292
 *
4293
 * This version searches for the segment with any TLI listed in expectedTLEs.
4294
 */
4295
static int
4296
XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source)
4297
0
{
4298
0
  char    path[MAXPGPATH];
4299
0
  ListCell   *cell;
4300
0
  int     fd;
4301
0
  List     *tles;
4302
4303
  /*
4304
   * Loop looking for a suitable timeline ID: we might need to read any of
4305
   * the timelines listed in expectedTLEs.
4306
   *
4307
   * We expect curFileTLI on entry to be the TLI of the preceding file in
4308
   * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
4309
   * to go backwards; this prevents us from picking up the wrong file when a
4310
   * parent timeline extends to higher segment numbers than the child we
4311
   * want to read.
4312
   *
4313
   * If we haven't read the timeline history file yet, read it now, so that
4314
   * we know which TLIs to scan.  We don't save the list in expectedTLEs,
4315
   * however, unless we actually find a valid segment.  That way if there is
4316
   * neither a timeline history file nor a WAL segment in the archive, and
4317
   * streaming replication is set up, we'll read the timeline history file
4318
   * streamed from the primary when we start streaming, instead of
4319
   * recovering with a dummy history generated here.
4320
   */
4321
0
  if (expectedTLEs)
4322
0
    tles = expectedTLEs;
4323
0
  else
4324
0
    tles = readTimeLineHistory(recoveryTargetTLI);
4325
4326
0
  foreach(cell, tles)
4327
0
  {
4328
0
    TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
4329
0
    TimeLineID  tli = hent->tli;
4330
4331
0
    if (tli < curFileTLI)
4332
0
      break;       /* don't bother looking at too-old TLIs */
4333
4334
    /*
4335
     * Skip scanning the timeline ID that the logfile segment to read
4336
     * doesn't belong to
4337
     */
4338
0
    if (hent->begin != InvalidXLogRecPtr)
4339
0
    {
4340
0
      XLogSegNo beginseg = 0;
4341
4342
0
      XLByteToSeg(hent->begin, beginseg, wal_segment_size);
4343
4344
      /*
4345
       * The logfile segment that doesn't belong to the timeline is
4346
       * older or newer than the segment that the timeline started or
4347
       * ended at, respectively. It's sufficient to check only the
4348
       * starting segment of the timeline here. Since the timelines are
4349
       * scanned in descending order in this loop, any segments newer
4350
       * than the ending segment should belong to newer timeline and
4351
       * have already been read before. So it's not necessary to check
4352
       * the ending segment of the timeline here.
4353
       */
4354
0
      if (segno < beginseg)
4355
0
        continue;
4356
0
    }
4357
4358
0
    if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
4359
0
    {
4360
0
      fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true);
4361
0
      if (fd != -1)
4362
0
      {
4363
0
        elog(DEBUG1, "got WAL segment from archive");
4364
0
        if (!expectedTLEs)
4365
0
          expectedTLEs = tles;
4366
0
        return fd;
4367
0
      }
4368
0
    }
4369
4370
0
    if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
4371
0
    {
4372
0
      fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true);
4373
0
      if (fd != -1)
4374
0
      {
4375
0
        if (!expectedTLEs)
4376
0
          expectedTLEs = tles;
4377
0
        return fd;
4378
0
      }
4379
0
    }
4380
0
  }
4381
4382
  /* Couldn't find it.  For simplicity, complain about front timeline */
4383
0
  XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
4384
0
  errno = ENOENT;
4385
0
  ereport(DEBUG2,
4386
0
      (errcode_for_file_access(),
4387
0
       errmsg("could not open file \"%s\": %m", path)));
4388
0
  return -1;
4389
0
}
4390
4391
/*
4392
 * Set flag to signal the walreceiver to restart.  (The startup process calls
4393
 * this on noticing a relevant configuration change.)
4394
 */
4395
void
4396
StartupRequestWalReceiverRestart(void)
4397
0
{
4398
0
  if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
4399
0
  {
4400
0
    ereport(LOG,
4401
0
        (errmsg("WAL receiver process shutdown requested")));
4402
4403
0
    pendingWalRcvRestart = true;
4404
0
  }
4405
0
}
4406
4407
4408
/*
4409
 * Has a standby promotion already been triggered?
4410
 *
4411
 * Unlike CheckForStandbyTrigger(), this works in any process
4412
 * that's connected to shared memory.
4413
 */
4414
bool
4415
PromoteIsTriggered(void)
4416
0
{
4417
  /*
4418
   * We check shared state each time only until a standby promotion is
4419
   * triggered. We can't trigger a promotion again, so there's no need to
4420
   * keep checking after the shared variable has once been seen true.
4421
   */
4422
0
  if (LocalPromoteIsTriggered)
4423
0
    return true;
4424
4425
0
  SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4426
0
  LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered;
4427
0
  SpinLockRelease(&XLogRecoveryCtl->info_lck);
4428
4429
0
  return LocalPromoteIsTriggered;
4430
0
}
4431
4432
static void
4433
SetPromoteIsTriggered(void)
4434
0
{
4435
0
  SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4436
0
  XLogRecoveryCtl->SharedPromoteIsTriggered = true;
4437
0
  SpinLockRelease(&XLogRecoveryCtl->info_lck);
4438
4439
  /*
4440
   * Mark the recovery pause state as 'not paused' because the paused state
4441
   * ends and promotion continues if a promotion is triggered while recovery
4442
   * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
4443
   * return 'paused' while a promotion is ongoing.
4444
   */
4445
0
  SetRecoveryPause(false);
4446
4447
0
  LocalPromoteIsTriggered = true;
4448
0
}
4449
4450
/*
4451
 * Check whether a promote request has arrived.
4452
 */
4453
static bool
4454
CheckForStandbyTrigger(void)
4455
0
{
4456
0
  if (LocalPromoteIsTriggered)
4457
0
    return true;
4458
4459
0
  if (IsPromoteSignaled() && CheckPromoteSignal())
4460
0
  {
4461
0
    ereport(LOG, (errmsg("received promote request")));
4462
0
    RemovePromoteSignalFiles();
4463
0
    ResetPromoteSignaled();
4464
0
    SetPromoteIsTriggered();
4465
0
    return true;
4466
0
  }
4467
4468
0
  return false;
4469
0
}
4470
4471
/*
4472
 * Remove the files signaling a standby promotion request.
4473
 */
4474
void
4475
RemovePromoteSignalFiles(void)
4476
0
{
4477
0
  unlink(PROMOTE_SIGNAL_FILE);
4478
0
}
4479
4480
/*
4481
 * Check to see if a promote request has arrived.
4482
 */
4483
bool
4484
CheckPromoteSignal(void)
4485
0
{
4486
0
  struct stat stat_buf;
4487
4488
0
  if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
4489
0
    return true;
4490
4491
0
  return false;
4492
0
}
4493
4494
/*
4495
 * Wake up startup process to replay newly arrived WAL, or to notice that
4496
 * failover has been requested.
4497
 */
4498
void
4499
WakeupRecovery(void)
4500
0
{
4501
0
  SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
4502
0
}
4503
4504
/*
4505
 * Schedule a walreceiver wakeup in the main recovery loop.
4506
 */
4507
void
4508
XLogRequestWalReceiverReply(void)
4509
0
{
4510
0
  doRequestWalReceiverReply = true;
4511
0
}
4512
4513
/*
4514
 * Is HotStandby active yet? This is only important in special backends
4515
 * since normal backends won't ever be able to connect until this returns
4516
 * true. Postmaster knows this by way of signal, not via shared memory.
4517
 *
4518
 * Unlike testing standbyState, this works in any process that's connected to
4519
 * shared memory.  (And note that standbyState alone doesn't tell the truth
4520
 * anyway.)
4521
 */
4522
bool
4523
HotStandbyActive(void)
4524
0
{
4525
  /*
4526
   * We check shared state each time only until Hot Standby is active. We
4527
   * can't de-activate Hot Standby, so there's no need to keep checking
4528
   * after the shared variable has once been seen true.
4529
   */
4530
0
  if (LocalHotStandbyActive)
4531
0
    return true;
4532
0
  else
4533
0
  {
4534
    /* spinlock is essential on machines with weak memory ordering! */
4535
0
    SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4536
0
    LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive;
4537
0
    SpinLockRelease(&XLogRecoveryCtl->info_lck);
4538
4539
0
    return LocalHotStandbyActive;
4540
0
  }
4541
0
}
4542
4543
/*
4544
 * Like HotStandbyActive(), but to be used only in WAL replay code,
4545
 * where we don't need to ask any other process what the state is.
4546
 */
4547
static bool
4548
HotStandbyActiveInReplay(void)
4549
0
{
4550
0
  Assert(AmStartupProcess() || !IsPostmasterEnvironment);
4551
0
  return LocalHotStandbyActive;
4552
0
}
4553
4554
/*
4555
 * Get latest redo apply position.
4556
 *
4557
 * Exported to allow WALReceiver to read the pointer directly.
4558
 */
4559
XLogRecPtr
4560
GetXLogReplayRecPtr(TimeLineID *replayTLI)
4561
0
{
4562
0
  XLogRecPtr  recptr;
4563
0
  TimeLineID  tli;
4564
4565
0
  SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4566
0
  recptr = XLogRecoveryCtl->lastReplayedEndRecPtr;
4567
0
  tli = XLogRecoveryCtl->lastReplayedTLI;
4568
0
  SpinLockRelease(&XLogRecoveryCtl->info_lck);
4569
4570
0
  if (replayTLI)
4571
0
    *replayTLI = tli;
4572
0
  return recptr;
4573
0
}
4574
4575
4576
/*
4577
 * Get position of last applied, or the record being applied.
4578
 *
4579
 * This is different from GetXLogReplayRecPtr() in that if a WAL
4580
 * record is currently being applied, this includes that record.
4581
 */
4582
XLogRecPtr
4583
GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
4584
0
{
4585
0
  XLogRecPtr  recptr;
4586
0
  TimeLineID  tli;
4587
4588
0
  SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4589
0
  recptr = XLogRecoveryCtl->replayEndRecPtr;
4590
0
  tli = XLogRecoveryCtl->replayEndTLI;
4591
0
  SpinLockRelease(&XLogRecoveryCtl->info_lck);
4592
4593
0
  if (replayEndTLI)
4594
0
    *replayEndTLI = tli;
4595
0
  return recptr;
4596
0
}
4597
4598
/*
4599
 * Save timestamp of latest processed commit/abort record.
4600
 *
4601
 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4602
 * seen by processes other than the startup process.  Note in particular
4603
 * that CreateRestartPoint is executed in the checkpointer.
4604
 */
4605
static void
4606
SetLatestXTime(TimestampTz xtime)
4607
0
{
4608
0
  SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4609
0
  XLogRecoveryCtl->recoveryLastXTime = xtime;
4610
0
  SpinLockRelease(&XLogRecoveryCtl->info_lck);
4611
0
}
4612
4613
/*
4614
 * Fetch timestamp of latest processed commit/abort record.
4615
 */
4616
TimestampTz
4617
GetLatestXTime(void)
4618
0
{
4619
0
  TimestampTz xtime;
4620
4621
0
  SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4622
0
  xtime = XLogRecoveryCtl->recoveryLastXTime;
4623
0
  SpinLockRelease(&XLogRecoveryCtl->info_lck);
4624
4625
0
  return xtime;
4626
0
}
4627
4628
/*
4629
 * Save timestamp of the next chunk of WAL records to apply.
4630
 *
4631
 * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be
4632
 * seen by all backends.
4633
 */
4634
static void
4635
SetCurrentChunkStartTime(TimestampTz xtime)
4636
0
{
4637
0
  SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4638
0
  XLogRecoveryCtl->currentChunkStartTime = xtime;
4639
0
  SpinLockRelease(&XLogRecoveryCtl->info_lck);
4640
0
}
4641
4642
/*
4643
 * Fetch timestamp of latest processed commit/abort record.
4644
 * Startup process maintains an accurate local copy in XLogReceiptTime
4645
 */
4646
TimestampTz
4647
GetCurrentChunkReplayStartTime(void)
4648
0
{
4649
0
  TimestampTz xtime;
4650
4651
0
  SpinLockAcquire(&XLogRecoveryCtl->info_lck);
4652
0
  xtime = XLogRecoveryCtl->currentChunkStartTime;
4653
0
  SpinLockRelease(&XLogRecoveryCtl->info_lck);
4654
4655
0
  return xtime;
4656
0
}
4657
4658
/*
4659
 * Returns time of receipt of current chunk of XLOG data, as well as
4660
 * whether it was received from streaming replication or from archives.
4661
 */
4662
void
4663
GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
4664
0
{
4665
  /*
4666
   * This must be executed in the startup process, since we don't export the
4667
   * relevant state to shared memory.
4668
   */
4669
0
  Assert(InRecovery);
4670
4671
0
  *rtime = XLogReceiptTime;
4672
0
  *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
4673
0
}
4674
4675
/*
4676
 * Note that text field supplied is a parameter name and does not require
4677
 * translation
4678
 */
4679
void
4680
RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
4681
0
{
4682
0
  if (currValue < minValue)
4683
0
  {
4684
0
    if (HotStandbyActiveInReplay())
4685
0
    {
4686
0
      bool    warned_for_promote = false;
4687
4688
0
      ereport(WARNING,
4689
0
          (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4690
0
           errmsg("hot standby is not possible because of insufficient parameter settings"),
4691
0
           errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4692
0
                 param_name,
4693
0
                 currValue,
4694
0
                 minValue)));
4695
4696
0
      SetRecoveryPause(true);
4697
4698
0
      ereport(LOG,
4699
0
          (errmsg("recovery has paused"),
4700
0
           errdetail("If recovery is unpaused, the server will shut down."),
4701
0
           errhint("You can then restart the server after making the necessary configuration changes.")));
4702
4703
0
      while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
4704
0
      {
4705
0
        ProcessStartupProcInterrupts();
4706
4707
0
        if (CheckForStandbyTrigger())
4708
0
        {
4709
0
          if (!warned_for_promote)
4710
0
            ereport(WARNING,
4711
0
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4712
0
                 errmsg("promotion is not possible because of insufficient parameter settings"),
4713
4714
            /*
4715
             * Repeat the detail from above so it's easy to find
4716
             * in the log.
4717
             */
4718
0
                 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4719
0
                       param_name,
4720
0
                       currValue,
4721
0
                       minValue),
4722
0
                 errhint("Restart the server after making the necessary configuration changes.")));
4723
0
          warned_for_promote = true;
4724
0
        }
4725
4726
        /*
4727
         * If recovery pause is requested then set it paused.  While
4728
         * we are in the loop, user might resume and pause again so
4729
         * set this every time.
4730
         */
4731
0
        ConfirmRecoveryPaused();
4732
4733
        /*
4734
         * We wait on a condition variable that will wake us as soon
4735
         * as the pause ends, but we use a timeout so we can check the
4736
         * above conditions periodically too.
4737
         */
4738
0
        ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000,
4739
0
                      WAIT_EVENT_RECOVERY_PAUSE);
4740
0
      }
4741
0
      ConditionVariableCancelSleep();
4742
0
    }
4743
4744
0
    ereport(FATAL,
4745
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4746
0
         errmsg("recovery aborted because of insufficient parameter settings"),
4747
    /* Repeat the detail from above so it's easy to find in the log. */
4748
0
         errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
4749
0
               param_name,
4750
0
               currValue,
4751
0
               minValue),
4752
0
         errhint("You can restart the server after making the necessary configuration changes.")));
4753
0
  }
4754
0
}
4755
4756
4757
/*
4758
 * GUC check_hook for primary_slot_name
4759
 */
4760
bool
4761
check_primary_slot_name(char **newval, void **extra, GucSource source)
4762
2
{
4763
2
  if (*newval && strcmp(*newval, "") != 0 &&
4764
2
    !ReplicationSlotValidateName(*newval, false, WARNING))
4765
0
    return false;
4766
4767
2
  return true;
4768
2
}
4769
4770
/*
4771
 * Recovery target settings: Only one of the several recovery_target* settings
4772
 * may be set.  Setting a second one results in an error.  The global variable
4773
 * recoveryTarget tracks which kind of recovery target was chosen.  Other
4774
 * variables store the actual target value (for example a string or a xid).
4775
 * The assign functions of the parameters check whether a competing parameter
4776
 * was already set.  But we want to allow setting the same parameter multiple
4777
 * times.  We also want to allow unsetting a parameter and setting a different
4778
 * one, so we unset recoveryTarget when the parameter is set to an empty
4779
 * string.
4780
 *
4781
 * XXX this code is broken by design.  Throwing an error from a GUC assign
4782
 * hook breaks fundamental assumptions of guc.c.  So long as all the variables
4783
 * for which this can happen are PGC_POSTMASTER, the consequences are limited,
4784
 * since we'd just abort postmaster startup anyway.  Nonetheless it's likely
4785
 * that we have odd behaviors such as unexpected GUC ordering dependencies.
4786
 */
4787
4788
pg_noreturn static void
4789
error_multiple_recovery_targets(void)
4790
0
{
4791
0
  ereport(ERROR,
4792
0
      (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4793
0
       errmsg("multiple recovery targets specified"),
4794
0
       errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set.")));
4795
0
}
4796
4797
/*
4798
 * GUC check_hook for recovery_target
4799
 */
4800
bool
4801
check_recovery_target(char **newval, void **extra, GucSource source)
4802
2
{
4803
2
  if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0)
4804
0
  {
4805
0
    GUC_check_errdetail("The only allowed value is \"immediate\".");
4806
0
    return false;
4807
0
  }
4808
2
  return true;
4809
2
}
4810
4811
/*
4812
 * GUC assign_hook for recovery_target
4813
 */
4814
void
4815
assign_recovery_target(const char *newval, void *extra)
4816
2
{
4817
2
  if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4818
2
    recoveryTarget != RECOVERY_TARGET_IMMEDIATE)
4819
0
    error_multiple_recovery_targets();
4820
4821
2
  if (newval && strcmp(newval, "") != 0)
4822
0
    recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
4823
2
  else
4824
2
    recoveryTarget = RECOVERY_TARGET_UNSET;
4825
2
}
4826
4827
/*
4828
 * GUC check_hook for recovery_target_lsn
4829
 */
4830
bool
4831
check_recovery_target_lsn(char **newval, void **extra, GucSource source)
4832
2
{
4833
2
  if (strcmp(*newval, "") != 0)
4834
0
  {
4835
0
    XLogRecPtr  lsn;
4836
0
    XLogRecPtr *myextra;
4837
0
    bool    have_error = false;
4838
4839
0
    lsn = pg_lsn_in_internal(*newval, &have_error);
4840
0
    if (have_error)
4841
0
      return false;
4842
4843
0
    myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr));
4844
0
    if (!myextra)
4845
0
      return false;
4846
0
    *myextra = lsn;
4847
0
    *extra = myextra;
4848
0
  }
4849
2
  return true;
4850
2
}
4851
4852
/*
4853
 * GUC assign_hook for recovery_target_lsn
4854
 */
4855
void
4856
assign_recovery_target_lsn(const char *newval, void *extra)
4857
2
{
4858
2
  if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4859
2
    recoveryTarget != RECOVERY_TARGET_LSN)
4860
0
    error_multiple_recovery_targets();
4861
4862
2
  if (newval && strcmp(newval, "") != 0)
4863
0
  {
4864
0
    recoveryTarget = RECOVERY_TARGET_LSN;
4865
0
    recoveryTargetLSN = *((XLogRecPtr *) extra);
4866
0
  }
4867
2
  else
4868
2
    recoveryTarget = RECOVERY_TARGET_UNSET;
4869
2
}
4870
4871
/*
4872
 * GUC check_hook for recovery_target_name
4873
 */
4874
bool
4875
check_recovery_target_name(char **newval, void **extra, GucSource source)
4876
2
{
4877
  /* Use the value of newval directly */
4878
2
  if (strlen(*newval) >= MAXFNAMELEN)
4879
0
  {
4880
0
    GUC_check_errdetail("\"%s\" is too long (maximum %d characters).",
4881
0
              "recovery_target_name", MAXFNAMELEN - 1);
4882
0
    return false;
4883
0
  }
4884
2
  return true;
4885
2
}
4886
4887
/*
4888
 * GUC assign_hook for recovery_target_name
4889
 */
4890
void
4891
assign_recovery_target_name(const char *newval, void *extra)
4892
2
{
4893
2
  if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4894
2
    recoveryTarget != RECOVERY_TARGET_NAME)
4895
0
    error_multiple_recovery_targets();
4896
4897
2
  if (newval && strcmp(newval, "") != 0)
4898
0
  {
4899
0
    recoveryTarget = RECOVERY_TARGET_NAME;
4900
0
    recoveryTargetName = newval;
4901
0
  }
4902
2
  else
4903
2
    recoveryTarget = RECOVERY_TARGET_UNSET;
4904
2
}
4905
4906
/*
4907
 * GUC check_hook for recovery_target_time
4908
 *
4909
 * The interpretation of the recovery_target_time string can depend on the
4910
 * time zone setting, so we need to wait until after all GUC processing is
4911
 * done before we can do the final parsing of the string.  This check function
4912
 * only does a parsing pass to catch syntax errors, but we store the string
4913
 * and parse it again when we need to use it.
4914
 */
4915
bool
4916
check_recovery_target_time(char **newval, void **extra, GucSource source)
4917
2
{
4918
2
  if (strcmp(*newval, "") != 0)
4919
0
  {
4920
    /* reject some special values */
4921
0
    if (strcmp(*newval, "now") == 0 ||
4922
0
      strcmp(*newval, "today") == 0 ||
4923
0
      strcmp(*newval, "tomorrow") == 0 ||
4924
0
      strcmp(*newval, "yesterday") == 0)
4925
0
    {
4926
0
      return false;
4927
0
    }
4928
4929
    /*
4930
     * parse timestamp value (see also timestamptz_in())
4931
     */
4932
0
    {
4933
0
      char     *str = *newval;
4934
0
      fsec_t    fsec;
4935
0
      struct pg_tm tt,
4936
0
             *tm = &tt;
4937
0
      int     tz;
4938
0
      int     dtype;
4939
0
      int     nf;
4940
0
      int     dterr;
4941
0
      char     *field[MAXDATEFIELDS];
4942
0
      int     ftype[MAXDATEFIELDS];
4943
0
      char    workbuf[MAXDATELEN + MAXDATEFIELDS];
4944
0
      DateTimeErrorExtra dtextra;
4945
0
      TimestampTz timestamp;
4946
4947
0
      dterr = ParseDateTime(str, workbuf, sizeof(workbuf),
4948
0
                  field, ftype, MAXDATEFIELDS, &nf);
4949
0
      if (dterr == 0)
4950
0
        dterr = DecodeDateTime(field, ftype, nf,
4951
0
                     &dtype, tm, &fsec, &tz, &dtextra);
4952
0
      if (dterr != 0)
4953
0
        return false;
4954
0
      if (dtype != DTK_DATE)
4955
0
        return false;
4956
4957
0
      if (tm2timestamp(tm, fsec, &tz, &timestamp) != 0)
4958
0
      {
4959
0
        GUC_check_errdetail("Timestamp out of range: \"%s\".", str);
4960
0
        return false;
4961
0
      }
4962
0
    }
4963
0
  }
4964
2
  return true;
4965
2
}
4966
4967
/*
4968
 * GUC assign_hook for recovery_target_time
4969
 */
4970
void
4971
assign_recovery_target_time(const char *newval, void *extra)
4972
2
{
4973
2
  if (recoveryTarget != RECOVERY_TARGET_UNSET &&
4974
2
    recoveryTarget != RECOVERY_TARGET_TIME)
4975
0
    error_multiple_recovery_targets();
4976
4977
2
  if (newval && strcmp(newval, "") != 0)
4978
0
    recoveryTarget = RECOVERY_TARGET_TIME;
4979
2
  else
4980
2
    recoveryTarget = RECOVERY_TARGET_UNSET;
4981
2
}
4982
4983
/*
4984
 * GUC check_hook for recovery_target_timeline
4985
 */
4986
bool
4987
check_recovery_target_timeline(char **newval, void **extra, GucSource source)
4988
2
{
4989
2
  RecoveryTargetTimeLineGoal rttg;
4990
2
  RecoveryTargetTimeLineGoal *myextra;
4991
4992
2
  if (strcmp(*newval, "current") == 0)
4993
0
    rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE;
4994
2
  else if (strcmp(*newval, "latest") == 0)
4995
2
    rttg = RECOVERY_TARGET_TIMELINE_LATEST;
4996
0
  else
4997
0
  {
4998
0
    char     *endp;
4999
0
    uint64    timeline;
5000
5001
0
    rttg = RECOVERY_TARGET_TIMELINE_NUMERIC;
5002
5003
0
    errno = 0;
5004
0
    timeline = strtou64(*newval, &endp, 0);
5005
5006
0
    if (*endp != '\0' || errno == EINVAL || errno == ERANGE)
5007
0
    {
5008
0
      GUC_check_errdetail("\"%s\" is not a valid number.",
5009
0
                "recovery_target_timeline");
5010
0
      return false;
5011
0
    }
5012
5013
0
    if (timeline < 1 || timeline > PG_UINT32_MAX)
5014
0
    {
5015
0
      GUC_check_errdetail("\"%s\" must be between %u and %u.",
5016
0
                "recovery_target_timeline", 1, UINT_MAX);
5017
0
      return false;
5018
0
    }
5019
0
  }
5020
5021
2
  myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(LOG, sizeof(RecoveryTargetTimeLineGoal));
5022
2
  if (!myextra)
5023
0
    return false;
5024
2
  *myextra = rttg;
5025
2
  *extra = myextra;
5026
5027
2
  return true;
5028
2
}
5029
5030
/*
5031
 * GUC assign_hook for recovery_target_timeline
5032
 */
5033
void
5034
assign_recovery_target_timeline(const char *newval, void *extra)
5035
2
{
5036
2
  recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra);
5037
2
  if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5038
0
    recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0);
5039
2
  else
5040
2
    recoveryTargetTLIRequested = 0;
5041
2
}
5042
5043
/*
5044
 * GUC check_hook for recovery_target_xid
5045
 */
5046
bool
5047
check_recovery_target_xid(char **newval, void **extra, GucSource source)
5048
2
{
5049
2
  if (strcmp(*newval, "") != 0)
5050
0
  {
5051
0
    TransactionId xid;
5052
0
    TransactionId *myextra;
5053
5054
0
    errno = 0;
5055
0
    xid = (TransactionId) strtou64(*newval, NULL, 0);
5056
0
    if (errno == EINVAL || errno == ERANGE)
5057
0
      return false;
5058
5059
0
    myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId));
5060
0
    if (!myextra)
5061
0
      return false;
5062
0
    *myextra = xid;
5063
0
    *extra = myextra;
5064
0
  }
5065
2
  return true;
5066
2
}
5067
5068
/*
5069
 * GUC assign_hook for recovery_target_xid
5070
 */
5071
void
5072
assign_recovery_target_xid(const char *newval, void *extra)
5073
2
{
5074
2
  if (recoveryTarget != RECOVERY_TARGET_UNSET &&
5075
2
    recoveryTarget != RECOVERY_TARGET_XID)
5076
0
    error_multiple_recovery_targets();
5077
5078
2
  if (newval && strcmp(newval, "") != 0)
5079
0
  {
5080
0
    recoveryTarget = RECOVERY_TARGET_XID;
5081
0
    recoveryTargetXid = *((TransactionId *) extra);
5082
0
  }
5083
2
  else
5084
2
    recoveryTarget = RECOVERY_TARGET_UNSET;
5085
2
}