/src/postgres/src/backend/access/transam/xlogrecovery.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * xlogrecovery.c |
4 | | * Functions for WAL recovery, standby mode |
5 | | * |
6 | | * This source file contains functions controlling WAL recovery. |
7 | | * InitWalRecovery() initializes the system for crash or archive recovery, |
8 | | * or standby mode, depending on configuration options and the state of |
9 | | * the control file and possible backup label file. PerformWalRecovery() |
10 | | * performs the actual WAL replay, calling the rmgr-specific redo routines. |
11 | | * FinishWalRecovery() performs end-of-recovery checks and cleanup actions, |
12 | | * and prepares information needed to initialize the WAL for writes. In |
13 | | * addition to these three main functions, there are a bunch of functions |
14 | | * for interrogating recovery state and controlling the recovery process. |
15 | | * |
16 | | * |
17 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
18 | | * Portions Copyright (c) 1994, Regents of the University of California |
19 | | * |
20 | | * src/backend/access/transam/xlogrecovery.c |
21 | | * |
22 | | *------------------------------------------------------------------------- |
23 | | */ |
24 | | |
25 | | #include "postgres.h" |
26 | | |
27 | | #include <ctype.h> |
28 | | #include <math.h> |
29 | | #include <time.h> |
30 | | #include <sys/stat.h> |
31 | | #include <sys/time.h> |
32 | | #include <unistd.h> |
33 | | |
34 | | #include "access/timeline.h" |
35 | | #include "access/transam.h" |
36 | | #include "access/xact.h" |
37 | | #include "access/xlog_internal.h" |
38 | | #include "access/xlogarchive.h" |
39 | | #include "access/xlogprefetcher.h" |
40 | | #include "access/xlogreader.h" |
41 | | #include "access/xlogrecovery.h" |
42 | | #include "access/xlogutils.h" |
43 | | #include "backup/basebackup.h" |
44 | | #include "catalog/pg_control.h" |
45 | | #include "commands/tablespace.h" |
46 | | #include "common/file_utils.h" |
47 | | #include "miscadmin.h" |
48 | | #include "pgstat.h" |
49 | | #include "postmaster/bgwriter.h" |
50 | | #include "postmaster/startup.h" |
51 | | #include "replication/slot.h" |
52 | | #include "replication/slotsync.h" |
53 | | #include "replication/walreceiver.h" |
54 | | #include "storage/fd.h" |
55 | | #include "storage/ipc.h" |
56 | | #include "storage/latch.h" |
57 | | #include "storage/pmsignal.h" |
58 | | #include "storage/procarray.h" |
59 | | #include "storage/spin.h" |
60 | | #include "utils/datetime.h" |
61 | | #include "utils/fmgrprotos.h" |
62 | | #include "utils/guc_hooks.h" |
63 | | #include "utils/pgstat_internal.h" |
64 | | #include "utils/pg_lsn.h" |
65 | | #include "utils/ps_status.h" |
66 | | #include "utils/pg_rusage.h" |
67 | | |
68 | | /* Unsupported old recovery command file names (relative to $PGDATA) */ |
69 | 0 | #define RECOVERY_COMMAND_FILE "recovery.conf" |
70 | 0 | #define RECOVERY_COMMAND_DONE "recovery.done" |
71 | | |
72 | | /* |
73 | | * GUC support: table pairing each accepted recovery target action name ("pause", "promote", "shutdown") with its RECOVERY_TARGET_ACTION_* value. The trailing {NULL, 0, false} entry is presumably the end-of-list marker -- confirm against the code that walks config_enum_entry arrays. |
74 | | */ |
75 | | const struct config_enum_entry recovery_target_action_options[] = { |
76 | | {"pause", RECOVERY_TARGET_ACTION_PAUSE, false}, |
77 | | {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false}, |
78 | | {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false}, |
79 | | {NULL, 0, false} |
80 | | }; |
81 | | |
82 | | /* options formerly taken from recovery.conf for archive recovery */ |
83 | | char *recoveryRestoreCommand = NULL; |
84 | | char *recoveryEndCommand = NULL; |
85 | | char *archiveCleanupCommand = NULL; |
86 | | RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET; |
87 | | bool recoveryTargetInclusive = true; |
88 | | int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE; |
89 | | TransactionId recoveryTargetXid; |
90 | | char *recovery_target_time_string; |
91 | | TimestampTz recoveryTargetTime; |
92 | | const char *recoveryTargetName; |
93 | | XLogRecPtr recoveryTargetLSN; |
94 | | int recovery_min_apply_delay = 0; |
95 | | |
96 | | /* options formerly taken from recovery.conf for XLOG streaming */ |
97 | | char *PrimaryConnInfo = NULL; |
98 | | char *PrimarySlotName = NULL; |
99 | | bool wal_receiver_create_temp_slot = false; |
100 | | |
101 | | /* |
102 | | * recoveryTargetTimeLineGoal: what the user requested, if any |
103 | | * |
104 | | * recoveryTargetTLIRequested: numeric value of requested timeline, if constant |
105 | | * |
106 | | * recoveryTargetTLI: the currently understood target timeline; may change while recovery runs |
107 | | * |
108 | | * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and |
109 | | * the timelines of its known parents, newest first (so recoveryTargetTLI is |
110 | | * always the first list member). Only these TLIs are expected to be seen in |
111 | | * the WAL segments we read, and indeed only these TLIs will be considered as |
112 | | * candidate WAL files to open at all. |
113 | | * |
114 | | * curFileTLI: the TLI appearing in the name of the current input WAL file. |
115 | | * (This is not necessarily the same as the timeline from which we are |
116 | | * replaying WAL, which StartupXLOG calls replayTLI, because we could be |
117 | | * scanning data that was copied from an ancestor timeline when the current |
118 | | * file was created.) During a sequential scan we do not allow this value |
119 | | * to decrease. |
120 | | */ |
121 | | RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST; |
122 | | TimeLineID recoveryTargetTLIRequested = 0; |
123 | | TimeLineID recoveryTargetTLI = 0; |
124 | | static List *expectedTLEs; |
125 | | static TimeLineID curFileTLI; |
126 | | |
127 | | /* |
128 | | * When ArchiveRecoveryRequested is set, archive recovery was requested, |
129 | | * ie. signal files were present. When InArchiveRecovery is set, we are |
130 | | * currently recovering using offline XLOG archives. These variables are only |
131 | | * valid in the startup process. |
132 | | * |
133 | | * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're |
134 | | * currently performing crash recovery using only XLOG files in pg_wal, but |
135 | | * will switch to using offline XLOG archives as soon as we reach the end of |
136 | | * WAL in pg_wal. |
137 | | */ |
138 | | bool ArchiveRecoveryRequested = false; |
139 | | bool InArchiveRecovery = false; |
140 | | |
141 | | /* |
142 | | * When StandbyModeRequested is set, standby mode was requested, i.e. |
143 | | * standby.signal file was present. When StandbyMode is set, we are currently |
144 | | * in standby mode. These variables are only valid in the startup process. |
145 | | * They work similarly to ArchiveRecoveryRequested and InArchiveRecovery. |
146 | | */ |
147 | | static bool StandbyModeRequested = false; |
148 | | bool StandbyMode = false; |
149 | | |
150 | | /* was a signal file present at startup? */ |
151 | | static bool standby_signal_file_found = false; |
152 | | static bool recovery_signal_file_found = false; |
153 | | |
154 | | /* |
155 | | * CheckPointLoc is the position of the checkpoint record that determines |
156 | | * where to start the replay. It comes from the backup label file or the |
157 | | * control file. |
158 | | * |
159 | | * RedoStartLSN is the checkpoint's REDO location, also from the backup label |
160 | | * file or the control file. In standby mode, XLOG streaming usually starts |
161 | | * from the position where an invalid record was found. But if we fail to |
162 | | * read even the initial checkpoint record, we use the REDO location instead |
163 | | * of the checkpoint location as the start position of XLOG streaming. |
164 | | * Otherwise we would have to jump backwards to the REDO location after |
165 | | * reading the checkpoint record, because the REDO record can precede the |
166 | | * checkpoint record. |
167 | | */ |
168 | | static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr; |
169 | | static TimeLineID CheckPointTLI = 0; |
170 | | static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr; |
171 | | static TimeLineID RedoStartTLI = 0; |
172 | | |
173 | | /* |
174 | | * Local copy of SharedHotStandbyActive variable. False actually means "not |
175 | | * known, need to check the shared state". |
176 | | */ |
177 | | static bool LocalHotStandbyActive = false; |
178 | | |
179 | | /* |
180 | | * Local copy of SharedPromoteIsTriggered variable. False actually means "not |
181 | | * known, need to check the shared state". |
182 | | */ |
183 | | static bool LocalPromoteIsTriggered = false; |
184 | | |
185 | | /* Has the recovery code requested a walreceiver wakeup? */ |
186 | | static bool doRequestWalReceiverReply; |
187 | | |
188 | | /* XLogReader object used to parse the WAL records */ |
189 | | static XLogReaderState *xlogreader = NULL; |
190 | | |
191 | | /* XLogPrefetcher object used to consume WAL records with read-ahead */ |
192 | | static XLogPrefetcher *xlogprefetcher = NULL; |
193 | | |
194 | | /* Parameters passed down from ReadRecord to the XLogPageRead callback. InitWalRecovery() allocates one zeroed instance and hands it to XLogReaderAllocate() as the reader's private data. */ |
195 | | typedef struct XLogPageReadPrivate |
196 | | { |
197 | | int emode; /* mirrors ReadRecord's emode argument (e.g. LOG) */ |
198 | | bool fetching_ckpt; /* are we fetching a checkpoint record? */ |
199 | | bool randAccess; /* NOTE(review): presumably set for non-sequential reads -- confirm in ReadRecord */ |
200 | | TimeLineID replayTLI; /* mirrors ReadRecord's replayTLI argument */ |
201 | | } XLogPageReadPrivate; |
202 | | |
203 | | /* flag to tell XLogPageRead that we have started replaying */ |
204 | | static bool InRedo = false; |
205 | | |
206 | | /* |
207 | | * Codes indicating where we got a WAL file from during recovery, or where |
208 | | * to attempt to get one. XLOG_FROM_ANY (0) is also the initial value of readSource, currentSource and XLogReceiptSource. The enumerators are listed in the same order as the strings in xlogSourceNames[]; keep the two in step when adding a source. |
209 | | */ |
210 | | typedef enum |
211 | | { |
212 | | XLOG_FROM_ANY = 0, /* request to read WAL from any source */ |
213 | | XLOG_FROM_ARCHIVE, /* restored using restore_command */ |
214 | | XLOG_FROM_PG_WAL, /* existing file in pg_wal */ |
215 | | XLOG_FROM_STREAM, /* streamed from primary */ |
216 | | } XLogSource; |
217 | | |
218 | | /* human-readable names for XLogSources, for debugging output; the strings are listed in the same order as the XLogSource enumerators (XLOG_FROM_ANY first) */ |
219 | | static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"}; |
220 | | |
221 | | /* |
222 | | * readFile is -1 or a kernel FD for the log file segment that's currently |
223 | | * open for reading. readSegNo identifies the segment. readOff is the offset |
224 | | * of the page just read, readLen indicates how much of it has been read into |
225 | | * readBuf, and readSource indicates where we got the currently open file from. |
226 | | * |
227 | | * Note: we could use Reserve/ReleaseExternalFD to track consumption of this |
228 | | * FD too (like for openLogFile in xlog.c); but it doesn't currently seem |
229 | | * worthwhile, since the XLOG is not read by general-purpose sessions. |
230 | | */ |
231 | | static int readFile = -1; |
232 | | static XLogSegNo readSegNo = 0; |
233 | | static uint32 readOff = 0; |
234 | | static uint32 readLen = 0; |
235 | | static XLogSource readSource = XLOG_FROM_ANY; |
236 | | |
237 | | /* |
238 | | * Keeps track of which source we're currently reading from. This is |
239 | | * different from readSource in that this is always set, even when we don't |
240 | | * currently have a WAL file open. If lastSourceFailed is set, our last |
241 | | * attempt to read from currentSource failed, and we should try another source |
242 | | * next. |
243 | | * |
244 | | * pendingWalRcvRestart is set when a config change occurs that requires a |
245 | | * walreceiver restart. This is only valid in XLOG_FROM_STREAM state. |
246 | | */ |
247 | | static XLogSource currentSource = XLOG_FROM_ANY; |
248 | | static bool lastSourceFailed = false; |
249 | | static bool pendingWalRcvRestart = false; |
250 | | |
251 | | /* |
252 | | * These variables track when we last obtained some WAL data to process, |
253 | | * and where we got it from. (XLogReceiptSource is initially the same as |
254 | | * readSource, but readSource gets reset to zero when we don't have data |
255 | | * to process right now. It is also different from currentSource, which |
256 | | * also changes when we try to read from a source and fail, while |
257 | | * XLogReceiptSource tracks where we last successfully read some WAL.) |
258 | | */ |
259 | | static TimestampTz XLogReceiptTime = 0; |
260 | | static XLogSource XLogReceiptSource = XLOG_FROM_ANY; |
261 | | |
262 | | /* Local copy of WalRcv->flushedUpto */ |
263 | | static XLogRecPtr flushedUpto = 0; |
264 | | static TimeLineID receiveTLI = 0; |
265 | | |
266 | | /* |
267 | | * Copy of minRecoveryPoint and backupEndPoint from the control file. |
268 | | * |
269 | | * In order to reach consistency, we must replay the WAL up to |
270 | | * minRecoveryPoint. If backupEndRequired is true, we must also reach |
271 | | * backupEndPoint, or if it's invalid, an end-of-backup record corresponding |
272 | | * to backupStartPoint. |
273 | | * |
274 | | * Note: In archive recovery, after consistency has been reached, the |
275 | | * functions in xlog.c will start updating minRecoveryPoint in the control |
276 | | * file. But this copy of minRecoveryPoint variable reflects the value at the |
277 | | * beginning of recovery, and is *not* updated after consistency is reached. |
278 | | */ |
279 | | static XLogRecPtr minRecoveryPoint; |
280 | | static TimeLineID minRecoveryPointTLI; |
281 | | |
282 | | static XLogRecPtr backupStartPoint; |
283 | | static XLogRecPtr backupEndPoint; |
284 | | static bool backupEndRequired = false; |
285 | | |
286 | | /* |
287 | | * Have we reached a consistent database state? In crash recovery, we have |
288 | | * to replay all the WAL, so reachedConsistency is never set. During archive |
289 | | * recovery, the database is consistent once minRecoveryPoint is reached. |
290 | | * |
291 | | * Consistent state means that the system is internally consistent, all |
292 | | * the WAL has been replayed up to a certain point, and importantly, there |
293 | | * is no trace of later actions on disk. |
294 | | * |
295 | | * This flag is used only by the startup process and postmaster. When |
296 | | * minRecoveryPoint is reached, the startup process sets it to true and |
297 | | * sends a PMSIGNAL_RECOVERY_CONSISTENT signal to the postmaster, |
298 | | * which then sets it to true upon receiving the signal. |
299 | | */ |
300 | | bool reachedConsistency = false; |
301 | | |
302 | | /* Buffers dedicated to consistency checks of size BLCKSZ */ |
303 | | static char *replay_image_masked = NULL; |
304 | | static char *primary_image_masked = NULL; |
305 | | |
306 | | |
307 | | /* |
308 | | * Shared-memory state for WAL recovery. The single instance is created and initialized by XLogRecoveryShmemInit() and is reached through the XLogRecoveryCtl pointer. |
309 | | */ |
310 | | typedef struct XLogRecoveryCtlData |
311 | | { |
312 | | /* |
313 | | * SharedHotStandbyActive indicates if we allow hot standby queries to be |
314 | | * run. Protected by info_lck. |
315 | | */ |
316 | | bool SharedHotStandbyActive; |
317 | | |
318 | | /* |
319 | | * SharedPromoteIsTriggered indicates if a standby promotion has been |
320 | | * triggered. Protected by info_lck. |
321 | | */ |
322 | | bool SharedPromoteIsTriggered; |
323 | | |
324 | | /* |
325 | | * recoveryWakeupLatch is used to wake up the startup process to continue |
326 | | * WAL replay, if it is waiting for WAL to arrive or promotion to be |
327 | | * requested. |
328 | | * |
329 | | * Note that the startup process also uses another latch, its procLatch, |
330 | | * to wait for recovery conflict. We could get rid of recoveryWakeupLatch |
331 | | * and signal the startup process through its procLatch instead, which |
332 | | * would comport better with possible generic signal handlers using that |
333 | | * latch. But we should not do that, because the startup process doesn't |
334 | | * expect to be woken up by the walreceiver process or the SIGHUP signal |
335 | | * handler while it's waiting for recovery conflict. The separate latches, |
336 | | * recoveryWakeupLatch and procLatch, should be used for inter-process |
337 | | * communication for WAL replay and recovery conflict, respectively. |
338 | | */ |
339 | | Latch recoveryWakeupLatch; |
340 | | |
341 | | /* |
342 | | * Last record successfully replayed. |
343 | | */ |
344 | | XLogRecPtr lastReplayedReadRecPtr; /* start position */ |
345 | | XLogRecPtr lastReplayedEndRecPtr; /* end+1 position */ |
346 | | TimeLineID lastReplayedTLI; /* timeline */ |
347 | | |
348 | | /* |
349 | | * When we're currently replaying a record, ie. in a redo function, |
350 | | * replayEndRecPtr points to the end+1 of the record being replayed, |
351 | | * otherwise it's equal to lastReplayedEndRecPtr. |
352 | | */ |
353 | | XLogRecPtr replayEndRecPtr; |
354 | | TimeLineID replayEndTLI; |
355 | | /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */ |
356 | | TimestampTz recoveryLastXTime; |
357 | | |
358 | | /* |
359 | | * timestamp of when we started replaying the current chunk of WAL data, |
360 | | * only relevant for replication or archive recovery |
361 | | */ |
362 | | TimestampTz currentChunkStartTime; |
363 | | /* Recovery pause state, and the condition variable paired with it (recoveryNotPausedCV) */ |
364 | | RecoveryPauseState recoveryPauseState; |
365 | | ConditionVariable recoveryNotPausedCV; |
366 | | |
367 | | slock_t info_lck; /* locks shared variables shown above */ |
368 | | } XLogRecoveryCtlData; |
369 | | |
370 | | static XLogRecoveryCtlData *XLogRecoveryCtl = NULL; |
371 | | |
372 | | /* |
373 | | * abortedRecPtr is the start pointer of a broken record at end of WAL when |
374 | | * recovery completes; missingContrecPtr is the location of the first |
375 | | * contrecord that went missing. See CreateOverwriteContrecordRecord for |
376 | | * details. |
377 | | */ |
378 | | static XLogRecPtr abortedRecPtr; |
379 | | static XLogRecPtr missingContrecPtr; |
380 | | |
381 | | /* |
382 | | * If recoveryStopsBefore() or recoveryStopsAfter() returns true, it saves |
383 | | * information about the stop point here. |
384 | | */ |
385 | | static TransactionId recoveryStopXid; |
386 | | static TimestampTz recoveryStopTime; |
387 | | static XLogRecPtr recoveryStopLSN; |
388 | | static char recoveryStopName[MAXFNAMELEN]; |
389 | | static bool recoveryStopAfter; |
390 | | |
391 | | /* prototypes for local functions */ |
392 | | static void ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI); |
393 | | |
394 | | static void EnableStandbyMode(void); |
395 | | static void readRecoverySignalFile(void); |
396 | | static void validateRecoveryParameters(void); |
397 | | static bool read_backup_label(XLogRecPtr *checkPointLoc, |
398 | | TimeLineID *backupLabelTLI, |
399 | | bool *backupEndRequired, bool *backupFromStandby); |
400 | | static bool read_tablespace_map(List **tablespaces); |
401 | | |
402 | | static void xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI); |
403 | | static void CheckRecoveryConsistency(void); |
404 | | static void rm_redo_error_callback(void *arg); |
405 | | #ifdef WAL_DEBUG |
406 | | static void xlog_outrec(StringInfo buf, XLogReaderState *record); |
407 | | #endif |
408 | | static void xlog_block_info(StringInfo buf, XLogReaderState *record); |
409 | | static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, |
410 | | TimeLineID prevTLI, TimeLineID replayTLI); |
411 | | static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime); |
412 | | static void verifyBackupPageConsistency(XLogReaderState *record); |
413 | | |
414 | | static bool recoveryStopsBefore(XLogReaderState *record); |
415 | | static bool recoveryStopsAfter(XLogReaderState *record); |
416 | | static char *getRecoveryStopReason(void); |
417 | | static void recoveryPausesHere(bool endOfRecovery); |
418 | | static bool recoveryApplyDelay(XLogReaderState *record); |
419 | | static void ConfirmRecoveryPaused(void); |
420 | | |
421 | | static XLogRecord *ReadRecord(XLogPrefetcher *xlogprefetcher, |
422 | | int emode, bool fetching_ckpt, |
423 | | TimeLineID replayTLI); |
424 | | |
425 | | static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, |
426 | | int reqLen, XLogRecPtr targetRecPtr, char *readBuf); |
427 | | static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, |
428 | | bool randAccess, |
429 | | bool fetching_ckpt, |
430 | | XLogRecPtr tliRecPtr, |
431 | | TimeLineID replayTLI, |
432 | | XLogRecPtr replayLSN, |
433 | | bool nonblocking); |
434 | | static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); |
435 | | static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, |
436 | | XLogRecPtr RecPtr, TimeLineID replayTLI); |
437 | | static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN); |
438 | | static int XLogFileRead(XLogSegNo segno, TimeLineID tli, |
439 | | XLogSource source, bool notfoundOk); |
440 | | static int XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source); |
441 | | |
442 | | static bool CheckForStandbyTrigger(void); |
443 | | static void SetPromoteIsTriggered(void); |
444 | | static bool HotStandbyActiveInReplay(void); |
445 | | |
446 | | static void SetCurrentChunkStartTime(TimestampTz xtime); |
447 | | static void SetLatestXTime(TimestampTz xtime); |
448 | | |
449 | | /* |
450 | | * Report how much shared memory WAL recovery needs: the size of one XLogRecoveryCtlData struct. XLogRecoveryShmemInit() passes this value to ShmemInitStruct(). |
451 | | */ |
452 | | Size |
453 | | XLogRecoveryShmemSize(void) |
454 | 0 | { |
455 | 0 | Size size; |
456 | | |
457 | | /* Space for the XLogRecoveryCtlData struct that XLogRecoveryCtl points to */ |
458 | 0 | size = sizeof(XLogRecoveryCtlData); |
459 | |
|
460 | 0 | return size; |
461 | 0 | } |
462 | | /* Point XLogRecoveryCtl at the "XLOG Recovery Ctl" shared memory struct; unless ShmemInitStruct() reports it as found, zero it and initialize its spinlock, wakeup latch and pause condition variable. */ |
463 | | void |
464 | | XLogRecoveryShmemInit(void) |
465 | 0 | { |
466 | 0 | bool found; |
467 | |
|
468 | 0 | XLogRecoveryCtl = (XLogRecoveryCtlData *) |
469 | 0 | ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found); |
470 | 0 | if (found) /* presumably the struct already existed: leave its contents untouched -- confirm ShmemInitStruct semantics */ |
471 | 0 | return; |
472 | 0 | memset(XLogRecoveryCtl, 0, sizeof(XLogRecoveryCtlData)); /* start from all-zero fields */ |
473 | |
|
474 | 0 | SpinLockInit(&XLogRecoveryCtl->info_lck); |
475 | 0 | InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch); |
476 | 0 | ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV); |
477 | 0 | } |
478 | | |
479 | | /* |
480 | | * Enter standby mode: set StandbyMode to true and disable the startup |
481 | | * progress timeout. Takes no arguments and returns nothing. InitWalRecovery() calls this after reading a backup_label when StandbyModeRequested is set. |
482 | | */ |
483 | | static void |
484 | | EnableStandbyMode(void) |
485 | 0 | { |
486 | 0 | StandbyMode = true; |
487 | | |
488 | | /* |
489 | | * To avoid server log bloat, we don't report recovery progress in a |
490 | | * standby, as it will always be in recovery unless promoted. We disable |
491 | | * the startup progress timeout in standby mode to avoid calling |
492 | | * startup_progress_timeout_handler() unnecessarily. |
493 | | */ |
494 | 0 | disable_startup_progress_timeout(); |
495 | 0 | } |
496 | | |
497 | | /* |
498 | | * Prepare the system for WAL recovery, if needed. |
499 | | * |
500 | | * This is called by StartupXLOG() which coordinates the server startup |
501 | | * sequence. This function analyzes the control file and the backup label |
502 | | * file, if any, and figures out whether we need to perform crash recovery or |
503 | | * archive recovery, and how far we need to replay the WAL to reach a |
504 | | * consistent state. |
505 | | * |
506 | | * This doesn't yet change the on-disk state, except for creating the symlinks |
507 | | * from table space map file if any, and for fetching WAL files needed to find |
508 | | * the checkpoint record. On entry, the caller has already read the control |
509 | | * file into memory, and passes it as argument. This function updates it to |
510 | | * reflect the recovery state, and the caller is expected to write it back to |
511 | | * disk after initializing other subsystems, but before calling |
512 | | * PerformWalRecovery(). |
513 | | * |
514 | | * This initializes some global variables, such as ArchiveRecoveryRequested, |
515 | | * StandbyModeRequested and InRecovery. |
516 | | */ |
517 | | void |
518 | | InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, |
519 | | bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr) |
520 | 0 | { |
521 | 0 | XLogPageReadPrivate *private; |
522 | 0 | struct stat st; |
523 | 0 | bool wasShutdown; |
524 | 0 | XLogRecord *record; |
525 | 0 | DBState dbstate_at_startup; |
526 | 0 | bool haveTblspcMap = false; |
527 | 0 | bool haveBackupLabel = false; |
528 | 0 | CheckPoint checkPoint; |
529 | 0 | bool backupFromStandby = false; |
530 | |
|
531 | 0 | dbstate_at_startup = ControlFile->state; |
532 | | |
533 | | /* |
534 | | * Initialize on the assumption we want to recover to the latest timeline |
535 | | * that's active according to pg_control. |
536 | | */ |
537 | 0 | if (ControlFile->minRecoveryPointTLI > |
538 | 0 | ControlFile->checkPointCopy.ThisTimeLineID) |
539 | 0 | recoveryTargetTLI = ControlFile->minRecoveryPointTLI; |
540 | 0 | else |
541 | 0 | recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID; |
542 | | |
543 | | /* |
544 | | * Check for signal files, and if so set up state for offline recovery |
545 | | */ |
546 | 0 | readRecoverySignalFile(); |
547 | 0 | validateRecoveryParameters(); |
548 | | |
549 | | /* |
550 | | * Take ownership of the wakeup latch if archive recovery was requested, |
551 | | * since we may then need to sleep during recovery. |
552 | | */ |
553 | 0 | if (ArchiveRecoveryRequested) |
554 | 0 | OwnLatch(&XLogRecoveryCtl->recoveryWakeupLatch); |
555 | | |
556 | | /* |
557 | | * Set the WAL reading processor now, as it will be needed when reading |
558 | | * the checkpoint record required (backup_label or not). |
559 | | */ |
560 | 0 | private = palloc0(sizeof(XLogPageReadPrivate)); |
561 | 0 | xlogreader = |
562 | 0 | XLogReaderAllocate(wal_segment_size, NULL, |
563 | 0 | XL_ROUTINE(.page_read = &XLogPageRead, |
564 | 0 | .segment_open = NULL, |
565 | 0 | .segment_close = wal_segment_close), |
566 | 0 | private); |
567 | 0 | if (!xlogreader) |
568 | 0 | ereport(ERROR, |
569 | 0 | (errcode(ERRCODE_OUT_OF_MEMORY), |
570 | 0 | errmsg("out of memory"), |
571 | 0 | errdetail("Failed while allocating a WAL reading processor."))); |
572 | 0 | xlogreader->system_identifier = ControlFile->system_identifier; |
573 | | |
574 | | /* |
575 | | * Set the WAL decode buffer size. This limits how far ahead we can read |
576 | | * in the WAL. |
577 | | */ |
578 | 0 | XLogReaderSetDecodeBuffer(xlogreader, NULL, wal_decode_buffer_size); |
579 | | |
580 | | /* Create a WAL prefetcher. */ |
581 | 0 | xlogprefetcher = XLogPrefetcherAllocate(xlogreader); |
582 | | |
583 | | /* |
584 | | * Allocate two page buffers dedicated to WAL consistency checks. We do |
585 | | * it this way, rather than just making static arrays, for two reasons: |
586 | | * (1) no need to waste the storage in most instantiations of the backend; |
587 | | * (2) a static char array isn't guaranteed to have any particular |
588 | | * alignment, whereas palloc() will provide MAXALIGN'd storage. |
589 | | */ |
590 | 0 | replay_image_masked = (char *) palloc(BLCKSZ); |
591 | 0 | primary_image_masked = (char *) palloc(BLCKSZ); |
592 | | |
593 | | /* |
594 | | * Read the backup_label file. We want to run this part of the recovery |
595 | | * process after checking for signal files and after performing validation |
596 | | * of the recovery parameters. |
597 | | */ |
598 | 0 | if (read_backup_label(&CheckPointLoc, &CheckPointTLI, &backupEndRequired, |
599 | 0 | &backupFromStandby)) |
600 | 0 | { |
601 | 0 | List *tablespaces = NIL; |
602 | | |
603 | | /* |
604 | | * Archive recovery was requested, and thanks to the backup label |
605 | | * file, we know how far we need to replay to reach consistency. Enter |
606 | | * archive recovery directly. |
607 | | */ |
608 | 0 | InArchiveRecovery = true; |
609 | 0 | if (StandbyModeRequested) |
610 | 0 | EnableStandbyMode(); |
611 | | |
612 | | /* |
613 | | * Omitting backup_label when creating a new replica, PITR node etc. |
614 | | * unfortunately is a common cause of corruption. Logging that |
615 | | * backup_label was used makes it a bit easier to exclude that as the |
616 | | * cause of observed corruption. |
617 | | * |
618 | | * Do so before we try to read the checkpoint record (which can fail), |
619 | | * as otherwise it can be hard to understand why a checkpoint other |
620 | | * than ControlFile->checkPoint is used. |
621 | | */ |
622 | 0 | ereport(LOG, |
623 | 0 | errmsg("starting backup recovery with redo LSN %X/%08X, checkpoint LSN %X/%08X, on timeline ID %u", |
624 | 0 | LSN_FORMAT_ARGS(RedoStartLSN), |
625 | 0 | LSN_FORMAT_ARGS(CheckPointLoc), |
626 | 0 | CheckPointTLI)); |
627 | | |
628 | | /* |
629 | | * When a backup_label file is present, we want to roll forward from |
630 | | * the checkpoint it identifies, rather than using pg_control. |
631 | | */ |
632 | 0 | record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, |
633 | 0 | CheckPointTLI); |
634 | 0 | if (record != NULL) |
635 | 0 | { |
636 | 0 | memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); |
637 | 0 | wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); |
638 | 0 | ereport(DEBUG1, |
639 | 0 | errmsg_internal("checkpoint record is at %X/%08X", |
640 | 0 | LSN_FORMAT_ARGS(CheckPointLoc))); |
641 | 0 | InRecovery = true; /* force recovery even if SHUTDOWNED */ |
642 | | |
643 | | /* |
644 | | * Make sure that REDO location exists. This may not be the case |
645 | | * if there was a crash during an online backup, which left a |
646 | | * backup_label around that references a WAL segment that's |
647 | | * already been archived. |
648 | | */ |
649 | 0 | if (checkPoint.redo < CheckPointLoc) |
650 | 0 | { |
651 | 0 | XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo); |
652 | 0 | if (!ReadRecord(xlogprefetcher, LOG, false, |
653 | 0 | checkPoint.ThisTimeLineID)) |
654 | 0 | ereport(FATAL, |
655 | 0 | errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X", |
656 | 0 | LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)), |
657 | 0 | errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" |
658 | 0 | "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" |
659 | 0 | "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", |
660 | 0 | DataDir, DataDir, DataDir, DataDir)); |
661 | 0 | } |
662 | 0 | } |
663 | 0 | else |
664 | 0 | { |
665 | 0 | ereport(FATAL, |
666 | 0 | errmsg("could not locate required checkpoint record at %X/%08X", |
667 | 0 | LSN_FORMAT_ARGS(CheckPointLoc)), |
668 | 0 | errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" or \"%s/standby.signal\" and add required recovery options.\n" |
669 | 0 | "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" |
670 | 0 | "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", |
671 | 0 | DataDir, DataDir, DataDir, DataDir)); |
672 | 0 | wasShutdown = false; /* keep compiler quiet */ |
673 | 0 | } |
674 | | |
675 | | /* Read the tablespace_map file if present and create symlinks. */ |
676 | 0 | if (read_tablespace_map(&tablespaces)) |
677 | 0 | { |
678 | 0 | ListCell *lc; |
679 | |
|
680 | 0 | foreach(lc, tablespaces) |
681 | 0 | { |
682 | 0 | tablespaceinfo *ti = lfirst(lc); |
683 | 0 | char *linkloc; |
684 | |
|
685 | 0 | linkloc = psprintf("%s/%u", PG_TBLSPC_DIR, ti->oid); |
686 | | |
687 | | /* |
688 | | * Remove the existing symlink, if any, and create the symlink |
689 | | * under PGDATA. |
690 | | */ |
691 | 0 | remove_tablespace_symlink(linkloc); |
692 | |
|
693 | 0 | if (symlink(ti->path, linkloc) < 0) |
694 | 0 | ereport(ERROR, |
695 | 0 | (errcode_for_file_access(), |
696 | 0 | errmsg("could not create symbolic link \"%s\": %m", |
697 | 0 | linkloc))); |
698 | | |
699 | 0 | pfree(ti->path); |
700 | 0 | pfree(ti); |
701 | 0 | } |
702 | | |
703 | | /* tell the caller to delete it later */ |
704 | 0 | haveTblspcMap = true; |
705 | 0 | } |
706 | | |
707 | | /* tell the caller to delete it later */ |
708 | 0 | haveBackupLabel = true; |
709 | 0 | } |
710 | 0 | else |
711 | 0 | { |
712 | | /* No backup_label file has been found if we are here. */ |
713 | | |
714 | | /* |
715 | | * If tablespace_map file is present without backup_label file, there |
716 | | * is no use of such file. There is no harm in retaining it, but it |
717 | | * is better to get rid of the map file so that we don't have any |
718 | | * redundant file in data directory and it will avoid any sort of |
719 | | * confusion. It seems prudent though to just rename the file out of |
720 | | * the way rather than delete it completely, also we ignore any error |
721 | | * that occurs in rename operation as even if map file is present |
722 | | * without backup_label file, it is harmless. |
723 | | */ |
724 | 0 | if (stat(TABLESPACE_MAP, &st) == 0) |
725 | 0 | { |
726 | 0 | unlink(TABLESPACE_MAP_OLD); |
727 | 0 | if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0) |
728 | 0 | ereport(LOG, |
729 | 0 | (errmsg("ignoring file \"%s\" because no file \"%s\" exists", |
730 | 0 | TABLESPACE_MAP, BACKUP_LABEL_FILE), |
731 | 0 | errdetail("File \"%s\" was renamed to \"%s\".", |
732 | 0 | TABLESPACE_MAP, TABLESPACE_MAP_OLD))); |
733 | 0 | else |
734 | 0 | ereport(LOG, |
735 | 0 | (errmsg("ignoring file \"%s\" because no file \"%s\" exists", |
736 | 0 | TABLESPACE_MAP, BACKUP_LABEL_FILE), |
737 | 0 | errdetail("Could not rename file \"%s\" to \"%s\": %m.", |
738 | 0 | TABLESPACE_MAP, TABLESPACE_MAP_OLD))); |
739 | 0 | } |
740 | | |
741 | | /* |
742 | | * It's possible that archive recovery was requested, but we don't |
743 | | * know how far we need to replay the WAL before we reach consistency. |
744 | | * This can happen for example if a base backup is taken from a |
745 | | * running server using an atomic filesystem snapshot, without calling |
746 | | * pg_backup_start/stop. Or if you just kill a running primary server |
747 | | * and put it into archive recovery by creating a recovery signal |
748 | | * file. |
749 | | * |
750 | | * Our strategy in that case is to perform crash recovery first, |
751 | | * replaying all the WAL present in pg_wal, and only enter archive |
752 | | * recovery after that. |
753 | | * |
754 | | * But usually we already know how far we need to replay the WAL (up |
755 | | * to minRecoveryPoint, up to backupEndPoint, or until we see an |
756 | | * end-of-backup record), and we can enter archive recovery directly. |
757 | | */ |
758 | 0 | if (ArchiveRecoveryRequested && |
759 | 0 | (ControlFile->minRecoveryPoint != InvalidXLogRecPtr || |
760 | 0 | ControlFile->backupEndRequired || |
761 | 0 | ControlFile->backupEndPoint != InvalidXLogRecPtr || |
762 | 0 | ControlFile->state == DB_SHUTDOWNED)) |
763 | 0 | { |
764 | 0 | InArchiveRecovery = true; |
765 | 0 | if (StandbyModeRequested) |
766 | 0 | EnableStandbyMode(); |
767 | 0 | } |
768 | | |
769 | | /* |
770 | | * For the same reason as when starting up with backup_label present, |
771 | | * emit a log message when we continue initializing from a base |
772 | | * backup. |
773 | | */ |
774 | 0 | if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint)) |
775 | 0 | ereport(LOG, |
776 | 0 | errmsg("restarting backup recovery with redo LSN %X/%08X", |
777 | 0 | LSN_FORMAT_ARGS(ControlFile->backupStartPoint))); |
778 | | |
779 | | /* Get the last valid checkpoint record. */ |
780 | 0 | CheckPointLoc = ControlFile->checkPoint; |
781 | 0 | CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; |
782 | 0 | RedoStartLSN = ControlFile->checkPointCopy.redo; |
783 | 0 | RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID; |
784 | 0 | record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, |
785 | 0 | CheckPointTLI); |
786 | 0 | if (record != NULL) |
787 | 0 | { |
788 | 0 | ereport(DEBUG1, |
789 | 0 | errmsg_internal("checkpoint record is at %X/%08X", |
790 | 0 | LSN_FORMAT_ARGS(CheckPointLoc))); |
791 | 0 | } |
792 | 0 | else |
793 | 0 | { |
794 | | /* |
795 | | * We used to attempt to go back to a secondary checkpoint record |
796 | | * here, but only when not in standby mode. We now just fail if we |
797 | | * can't read the last checkpoint because this allows us to |
798 | | * simplify processing around checkpoints. |
799 | | */ |
800 | 0 | ereport(PANIC, |
801 | 0 | errmsg("could not locate a valid checkpoint record at %X/%08X", |
802 | 0 | LSN_FORMAT_ARGS(CheckPointLoc))); |
803 | 0 | } |
804 | 0 | memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); |
805 | 0 | wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN); |
806 | 0 | } |
807 | | |
808 | 0 | if (ArchiveRecoveryRequested) |
809 | 0 | { |
810 | 0 | if (StandbyModeRequested) |
811 | 0 | ereport(LOG, |
812 | 0 | (errmsg("entering standby mode"))); |
813 | 0 | else if (recoveryTarget == RECOVERY_TARGET_XID) |
814 | 0 | ereport(LOG, |
815 | 0 | (errmsg("starting point-in-time recovery to XID %u", |
816 | 0 | recoveryTargetXid))); |
817 | 0 | else if (recoveryTarget == RECOVERY_TARGET_TIME) |
818 | 0 | ereport(LOG, |
819 | 0 | (errmsg("starting point-in-time recovery to %s", |
820 | 0 | timestamptz_to_str(recoveryTargetTime)))); |
821 | 0 | else if (recoveryTarget == RECOVERY_TARGET_NAME) |
822 | 0 | ereport(LOG, |
823 | 0 | (errmsg("starting point-in-time recovery to \"%s\"", |
824 | 0 | recoveryTargetName))); |
825 | 0 | else if (recoveryTarget == RECOVERY_TARGET_LSN) |
826 | 0 | ereport(LOG, |
827 | 0 | errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%08X\"", |
828 | 0 | LSN_FORMAT_ARGS(recoveryTargetLSN))); |
829 | 0 | else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) |
830 | 0 | ereport(LOG, |
831 | 0 | (errmsg("starting point-in-time recovery to earliest consistent point"))); |
832 | 0 | else |
833 | 0 | ereport(LOG, |
834 | 0 | (errmsg("starting archive recovery"))); |
835 | 0 | } |
836 | | |
837 | | /* |
838 | | * If the location of the checkpoint record is not on the expected |
839 | | * timeline in the history of the requested timeline, we cannot proceed: |
840 | | * the backup is not part of the history of the requested timeline. |
841 | | */ |
842 | 0 | Assert(expectedTLEs); /* was initialized by reading checkpoint |
843 | | * record */ |
844 | 0 | if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) != |
845 | 0 | CheckPointTLI) |
846 | 0 | { |
847 | 0 | XLogRecPtr switchpoint; |
848 | | |
849 | | /* |
850 | | * tliSwitchPoint will throw an error if the checkpoint's timeline is |
851 | | * not in expectedTLEs at all. |
852 | | */ |
853 | 0 | switchpoint = tliSwitchPoint(CheckPointTLI, expectedTLEs, NULL); |
854 | 0 | ereport(FATAL, |
855 | 0 | (errmsg("requested timeline %u is not a child of this server's history", |
856 | 0 | recoveryTargetTLI), |
857 | | /* translator: %s is a backup_label file or a pg_control file */ |
858 | 0 | errdetail("Latest checkpoint in file \"%s\" is at %X/%08X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%08X.", |
859 | 0 | haveBackupLabel ? "backup_label" : "pg_control", |
860 | 0 | LSN_FORMAT_ARGS(CheckPointLoc), |
861 | 0 | CheckPointTLI, |
862 | 0 | LSN_FORMAT_ARGS(switchpoint)))); |
863 | 0 | } |
864 | | |
865 | | /* |
866 | | * The min recovery point should be part of the requested timeline's |
867 | | * history, too. |
868 | | */ |
869 | 0 | if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) && |
870 | 0 | tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) != |
871 | 0 | ControlFile->minRecoveryPointTLI) |
872 | 0 | ereport(FATAL, |
873 | 0 | errmsg("requested timeline %u does not contain minimum recovery point %X/%08X on timeline %u", |
874 | 0 | recoveryTargetTLI, |
875 | 0 | LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint), |
876 | 0 | ControlFile->minRecoveryPointTLI)); |
877 | | |
878 | 0 | ereport(DEBUG1, |
879 | 0 | errmsg_internal("redo record is at %X/%08X; shutdown %s", |
880 | 0 | LSN_FORMAT_ARGS(checkPoint.redo), |
881 | 0 | wasShutdown ? "true" : "false")); |
882 | 0 | ereport(DEBUG1, |
883 | 0 | (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u", |
884 | 0 | U64FromFullTransactionId(checkPoint.nextXid), |
885 | 0 | checkPoint.nextOid))); |
886 | 0 | ereport(DEBUG1, |
887 | 0 | (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u", |
888 | 0 | checkPoint.nextMulti, checkPoint.nextMultiOffset))); |
889 | 0 | ereport(DEBUG1, |
890 | 0 | (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u", |
891 | 0 | checkPoint.oldestXid, checkPoint.oldestXidDB))); |
892 | 0 | ereport(DEBUG1, |
893 | 0 | (errmsg_internal("oldest MultiXactId: %u, in database %u", |
894 | 0 | checkPoint.oldestMulti, checkPoint.oldestMultiDB))); |
895 | 0 | ereport(DEBUG1, |
896 | 0 | (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u", |
897 | 0 | checkPoint.oldestCommitTsXid, |
898 | 0 | checkPoint.newestCommitTsXid))); |
899 | 0 | if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid))) |
900 | 0 | ereport(PANIC, |
901 | 0 | (errmsg("invalid next transaction ID"))); |
902 | | |
903 | | /* sanity check */ |
904 | 0 | if (checkPoint.redo > CheckPointLoc) |
905 | 0 | ereport(PANIC, |
906 | 0 | (errmsg("invalid redo in checkpoint record"))); |
907 | | |
908 | | /* |
909 | | * Check whether we need to force recovery from WAL. If it appears to |
910 | | * have been a clean shutdown and we did not have a recovery signal file, |
911 | | * then assume no recovery needed. |
912 | | */ |
913 | 0 | if (checkPoint.redo < CheckPointLoc) |
914 | 0 | { |
915 | 0 | if (wasShutdown) |
916 | 0 | ereport(PANIC, |
917 | 0 | (errmsg("invalid redo record in shutdown checkpoint"))); |
918 | 0 | InRecovery = true; |
919 | 0 | } |
920 | 0 | else if (ControlFile->state != DB_SHUTDOWNED) |
921 | 0 | InRecovery = true; |
922 | 0 | else if (ArchiveRecoveryRequested) |
923 | 0 | { |
924 | | /* force recovery due to presence of recovery signal file */ |
925 | 0 | InRecovery = true; |
926 | 0 | } |
927 | | |
928 | | /* |
929 | | * If recovery is needed, update our in-memory copy of pg_control to show |
930 | | * that we are recovering and to show the selected checkpoint as the place |
931 | | * we are starting from. We also mark pg_control with any minimum recovery |
932 | | * stop point obtained from a backup history file. |
933 | | * |
934 | | * We don't write the changes to disk yet, though. Only do that after |
935 | | * initializing various subsystems. |
936 | | */ |
937 | 0 | if (InRecovery) |
938 | 0 | { |
939 | 0 | if (InArchiveRecovery) |
940 | 0 | { |
941 | 0 | ControlFile->state = DB_IN_ARCHIVE_RECOVERY; |
942 | 0 | } |
943 | 0 | else |
944 | 0 | { |
945 | 0 | ereport(LOG, |
946 | 0 | (errmsg("database system was not properly shut down; " |
947 | 0 | "automatic recovery in progress"))); |
948 | 0 | if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID) |
949 | 0 | ereport(LOG, |
950 | 0 | (errmsg("crash recovery starts in timeline %u " |
951 | 0 | "and has target timeline %u", |
952 | 0 | ControlFile->checkPointCopy.ThisTimeLineID, |
953 | 0 | recoveryTargetTLI))); |
954 | 0 | ControlFile->state = DB_IN_CRASH_RECOVERY; |
955 | 0 | } |
956 | 0 | ControlFile->checkPoint = CheckPointLoc; |
957 | 0 | ControlFile->checkPointCopy = checkPoint; |
958 | 0 | if (InArchiveRecovery) |
959 | 0 | { |
960 | | /* initialize minRecoveryPoint if not set yet */ |
961 | 0 | if (ControlFile->minRecoveryPoint < checkPoint.redo) |
962 | 0 | { |
963 | 0 | ControlFile->minRecoveryPoint = checkPoint.redo; |
964 | 0 | ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID; |
965 | 0 | } |
966 | 0 | } |
967 | | |
968 | | /* |
969 | | * Set backupStartPoint if we're starting recovery from a base backup. |
970 | | * |
971 | | * Also set backupEndPoint and use minRecoveryPoint as the backup end |
972 | | * location if we're starting recovery from a base backup which was |
973 | | * taken from a standby. In this case, the database system status in |
974 | | * pg_control must indicate that the database was already in recovery. |
975 | | * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be |
976 | | * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted |
977 | | * before reaching this point; e.g. because restore_command or |
978 | | * primary_conninfo were faulty. |
979 | | * |
980 | | * Any other state indicates that the backup somehow became corrupted |
981 | | * and we can't sensibly continue with recovery. |
982 | | */ |
983 | 0 | if (haveBackupLabel) |
984 | 0 | { |
985 | 0 | ControlFile->backupStartPoint = checkPoint.redo; |
986 | 0 | ControlFile->backupEndRequired = backupEndRequired; |
987 | |
|
988 | 0 | if (backupFromStandby) |
989 | 0 | { |
990 | 0 | if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY && |
991 | 0 | dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY) |
992 | 0 | ereport(FATAL, |
993 | 0 | (errmsg("backup_label contains data inconsistent with control file"), |
994 | 0 | errhint("This means that the backup is corrupted and you will " |
995 | 0 | "have to use another backup for recovery."))); |
996 | 0 | ControlFile->backupEndPoint = ControlFile->minRecoveryPoint; |
997 | 0 | } |
998 | 0 | } |
999 | 0 | } |
1000 | | |
1001 | | /* remember these, so that we know when we have reached consistency */ |
1002 | 0 | backupStartPoint = ControlFile->backupStartPoint; |
1003 | 0 | backupEndRequired = ControlFile->backupEndRequired; |
1004 | 0 | backupEndPoint = ControlFile->backupEndPoint; |
1005 | 0 | if (InArchiveRecovery) |
1006 | 0 | { |
1007 | 0 | minRecoveryPoint = ControlFile->minRecoveryPoint; |
1008 | 0 | minRecoveryPointTLI = ControlFile->minRecoveryPointTLI; |
1009 | 0 | } |
1010 | 0 | else |
1011 | 0 | { |
1012 | 0 | minRecoveryPoint = InvalidXLogRecPtr; |
1013 | 0 | minRecoveryPointTLI = 0; |
1014 | 0 | } |
1015 | | |
1016 | | /* |
1017 | | * Start recovery assuming that the final record isn't lost. |
1018 | | */ |
1019 | 0 | abortedRecPtr = InvalidXLogRecPtr; |
1020 | 0 | missingContrecPtr = InvalidXLogRecPtr; |
1021 | |
|
1022 | 0 | *wasShutdown_ptr = wasShutdown; |
1023 | 0 | *haveBackupLabel_ptr = haveBackupLabel; |
1024 | 0 | *haveTblspcMap_ptr = haveTblspcMap; |
1025 | 0 | } |
1026 | | |
1027 | | /* |
1028 | | * See if there are any recovery signal files and if so, set state for |
1029 | | * recovery. |
1030 | | * |
1031 | | * See if there is a recovery command file (recovery.conf), and if so |
1032 | | * throw an ERROR since as of PG12 we no longer recognize that. |
1033 | | */ |
1034 | | static void |
1035 | | readRecoverySignalFile(void) |
1036 | 0 | { |
1037 | 0 | struct stat stat_buf; |
1038 | |
|
1039 | 0 | if (IsBootstrapProcessingMode()) |
1040 | 0 | return; |
1041 | | |
1042 | | /* |
1043 | | * Check for old recovery API file: recovery.conf |
1044 | | */ |
1045 | 0 | if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0) |
1046 | 0 | ereport(FATAL, |
1047 | 0 | (errcode_for_file_access(), |
1048 | 0 | errmsg("using recovery command file \"%s\" is not supported", |
1049 | 0 | RECOVERY_COMMAND_FILE))); |
1050 | | |
1051 | | /* |
1052 | | * Remove unused .done file, if present. Ignore if absent. |
1053 | | */ |
1054 | 0 | unlink(RECOVERY_COMMAND_DONE); |
1055 | | |
1056 | | /* |
1057 | | * Check for recovery signal files and if found, fsync them since they |
1058 | | * represent server state information. We don't sweat too much about the |
1059 | | * possibility of fsync failure, however. |
1060 | | * |
1061 | | * If present, standby signal file takes precedence. If neither is present |
1062 | | * then we won't enter archive recovery. |
1063 | | */ |
1064 | 0 | if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0) |
1065 | 0 | { |
1066 | 0 | int fd; |
1067 | |
|
1068 | 0 | fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY, |
1069 | 0 | S_IRUSR | S_IWUSR); |
1070 | 0 | if (fd >= 0) |
1071 | 0 | { |
1072 | 0 | (void) pg_fsync(fd); |
1073 | 0 | close(fd); |
1074 | 0 | } |
1075 | 0 | standby_signal_file_found = true; |
1076 | 0 | } |
1077 | 0 | else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0) |
1078 | 0 | { |
1079 | 0 | int fd; |
1080 | |
|
1081 | 0 | fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY, |
1082 | 0 | S_IRUSR | S_IWUSR); |
1083 | 0 | if (fd >= 0) |
1084 | 0 | { |
1085 | 0 | (void) pg_fsync(fd); |
1086 | 0 | close(fd); |
1087 | 0 | } |
1088 | 0 | recovery_signal_file_found = true; |
1089 | 0 | } |
1090 | |
|
1091 | 0 | StandbyModeRequested = false; |
1092 | 0 | ArchiveRecoveryRequested = false; |
1093 | 0 | if (standby_signal_file_found) |
1094 | 0 | { |
1095 | 0 | StandbyModeRequested = true; |
1096 | 0 | ArchiveRecoveryRequested = true; |
1097 | 0 | } |
1098 | 0 | else if (recovery_signal_file_found) |
1099 | 0 | { |
1100 | 0 | StandbyModeRequested = false; |
1101 | 0 | ArchiveRecoveryRequested = true; |
1102 | 0 | } |
1103 | 0 | else |
1104 | 0 | return; |
1105 | | |
1106 | | /* |
1107 | | * We don't support standby mode in standalone backends; that requires |
1108 | | * other processes such as the WAL receiver to be alive. |
1109 | | */ |
1110 | 0 | if (StandbyModeRequested && !IsUnderPostmaster) |
1111 | 0 | ereport(FATAL, |
1112 | 0 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
1113 | 0 | errmsg("standby mode is not supported by single-user servers"))); |
1114 | 0 | } |
1115 | | |
1116 | | static void |
1117 | | validateRecoveryParameters(void) |
1118 | 0 | { |
1119 | 0 | if (!ArchiveRecoveryRequested) |
1120 | 0 | return; |
1121 | | |
1122 | | /* |
1123 | | * Check for compulsory parameters |
1124 | | */ |
1125 | 0 | if (StandbyModeRequested) |
1126 | 0 | { |
1127 | 0 | if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) && |
1128 | 0 | (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0)) |
1129 | 0 | ereport(WARNING, |
1130 | 0 | (errmsg("specified neither \"primary_conninfo\" nor \"restore_command\""), |
1131 | 0 | errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there."))); |
1132 | 0 | } |
1133 | 0 | else |
1134 | 0 | { |
1135 | 0 | if (recoveryRestoreCommand == NULL || |
1136 | 0 | strcmp(recoveryRestoreCommand, "") == 0) |
1137 | 0 | ereport(FATAL, |
1138 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
1139 | 0 | errmsg("must specify \"restore_command\" when standby mode is not enabled"))); |
1140 | 0 | } |
1141 | | |
1142 | | /* |
1143 | | * Override any inconsistent requests. Note that this is a change of |
1144 | | * behaviour in 9.5; prior to this we simply ignored a request to pause if |
1145 | | * hot_standby = off, which was surprising behaviour. |
1146 | | */ |
1147 | 0 | if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE && |
1148 | 0 | !EnableHotStandby) |
1149 | 0 | recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN; |
1150 | | |
1151 | | /* |
1152 | | * Final parsing of recovery_target_time string; see also |
1153 | | * check_recovery_target_time(). |
1154 | | */ |
1155 | 0 | if (recoveryTarget == RECOVERY_TARGET_TIME) |
1156 | 0 | { |
1157 | 0 | recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, |
1158 | 0 | CStringGetDatum(recovery_target_time_string), |
1159 | 0 | ObjectIdGetDatum(InvalidOid), |
1160 | 0 | Int32GetDatum(-1))); |
1161 | 0 | } |
1162 | | |
1163 | | /* |
1164 | | * If user specified recovery_target_timeline, validate it or compute the |
1165 | | * "latest" value. We can't do this until after we've gotten the restore |
1166 | | * command and set InArchiveRecovery, because we need to fetch timeline |
1167 | | * history files from the archive. |
1168 | | */ |
1169 | 0 | if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC) |
1170 | 0 | { |
1171 | 0 | TimeLineID rtli = recoveryTargetTLIRequested; |
1172 | | |
1173 | | /* Timeline 1 does not have a history file, all else should */ |
1174 | 0 | if (rtli != 1 && !existsTimeLineHistory(rtli)) |
1175 | 0 | ereport(FATAL, |
1176 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
1177 | 0 | errmsg("recovery target timeline %u does not exist", |
1178 | 0 | rtli))); |
1179 | 0 | recoveryTargetTLI = rtli; |
1180 | 0 | } |
1181 | 0 | else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) |
1182 | 0 | { |
1183 | | /* We start the "latest" search from pg_control's timeline */ |
1184 | 0 | recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI); |
1185 | 0 | } |
1186 | 0 | else |
1187 | 0 | { |
1188 | | /* |
1189 | | * else we just use the recoveryTargetTLI as already read from |
1190 | | * ControlFile |
1191 | | */ |
1192 | 0 | Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE); |
1193 | 0 | } |
1194 | 0 | } |
1195 | | |
1196 | | /* |
1197 | | * read_backup_label: check to see if a backup_label file is present |
1198 | | * |
1199 | | * If we see a backup_label during recovery, we assume that we are recovering |
1200 | | * from a backup dump file, and we therefore roll forward from the checkpoint |
1201 | | * identified by the label file, NOT what pg_control says. This avoids the |
1202 | | * problem that pg_control might have been archived one or more checkpoints |
1203 | | * later than the start of the dump, and so if we rely on it as the start |
1204 | | * point, we will fail to restore a consistent database state. |
1205 | | * |
1206 | | * Returns true if a backup_label was found (and fills the checkpoint |
1207 | | * location and TLI into *checkPointLoc and *backupLabelTLI, respectively); |
1208 | | * returns false if not. If this backup_label came from a streamed backup, |
1209 | | * *backupEndRequired is set to true. If this backup_label was created during |
1210 | | * recovery, *backupFromStandby is set to true. |
1211 | | * |
1212 | | * Also sets the global variables RedoStartLSN and RedoStartTLI with the LSN |
1213 | | * and TLI read from the backup file. |
1214 | | */ |
1215 | | static bool |
1216 | | read_backup_label(XLogRecPtr *checkPointLoc, TimeLineID *backupLabelTLI, |
1217 | | bool *backupEndRequired, bool *backupFromStandby) |
1218 | 0 | { |
1219 | 0 | char startxlogfilename[MAXFNAMELEN]; |
1220 | 0 | TimeLineID tli_from_walseg, |
1221 | 0 | tli_from_file; |
1222 | 0 | FILE *lfp; |
1223 | 0 | char ch; |
1224 | 0 | char backuptype[20]; |
1225 | 0 | char backupfrom[20]; |
1226 | 0 | char backuplabel[MAXPGPATH]; |
1227 | 0 | char backuptime[128]; |
1228 | 0 | uint32 hi, |
1229 | 0 | lo; |
1230 | | |
1231 | | /* suppress possible uninitialized-variable warnings */ |
1232 | 0 | *checkPointLoc = InvalidXLogRecPtr; |
1233 | 0 | *backupLabelTLI = 0; |
1234 | 0 | *backupEndRequired = false; |
1235 | 0 | *backupFromStandby = false; |
1236 | | |
1237 | | /* |
1238 | | * See if label file is present |
1239 | | */ |
1240 | 0 | lfp = AllocateFile(BACKUP_LABEL_FILE, "r"); |
1241 | 0 | if (!lfp) |
1242 | 0 | { |
1243 | 0 | if (errno != ENOENT) |
1244 | 0 | ereport(FATAL, |
1245 | 0 | (errcode_for_file_access(), |
1246 | 0 | errmsg("could not read file \"%s\": %m", |
1247 | 0 | BACKUP_LABEL_FILE))); |
1248 | 0 | return false; /* it's not there, all is fine */ |
1249 | 0 | } |
1250 | | |
1251 | | /* |
1252 | | * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code |
1253 | | * is pretty crude, but we are not expecting any variability in the file |
1254 | | * format). |
1255 | | */ |
1256 | 0 | if (fscanf(lfp, "START WAL LOCATION: %X/%08X (file %08X%16s)%c", |
1257 | 0 | &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n') |
1258 | 0 | ereport(FATAL, |
1259 | 0 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
1260 | 0 | errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); |
1261 | 0 | RedoStartLSN = ((uint64) hi) << 32 | lo; |
1262 | 0 | RedoStartTLI = tli_from_walseg; |
1263 | 0 | if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%08X%c", |
1264 | 0 | &hi, &lo, &ch) != 3 || ch != '\n') |
1265 | 0 | ereport(FATAL, |
1266 | 0 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
1267 | 0 | errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE))); |
1268 | 0 | *checkPointLoc = ((uint64) hi) << 32 | lo; |
1269 | 0 | *backupLabelTLI = tli_from_walseg; |
1270 | | |
1271 | | /* |
1272 | | * BACKUP METHOD lets us know if this was a typical backup ("streamed", |
1273 | | * which could mean either pg_basebackup or the pg_backup_start/stop |
1274 | | * method was used) or if this label came from somewhere else (the only |
1275 | | * other option today being from pg_rewind). If this was a streamed |
1276 | | * backup then we know that we need to play through until we get to the |
1277 | | * end of the WAL which was generated during the backup (at which point we |
1278 | | * will have reached consistency and backupEndRequired will be reset to be |
1279 | | * false). |
1280 | | */ |
1281 | 0 | if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1) |
1282 | 0 | { |
1283 | 0 | if (strcmp(backuptype, "streamed") == 0) |
1284 | 0 | *backupEndRequired = true; |
1285 | 0 | } |
1286 | | |
1287 | | /* |
1288 | | * BACKUP FROM lets us know if this was from a primary or a standby. If |
1289 | | * it was from a standby, we'll double-check that the control file state |
1290 | | * matches that of a standby. |
1291 | | */ |
1292 | 0 | if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1) |
1293 | 0 | { |
1294 | 0 | if (strcmp(backupfrom, "standby") == 0) |
1295 | 0 | *backupFromStandby = true; |
1296 | 0 | } |
1297 | | |
1298 | | /* |
1299 | | * Parse START TIME and LABEL. Those are not mandatory fields for recovery |
1300 | | * but checking for their presence is useful for debugging and the next |
1301 | | * sanity checks. Cope also with the fact that the result buffers have a |
1302 | | * pre-allocated size, hence if the backup_label file has been generated |
1303 | | * with strings longer than the maximum assumed here an incorrect parsing |
1304 | | * happens. That's fine as only minor consistency checks are done |
1305 | | * afterwards. |
1306 | | */ |
1307 | 0 | if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1) |
1308 | 0 | ereport(DEBUG1, |
1309 | 0 | (errmsg_internal("backup time %s in file \"%s\"", |
1310 | 0 | backuptime, BACKUP_LABEL_FILE))); |
1311 | | |
1312 | 0 | if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1) |
1313 | 0 | ereport(DEBUG1, |
1314 | 0 | (errmsg_internal("backup label %s in file \"%s\"", |
1315 | 0 | backuplabel, BACKUP_LABEL_FILE))); |
1316 | | |
1317 | | /* |
1318 | | * START TIMELINE is new as of 11. Its parsing is not mandatory, still use |
1319 | | * it as a sanity check if present. |
1320 | | */ |
1321 | 0 | if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1) |
1322 | 0 | { |
1323 | 0 | if (tli_from_walseg != tli_from_file) |
1324 | 0 | ereport(FATAL, |
1325 | 0 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
1326 | 0 | errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE), |
1327 | 0 | errdetail("Timeline ID parsed is %u, but expected %u.", |
1328 | 0 | tli_from_file, tli_from_walseg))); |
1329 | | |
1330 | 0 | ereport(DEBUG1, |
1331 | 0 | (errmsg_internal("backup timeline %u in file \"%s\"", |
1332 | 0 | tli_from_file, BACKUP_LABEL_FILE))); |
1333 | 0 | } |
1334 | | |
1335 | 0 | if (fscanf(lfp, "INCREMENTAL FROM LSN: %X/%08X\n", &hi, &lo) > 0) |
1336 | 0 | ereport(FATAL, |
1337 | 0 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
1338 | 0 | errmsg("this is an incremental backup, not a data directory"), |
1339 | 0 | errhint("Use pg_combinebackup to reconstruct a valid data directory."))); |
1340 | | |
1341 | 0 | if (ferror(lfp) || FreeFile(lfp)) |
1342 | 0 | ereport(FATAL, |
1343 | 0 | (errcode_for_file_access(), |
1344 | 0 | errmsg("could not read file \"%s\": %m", |
1345 | 0 | BACKUP_LABEL_FILE))); |
1346 | | |
1347 | 0 | return true; |
1348 | 0 | } |
1349 | | |
1350 | | /* |
1351 | | * read_tablespace_map: check to see if a tablespace_map file is present |
1352 | | * |
1353 | | * If we see a tablespace_map file during recovery, we assume that we are |
1354 | | * recovering from a backup dump file, and we therefore need to create symlinks |
1355 | | * as per the information present in tablespace_map file. |
1356 | | * |
1357 | | * Returns true if a tablespace_map file was found (and fills *tablespaces |
1358 | | * with a tablespaceinfo struct for each tablespace listed in the file); |
1359 | | * returns false if not. |
1360 | | */ |
1361 | | static bool |
1362 | | read_tablespace_map(List **tablespaces) |
1363 | 0 | { |
1364 | 0 | tablespaceinfo *ti; |
1365 | 0 | FILE *lfp; |
1366 | 0 | char str[MAXPGPATH]; |
1367 | 0 | int ch, |
1368 | 0 | i, |
1369 | 0 | n; |
1370 | 0 | bool was_backslash; |
1371 | | |
1372 | | /* |
1373 | | * See if tablespace_map file is present |
1374 | | */ |
1375 | 0 | lfp = AllocateFile(TABLESPACE_MAP, "r"); |
1376 | 0 | if (!lfp) |
1377 | 0 | { |
1378 | 0 | if (errno != ENOENT) |
1379 | 0 | ereport(FATAL, |
1380 | 0 | (errcode_for_file_access(), |
1381 | 0 | errmsg("could not read file \"%s\": %m", |
1382 | 0 | TABLESPACE_MAP))); |
1383 | 0 | return false; /* it's not there, all is fine */ |
1384 | 0 | } |
1385 | | |
1386 | | /* |
1387 | | * Read and parse the link name and path lines from tablespace_map file |
1388 | | * (this code is pretty crude, but we are not expecting any variability in |
1389 | | * the file format). De-escape any backslashes that were inserted. |
1390 | | */ |
1391 | 0 | i = 0; |
1392 | 0 | was_backslash = false; |
1393 | 0 | while ((ch = fgetc(lfp)) != EOF) |
1394 | 0 | { |
1395 | 0 | if (!was_backslash && (ch == '\n' || ch == '\r')) |
1396 | 0 | { |
1397 | 0 | char *endp; |
1398 | |
|
1399 | 0 | if (i == 0) |
1400 | 0 | continue; /* \r immediately followed by \n */ |
1401 | | |
1402 | | /* |
1403 | | * The de-escaped line should contain an OID followed by exactly |
1404 | | * one space followed by a path. The path might start with |
1405 | | * spaces, so don't be too liberal about parsing. |
1406 | | */ |
1407 | 0 | str[i] = '\0'; |
1408 | 0 | n = 0; |
1409 | 0 | while (str[n] && str[n] != ' ') |
1410 | 0 | n++; |
1411 | 0 | if (n < 1 || n >= i - 1) |
1412 | 0 | ereport(FATAL, |
1413 | 0 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
1414 | 0 | errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); |
1415 | 0 | str[n++] = '\0'; |
1416 | |
|
1417 | 0 | ti = palloc0(sizeof(tablespaceinfo)); |
1418 | 0 | errno = 0; |
1419 | 0 | ti->oid = strtoul(str, &endp, 10); |
1420 | 0 | if (*endp != '\0' || errno == EINVAL || errno == ERANGE) |
1421 | 0 | ereport(FATAL, |
1422 | 0 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
1423 | 0 | errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); |
1424 | 0 | ti->path = pstrdup(str + n); |
1425 | 0 | *tablespaces = lappend(*tablespaces, ti); |
1426 | |
|
1427 | 0 | i = 0; |
1428 | 0 | continue; |
1429 | 0 | } |
1430 | 0 | else if (!was_backslash && ch == '\\') |
1431 | 0 | was_backslash = true; |
1432 | 0 | else |
1433 | 0 | { |
1434 | 0 | if (i < sizeof(str) - 1) |
1435 | 0 | str[i++] = ch; |
1436 | 0 | was_backslash = false; |
1437 | 0 | } |
1438 | 0 | } |
1439 | | |
1440 | 0 | if (i != 0 || was_backslash) /* last line not terminated? */ |
1441 | 0 | ereport(FATAL, |
1442 | 0 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
1443 | 0 | errmsg("invalid data in file \"%s\"", TABLESPACE_MAP))); |
1444 | | |
1445 | 0 | if (ferror(lfp) || FreeFile(lfp)) |
1446 | 0 | ereport(FATAL, |
1447 | 0 | (errcode_for_file_access(), |
1448 | 0 | errmsg("could not read file \"%s\": %m", |
1449 | 0 | TABLESPACE_MAP))); |
1450 | | |
1451 | 0 | return true; |
1452 | 0 | } |
1453 | | |
1454 | | /* |
1455 | | * Finish WAL recovery. |
1456 | | * |
1457 | | * This does not close the 'xlogreader' yet, because in some cases the caller |
1458 | | * still wants to re-read the last checkpoint record by calling |
1459 | | * ReadCheckpointRecord(). |
1460 | | * |
1461 | | * Returns the position of the last valid or applied record, after which new |
1462 | | * WAL should be appended, information about why recovery was ended, and some |
1463 | | * other things. See the EndOfWalRecoveryInfo struct for details. |
1464 | | */ |
1465 | | EndOfWalRecoveryInfo * |
1466 | | FinishWalRecovery(void) |
1467 | 0 | { |
1468 | 0 | EndOfWalRecoveryInfo *result = palloc(sizeof(EndOfWalRecoveryInfo)); |
1469 | 0 | XLogRecPtr lastRec; |
1470 | 0 | TimeLineID lastRecTLI; |
1471 | 0 | XLogRecPtr endOfLog; |
1472 | | |
1473 | | /* |
1474 | | * Kill WAL receiver, if it's still running, before we continue to write |
1475 | | * the startup checkpoint and aborted-contrecord records. It will trump |
1476 | | * over these records and subsequent ones if it's still alive when we |
1477 | | * start writing WAL. |
1478 | | */ |
1479 | 0 | XLogShutdownWalRcv(); |
1480 | | |
1481 | | /* |
1482 | | * Shutdown the slot sync worker to drop any temporary slots acquired by |
1483 | | * it and to prevent it from keep trying to fetch the failover slots. |
1484 | | * |
1485 | | * We do not update the 'synced' column in 'pg_replication_slots' system |
1486 | | * view from true to false here, as any failed update could leave 'synced' |
1487 | | * column false for some slots. This could cause issues during slot sync |
1488 | | * after restarting the server as a standby. While updating the 'synced' |
1489 | | * column after switching to the new timeline is an option, it does not |
1490 | | * simplify the handling for the 'synced' column. Therefore, we retain the |
1491 | | * 'synced' column as true after promotion as it may provide useful |
1492 | | * information about the slot origin. |
1493 | | */ |
1494 | 0 | ShutDownSlotSync(); |
1495 | | |
1496 | | /* |
1497 | | * We are now done reading the xlog from stream. Turn off streaming |
1498 | | * recovery to force fetching the files (which would be required at end of |
1499 | | * recovery, e.g., timeline history file) from archive or pg_wal. |
1500 | | * |
1501 | | * Note that standby mode must be turned off after killing WAL receiver, |
1502 | | * i.e., calling XLogShutdownWalRcv(). |
1503 | | */ |
1504 | 0 | Assert(!WalRcvStreaming()); |
1505 | 0 | StandbyMode = false; |
1506 | | |
1507 | | /* |
1508 | | * Determine where to start writing WAL next. |
1509 | | * |
1510 | | * Re-fetch the last valid or last applied record, so we can identify the |
1511 | | * exact endpoint of what we consider the valid portion of WAL. There may |
1512 | | * be an incomplete continuation record after that, in which case |
1513 | | * 'abortedRecPtr' and 'missingContrecPtr' are set and the caller will |
1514 | | * write a special OVERWRITE_CONTRECORD message to mark that the rest of |
1515 | | * it is intentionally missing. See CreateOverwriteContrecordRecord(). |
1516 | | * |
1517 | | * An important side-effect of this is to load the last page into |
1518 | | * xlogreader. The caller uses it to initialize the WAL for writing. |
1519 | | */ |
1520 | 0 | if (!InRecovery) |
1521 | 0 | { |
1522 | 0 | lastRec = CheckPointLoc; |
1523 | 0 | lastRecTLI = CheckPointTLI; |
1524 | 0 | } |
1525 | 0 | else |
1526 | 0 | { |
1527 | 0 | lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr; |
1528 | 0 | lastRecTLI = XLogRecoveryCtl->lastReplayedTLI; |
1529 | 0 | } |
1530 | 0 | XLogPrefetcherBeginRead(xlogprefetcher, lastRec); |
1531 | 0 | (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI); |
1532 | 0 | endOfLog = xlogreader->EndRecPtr; |
1533 | | |
1534 | | /* |
1535 | | * Remember the TLI in the filename of the XLOG segment containing the |
1536 | | * end-of-log. It could be different from the timeline that endOfLog |
1537 | | * nominally belongs to, if there was a timeline switch in that segment, |
1538 | | * and we were reading the old WAL from a segment belonging to a higher |
1539 | | * timeline. |
1540 | | */ |
1541 | 0 | result->endOfLogTLI = xlogreader->seg.ws_tli; |
1542 | |
|
1543 | 0 | if (ArchiveRecoveryRequested) |
1544 | 0 | { |
1545 | | /* |
1546 | | * We are no longer in archive recovery state. |
1547 | | * |
1548 | | * We are now done reading the old WAL. Turn off archive fetching if |
1549 | | * it was active. |
1550 | | */ |
1551 | 0 | Assert(InArchiveRecovery); |
1552 | 0 | InArchiveRecovery = false; |
1553 | | |
1554 | | /* |
1555 | | * If the ending log segment is still open, close it (to avoid |
1556 | | * problems on Windows with trying to rename or delete an open file). |
1557 | | */ |
1558 | 0 | if (readFile >= 0) |
1559 | 0 | { |
1560 | 0 | close(readFile); |
1561 | 0 | readFile = -1; |
1562 | 0 | } |
1563 | 0 | } |
1564 | | |
1565 | | /* |
1566 | | * Copy the last partial block to the caller, for initializing the WAL |
1567 | | * buffer for appending new WAL. |
1568 | | */ |
1569 | 0 | if (endOfLog % XLOG_BLCKSZ != 0) |
1570 | 0 | { |
1571 | 0 | char *page; |
1572 | 0 | int len; |
1573 | 0 | XLogRecPtr pageBeginPtr; |
1574 | |
|
1575 | 0 | pageBeginPtr = endOfLog - (endOfLog % XLOG_BLCKSZ); |
1576 | 0 | Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size)); |
1577 | | |
1578 | | /* Copy the valid part of the last block */ |
1579 | 0 | len = endOfLog % XLOG_BLCKSZ; |
1580 | 0 | page = palloc(len); |
1581 | 0 | memcpy(page, xlogreader->readBuf, len); |
1582 | |
|
1583 | 0 | result->lastPageBeginPtr = pageBeginPtr; |
1584 | 0 | result->lastPage = page; |
1585 | 0 | } |
1586 | 0 | else |
1587 | 0 | { |
1588 | | /* There is no partial block to copy. */ |
1589 | 0 | result->lastPageBeginPtr = endOfLog; |
1590 | 0 | result->lastPage = NULL; |
1591 | 0 | } |
1592 | | |
1593 | | /* |
1594 | | * Create a comment for the history file to explain why and where timeline |
1595 | | * changed. |
1596 | | */ |
1597 | 0 | result->recoveryStopReason = getRecoveryStopReason(); |
1598 | |
|
1599 | 0 | result->lastRec = lastRec; |
1600 | 0 | result->lastRecTLI = lastRecTLI; |
1601 | 0 | result->endOfLog = endOfLog; |
1602 | |
|
1603 | 0 | result->abortedRecPtr = abortedRecPtr; |
1604 | 0 | result->missingContrecPtr = missingContrecPtr; |
1605 | |
|
1606 | 0 | result->standby_signal_file_found = standby_signal_file_found; |
1607 | 0 | result->recovery_signal_file_found = recovery_signal_file_found; |
1608 | |
|
1609 | 0 | return result; |
1610 | 0 | } |
1611 | | |
1612 | | /* |
1613 | | * Clean up the WAL reader and leftovers from restoring WAL from archive |
1614 | | */ |
1615 | | void |
1616 | | ShutdownWalRecovery(void) |
1617 | 0 | { |
1618 | 0 | char recoveryPath[MAXPGPATH]; |
1619 | | |
1620 | | /* Final update of pg_stat_recovery_prefetch. */ |
1621 | 0 | XLogPrefetcherComputeStats(xlogprefetcher); |
1622 | | |
1623 | | /* Shut down xlogreader */ |
1624 | 0 | if (readFile >= 0) |
1625 | 0 | { |
1626 | 0 | close(readFile); |
1627 | 0 | readFile = -1; |
1628 | 0 | } |
1629 | 0 | pfree(xlogreader->private_data); |
1630 | 0 | XLogReaderFree(xlogreader); |
1631 | 0 | XLogPrefetcherFree(xlogprefetcher); |
1632 | |
|
1633 | 0 | if (ArchiveRecoveryRequested) |
1634 | 0 | { |
1635 | | /* |
1636 | | * Since there might be a partial WAL segment named RECOVERYXLOG, get |
1637 | | * rid of it. |
1638 | | */ |
1639 | 0 | snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG"); |
1640 | 0 | unlink(recoveryPath); /* ignore any error */ |
1641 | | |
1642 | | /* Get rid of any remaining recovered timeline-history file, too */ |
1643 | 0 | snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY"); |
1644 | 0 | unlink(recoveryPath); /* ignore any error */ |
1645 | 0 | } |
1646 | | |
1647 | | /* |
1648 | | * We don't need the latch anymore. It's not strictly necessary to disown |
1649 | | * it, but let's do it for the sake of tidiness. |
1650 | | */ |
1651 | 0 | if (ArchiveRecoveryRequested) |
1652 | 0 | DisownLatch(&XLogRecoveryCtl->recoveryWakeupLatch); |
1653 | 0 | } |
1654 | | |
1655 | | /* |
1656 | | * Perform WAL recovery. |
1657 | | * |
1658 | | * If the system was shut down cleanly, this is never called. |
1659 | | */ |
1660 | | void |
1661 | | PerformWalRecovery(void) |
1662 | 0 | { |
1663 | 0 | XLogRecord *record; |
1664 | 0 | bool reachedRecoveryTarget = false; |
1665 | 0 | TimeLineID replayTLI; |
1666 | | |
1667 | | /* |
1668 | | * Initialize shared variables for tracking progress of WAL replay, as if |
1669 | | * we had just replayed the record before the REDO location (or the |
1670 | | * checkpoint record itself, if it's a shutdown checkpoint). |
1671 | | */ |
1672 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
1673 | 0 | if (RedoStartLSN < CheckPointLoc) |
1674 | 0 | { |
1675 | 0 | XLogRecoveryCtl->lastReplayedReadRecPtr = InvalidXLogRecPtr; |
1676 | 0 | XLogRecoveryCtl->lastReplayedEndRecPtr = RedoStartLSN; |
1677 | 0 | XLogRecoveryCtl->lastReplayedTLI = RedoStartTLI; |
1678 | 0 | } |
1679 | 0 | else |
1680 | 0 | { |
1681 | 0 | XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr; |
1682 | 0 | XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr; |
1683 | 0 | XLogRecoveryCtl->lastReplayedTLI = CheckPointTLI; |
1684 | 0 | } |
1685 | 0 | XLogRecoveryCtl->replayEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr; |
1686 | 0 | XLogRecoveryCtl->replayEndTLI = XLogRecoveryCtl->lastReplayedTLI; |
1687 | 0 | XLogRecoveryCtl->recoveryLastXTime = 0; |
1688 | 0 | XLogRecoveryCtl->currentChunkStartTime = 0; |
1689 | 0 | XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED; |
1690 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
1691 | | |
1692 | | /* Also ensure XLogReceiptTime has a sane value */ |
1693 | 0 | XLogReceiptTime = GetCurrentTimestamp(); |
1694 | | |
1695 | | /* |
1696 | | * Let postmaster know we've started redo now, so that it can launch the |
1697 | | * archiver if necessary. |
1698 | | */ |
1699 | 0 | if (IsUnderPostmaster) |
1700 | 0 | SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED); |
1701 | | |
1702 | | /* |
1703 | | * Allow read-only connections immediately if we're consistent already. |
1704 | | */ |
1705 | 0 | CheckRecoveryConsistency(); |
1706 | | |
1707 | | /* |
1708 | | * Find the first record that logically follows the checkpoint --- it |
1709 | | * might physically precede it, though. |
1710 | | */ |
1711 | 0 | if (RedoStartLSN < CheckPointLoc) |
1712 | 0 | { |
1713 | | /* back up to find the record */ |
1714 | 0 | replayTLI = RedoStartTLI; |
1715 | 0 | XLogPrefetcherBeginRead(xlogprefetcher, RedoStartLSN); |
1716 | 0 | record = ReadRecord(xlogprefetcher, PANIC, false, replayTLI); |
1717 | | |
1718 | | /* |
1719 | | * If a checkpoint record's redo pointer points back to an earlier |
1720 | | * LSN, the record at that LSN should be an XLOG_CHECKPOINT_REDO |
1721 | | * record. |
1722 | | */ |
1723 | 0 | if (record->xl_rmid != RM_XLOG_ID || |
1724 | 0 | (record->xl_info & ~XLR_INFO_MASK) != XLOG_CHECKPOINT_REDO) |
1725 | 0 | ereport(FATAL, |
1726 | 0 | errmsg("unexpected record type found at redo point %X/%08X", |
1727 | 0 | LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))); |
1728 | 0 | } |
1729 | 0 | else |
1730 | 0 | { |
1731 | | /* just have to read next record after CheckPoint */ |
1732 | 0 | Assert(xlogreader->ReadRecPtr == CheckPointLoc); |
1733 | 0 | replayTLI = CheckPointTLI; |
1734 | 0 | record = ReadRecord(xlogprefetcher, LOG, false, replayTLI); |
1735 | 0 | } |
1736 | | |
1737 | 0 | if (record != NULL) |
1738 | 0 | { |
1739 | 0 | TimestampTz xtime; |
1740 | 0 | PGRUsage ru0; |
1741 | |
|
1742 | 0 | pg_rusage_init(&ru0); |
1743 | |
|
1744 | 0 | InRedo = true; |
1745 | |
|
1746 | 0 | RmgrStartup(); |
1747 | |
|
1748 | 0 | ereport(LOG, |
1749 | 0 | errmsg("redo starts at %X/%08X", |
1750 | 0 | LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))); |
1751 | | |
1752 | | /* Prepare to report progress of the redo phase. */ |
1753 | 0 | if (!StandbyMode) |
1754 | 0 | begin_startup_progress_phase(); |
1755 | | |
1756 | | /* |
1757 | | * main redo apply loop |
1758 | | */ |
1759 | 0 | do |
1760 | 0 | { |
1761 | 0 | if (!StandbyMode) |
1762 | 0 | ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%08X", |
1763 | 0 | LSN_FORMAT_ARGS(xlogreader->ReadRecPtr)); |
1764 | | |
1765 | | #ifdef WAL_DEBUG |
1766 | | if (XLOG_DEBUG) |
1767 | | { |
1768 | | StringInfoData buf; |
1769 | | |
1770 | | initStringInfo(&buf); |
1771 | | appendStringInfo(&buf, "REDO @ %X/%08X; LSN %X/%08X: ", |
1772 | | LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), |
1773 | | LSN_FORMAT_ARGS(xlogreader->EndRecPtr)); |
1774 | | xlog_outrec(&buf, xlogreader); |
1775 | | appendStringInfoString(&buf, " - "); |
1776 | | xlog_outdesc(&buf, xlogreader); |
1777 | | elog(LOG, "%s", buf.data); |
1778 | | pfree(buf.data); |
1779 | | } |
1780 | | #endif |
1781 | | |
1782 | | /* Handle interrupt signals of startup process */ |
1783 | 0 | ProcessStartupProcInterrupts(); |
1784 | | |
1785 | | /* |
1786 | | * Pause WAL replay, if requested by a hot-standby session via |
1787 | | * SetRecoveryPause(). |
1788 | | * |
1789 | | * Note that we intentionally don't take the info_lck spinlock |
1790 | | * here. We might therefore read a slightly stale value of the |
1791 | | * recoveryPause flag, but it can't be very stale (no worse than |
1792 | | * the last spinlock we did acquire). Since a pause request is a |
1793 | | * pretty asynchronous thing anyway, possibly responding to it one |
1794 | | * WAL record later than we otherwise would is a minor issue, so |
1795 | | * it doesn't seem worth adding another spinlock cycle to prevent |
1796 | | * that. |
1797 | | */ |
1798 | 0 | if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState != |
1799 | 0 | RECOVERY_NOT_PAUSED) |
1800 | 0 | recoveryPausesHere(false); |
1801 | | |
1802 | | /* |
1803 | | * Have we reached our recovery target? |
1804 | | */ |
1805 | 0 | if (recoveryStopsBefore(xlogreader)) |
1806 | 0 | { |
1807 | 0 | reachedRecoveryTarget = true; |
1808 | 0 | break; |
1809 | 0 | } |
1810 | | |
1811 | | /* |
1812 | | * If we've been asked to lag the primary, wait on latch until |
1813 | | * enough time has passed. |
1814 | | */ |
1815 | 0 | if (recoveryApplyDelay(xlogreader)) |
1816 | 0 | { |
1817 | | /* |
1818 | | * We test for paused recovery again here. If user sets |
1819 | | * delayed apply, it may be because they expect to pause |
1820 | | * recovery in case of problems, so we must test again here |
1821 | | * otherwise pausing during the delay-wait wouldn't work. |
1822 | | */ |
1823 | 0 | if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState != |
1824 | 0 | RECOVERY_NOT_PAUSED) |
1825 | 0 | recoveryPausesHere(false); |
1826 | 0 | } |
1827 | | |
1828 | | /* |
1829 | | * Apply the record |
1830 | | */ |
1831 | 0 | ApplyWalRecord(xlogreader, record, &replayTLI); |
1832 | | |
1833 | | /* Exit loop if we reached inclusive recovery target */ |
1834 | 0 | if (recoveryStopsAfter(xlogreader)) |
1835 | 0 | { |
1836 | 0 | reachedRecoveryTarget = true; |
1837 | 0 | break; |
1838 | 0 | } |
1839 | | |
1840 | | /* Else, try to fetch the next WAL record */ |
1841 | 0 | record = ReadRecord(xlogprefetcher, LOG, false, replayTLI); |
1842 | 0 | } while (record != NULL); |
1843 | | |
1844 | | /* |
1845 | | * end of main redo apply loop |
1846 | | */ |
1847 | | |
1848 | 0 | if (reachedRecoveryTarget) |
1849 | 0 | { |
1850 | 0 | if (!reachedConsistency) |
1851 | 0 | ereport(FATAL, |
1852 | 0 | (errmsg("requested recovery stop point is before consistent recovery point"))); |
1853 | | |
1854 | | /* |
1855 | | * This is the last point where we can restart recovery with a new |
1856 | | * recovery target, if we shutdown and begin again. After this, |
1857 | | * Resource Managers may choose to do permanent corrective actions |
1858 | | * at end of recovery. |
1859 | | */ |
1860 | 0 | switch (recoveryTargetAction) |
1861 | 0 | { |
1862 | 0 | case RECOVERY_TARGET_ACTION_SHUTDOWN: |
1863 | | |
1864 | | /* |
1865 | | * exit with special return code to request shutdown of |
1866 | | * postmaster. Log messages issued from postmaster. |
1867 | | */ |
1868 | 0 | proc_exit(3); |
1869 | |
|
1870 | 0 | case RECOVERY_TARGET_ACTION_PAUSE: |
1871 | 0 | SetRecoveryPause(true); |
1872 | 0 | recoveryPausesHere(true); |
1873 | | |
1874 | | /* drop into promote */ |
1875 | |
|
1876 | 0 | case RECOVERY_TARGET_ACTION_PROMOTE: |
1877 | 0 | break; |
1878 | 0 | } |
1879 | 0 | } |
1880 | | |
1881 | 0 | RmgrCleanup(); |
1882 | |
|
1883 | 0 | ereport(LOG, |
1884 | 0 | errmsg("redo done at %X/%08X system usage: %s", |
1885 | 0 | LSN_FORMAT_ARGS(xlogreader->ReadRecPtr), |
1886 | 0 | pg_rusage_show(&ru0))); |
1887 | 0 | xtime = GetLatestXTime(); |
1888 | 0 | if (xtime) |
1889 | 0 | ereport(LOG, |
1890 | 0 | (errmsg("last completed transaction was at log time %s", |
1891 | 0 | timestamptz_to_str(xtime)))); |
1892 | | |
1893 | 0 | InRedo = false; |
1894 | 0 | } |
1895 | 0 | else |
1896 | 0 | { |
1897 | | /* there are no WAL records following the checkpoint */ |
1898 | 0 | ereport(LOG, |
1899 | 0 | (errmsg("redo is not required"))); |
1900 | 0 | } |
1901 | | |
1902 | | /* |
1903 | | * This check is intentionally after the above log messages that indicate |
1904 | | * how far recovery went. |
1905 | | */ |
1906 | 0 | if (ArchiveRecoveryRequested && |
1907 | 0 | recoveryTarget != RECOVERY_TARGET_UNSET && |
1908 | 0 | !reachedRecoveryTarget) |
1909 | 0 | ereport(FATAL, |
1910 | 0 | (errcode(ERRCODE_CONFIG_FILE_ERROR), |
1911 | 0 | errmsg("recovery ended before configured recovery target was reached"))); |
1912 | 0 | } |
1913 | | |
1914 | | /* |
1915 | | * Subroutine of PerformWalRecovery, to apply one WAL record. |
1916 | | */ |
1917 | | static void |
1918 | | ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *replayTLI) |
1919 | 0 | { |
1920 | 0 | ErrorContextCallback errcallback; |
1921 | 0 | bool switchedTLI = false; |
1922 | | |
1923 | | /* Setup error traceback support for ereport() */ |
1924 | 0 | errcallback.callback = rm_redo_error_callback; |
1925 | 0 | errcallback.arg = xlogreader; |
1926 | 0 | errcallback.previous = error_context_stack; |
1927 | 0 | error_context_stack = &errcallback; |
1928 | | |
1929 | | /* |
1930 | | * TransamVariables->nextXid must be beyond record's xid. |
1931 | | */ |
1932 | 0 | AdvanceNextFullTransactionIdPastXid(record->xl_xid); |
1933 | | |
1934 | | /* |
1935 | | * Before replaying this record, check if this record causes the current |
1936 | | * timeline to change. The record is already considered to be part of the |
1937 | | * new timeline, so we update replayTLI before replaying it. That's |
1938 | | * important so that replayEndTLI, which is recorded as the minimum |
1939 | | * recovery point's TLI if recovery stops after this record, is set |
1940 | | * correctly. |
1941 | | */ |
1942 | 0 | if (record->xl_rmid == RM_XLOG_ID) |
1943 | 0 | { |
1944 | 0 | TimeLineID newReplayTLI = *replayTLI; |
1945 | 0 | TimeLineID prevReplayTLI = *replayTLI; |
1946 | 0 | uint8 info = record->xl_info & ~XLR_INFO_MASK; |
1947 | |
|
1948 | 0 | if (info == XLOG_CHECKPOINT_SHUTDOWN) |
1949 | 0 | { |
1950 | 0 | CheckPoint checkPoint; |
1951 | |
|
1952 | 0 | memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint)); |
1953 | 0 | newReplayTLI = checkPoint.ThisTimeLineID; |
1954 | 0 | prevReplayTLI = checkPoint.PrevTimeLineID; |
1955 | 0 | } |
1956 | 0 | else if (info == XLOG_END_OF_RECOVERY) |
1957 | 0 | { |
1958 | 0 | xl_end_of_recovery xlrec; |
1959 | |
|
1960 | 0 | memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery)); |
1961 | 0 | newReplayTLI = xlrec.ThisTimeLineID; |
1962 | 0 | prevReplayTLI = xlrec.PrevTimeLineID; |
1963 | 0 | } |
1964 | |
|
1965 | 0 | if (newReplayTLI != *replayTLI) |
1966 | 0 | { |
1967 | | /* Check that it's OK to switch to this TLI */ |
1968 | 0 | checkTimeLineSwitch(xlogreader->EndRecPtr, |
1969 | 0 | newReplayTLI, prevReplayTLI, *replayTLI); |
1970 | | |
1971 | | /* Following WAL records should be run with new TLI */ |
1972 | 0 | *replayTLI = newReplayTLI; |
1973 | 0 | switchedTLI = true; |
1974 | 0 | } |
1975 | 0 | } |
1976 | | |
1977 | | /* |
1978 | | * Update shared replayEndRecPtr before replaying this record, so that |
1979 | | * XLogFlush will update minRecoveryPoint correctly. |
1980 | | */ |
1981 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
1982 | 0 | XLogRecoveryCtl->replayEndRecPtr = xlogreader->EndRecPtr; |
1983 | 0 | XLogRecoveryCtl->replayEndTLI = *replayTLI; |
1984 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
1985 | | |
1986 | | /* |
1987 | | * If we are attempting to enter Hot Standby mode, process XIDs we see |
1988 | | */ |
1989 | 0 | if (standbyState >= STANDBY_INITIALIZED && |
1990 | 0 | TransactionIdIsValid(record->xl_xid)) |
1991 | 0 | RecordKnownAssignedTransactionIds(record->xl_xid); |
1992 | | |
1993 | | /* |
1994 | | * Some XLOG record types that are related to recovery are processed |
1995 | | * directly here, rather than in xlog_redo() |
1996 | | */ |
1997 | 0 | if (record->xl_rmid == RM_XLOG_ID) |
1998 | 0 | xlogrecovery_redo(xlogreader, *replayTLI); |
1999 | | |
2000 | | /* Now apply the WAL record itself */ |
2001 | 0 | GetRmgr(record->xl_rmid).rm_redo(xlogreader); |
2002 | | |
2003 | | /* |
2004 | | * After redo, check whether the backup pages associated with the WAL |
2005 | | * record are consistent with the existing pages. This check is done only |
2006 | | * if consistency check is enabled for this record. |
2007 | | */ |
2008 | 0 | if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0) |
2009 | 0 | verifyBackupPageConsistency(xlogreader); |
2010 | | |
2011 | | /* Pop the error context stack */ |
2012 | 0 | error_context_stack = errcallback.previous; |
2013 | | |
2014 | | /* |
2015 | | * Update lastReplayedEndRecPtr after this record has been successfully |
2016 | | * replayed. |
2017 | | */ |
2018 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
2019 | 0 | XLogRecoveryCtl->lastReplayedReadRecPtr = xlogreader->ReadRecPtr; |
2020 | 0 | XLogRecoveryCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr; |
2021 | 0 | XLogRecoveryCtl->lastReplayedTLI = *replayTLI; |
2022 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
2023 | | |
2024 | | /* ------ |
2025 | | * Wakeup walsenders: |
2026 | | * |
2027 | | * On the standby, the WAL is flushed first (which will only wake up |
2028 | | * physical walsenders) and then applied, which will only wake up logical |
2029 | | * walsenders. |
2030 | | * |
2031 | | * Indeed, logical walsenders on standby can't decode and send data until |
2032 | | * it's been applied. |
2033 | | * |
2034 | | * Physical walsenders don't need to be woken up during replay unless |
2035 | | * cascading replication is allowed and time line change occurred (so that |
2036 | | * they can notice that they are on a new time line). |
2037 | | * |
2038 | | * That's why the wake up conditions are for: |
2039 | | * |
2040 | | * - physical walsenders in case of new time line and cascade |
2041 | | * replication is allowed |
2042 | | * - logical walsenders in case cascade replication is allowed (could not |
2043 | | * be created otherwise) |
2044 | | * ------ |
2045 | | */ |
2046 | 0 | if (AllowCascadeReplication()) |
2047 | 0 | WalSndWakeup(switchedTLI, true); |
2048 | | |
2049 | | /* |
2050 | | * If rm_redo called XLogRequestWalReceiverReply, then we wake up the |
2051 | | * receiver so that it notices the updated lastReplayedEndRecPtr and sends |
2052 | | * a reply to the primary. |
2053 | | */ |
2054 | 0 | if (doRequestWalReceiverReply) |
2055 | 0 | { |
2056 | 0 | doRequestWalReceiverReply = false; |
2057 | 0 | WalRcvForceReply(); |
2058 | 0 | } |
2059 | | |
2060 | | /* Allow read-only connections if we're consistent now */ |
2061 | 0 | CheckRecoveryConsistency(); |
2062 | | |
2063 | | /* Is this a timeline switch? */ |
2064 | 0 | if (switchedTLI) |
2065 | 0 | { |
2066 | | /* |
2067 | | * Before we continue on the new timeline, clean up any (possibly |
2068 | | * bogus) future WAL segments on the old timeline. |
2069 | | */ |
2070 | 0 | RemoveNonParentXlogFiles(xlogreader->EndRecPtr, *replayTLI); |
2071 | | |
2072 | | /* Reset the prefetcher. */ |
2073 | 0 | XLogPrefetchReconfigure(); |
2074 | 0 | } |
2075 | 0 | } |
2076 | | |
2077 | | /* |
2078 | | * Some XLOG RM record types that are directly related to WAL recovery are |
2079 | | * handled here rather than in the xlog_redo() |
2080 | | */ |
2081 | | static void |
2082 | | xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI) |
2083 | 0 | { |
2084 | 0 | uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; |
2085 | 0 | XLogRecPtr lsn = record->EndRecPtr; |
2086 | |
|
2087 | 0 | Assert(XLogRecGetRmid(record) == RM_XLOG_ID); |
2088 | |
|
2089 | 0 | if (info == XLOG_OVERWRITE_CONTRECORD) |
2090 | 0 | { |
2091 | | /* Verify the payload of a XLOG_OVERWRITE_CONTRECORD record. */ |
2092 | 0 | xl_overwrite_contrecord xlrec; |
2093 | |
|
2094 | 0 | memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord)); |
2095 | 0 | if (xlrec.overwritten_lsn != record->overwrittenRecPtr) |
2096 | 0 | elog(FATAL, "mismatching overwritten LSN %X/%08X -> %X/%08X", |
2097 | 0 | LSN_FORMAT_ARGS(xlrec.overwritten_lsn), |
2098 | 0 | LSN_FORMAT_ARGS(record->overwrittenRecPtr)); |
2099 | | |
2100 | | /* We have safely skipped the aborted record */ |
2101 | 0 | abortedRecPtr = InvalidXLogRecPtr; |
2102 | 0 | missingContrecPtr = InvalidXLogRecPtr; |
2103 | |
|
2104 | 0 | ereport(LOG, |
2105 | 0 | errmsg("successfully skipped missing contrecord at %X/%08X, overwritten at %s", |
2106 | 0 | LSN_FORMAT_ARGS(xlrec.overwritten_lsn), |
2107 | 0 | timestamptz_to_str(xlrec.overwrite_time))); |
2108 | | |
2109 | | /* Verifying the record should only happen once */ |
2110 | 0 | record->overwrittenRecPtr = InvalidXLogRecPtr; |
2111 | 0 | } |
2112 | 0 | else if (info == XLOG_BACKUP_END) |
2113 | 0 | { |
2114 | 0 | XLogRecPtr startpoint; |
2115 | |
|
2116 | 0 | memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint)); |
2117 | |
|
2118 | 0 | if (backupStartPoint == startpoint) |
2119 | 0 | { |
2120 | | /* |
2121 | | * We have reached the end of base backup, the point where |
2122 | | * pg_backup_stop() was done. The data on disk is now consistent |
2123 | | * (assuming we have also reached minRecoveryPoint). Set |
2124 | | * backupEndPoint to the current LSN, so that the next call to |
2125 | | * CheckRecoveryConsistency() will notice it and do the |
2126 | | * end-of-backup processing. |
2127 | | */ |
2128 | 0 | elog(DEBUG1, "end of backup record reached"); |
2129 | | |
2130 | 0 | backupEndPoint = lsn; |
2131 | 0 | } |
2132 | 0 | else |
2133 | 0 | elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%08X, waiting for %X/%08X", |
2134 | 0 | LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint)); |
2135 | 0 | } |
2136 | 0 | } |
2137 | | |
2138 | | /* |
2139 | | * Verify that, in non-test mode, ./pg_tblspc doesn't contain any real |
2140 | | * directories. |
2141 | | * |
2142 | | * Replay of database creation XLOG records for databases that were later |
2143 | | * dropped can create fake directories in pg_tblspc. By the time consistency |
2144 | | * is reached these directories should have been removed; here we verify |
2145 | | * that this did indeed happen. This is to be called at the point where |
2146 | | * consistent state is reached. |
2147 | | * |
2148 | | * allow_in_place_tablespaces turns the PANIC into a WARNING, which is |
2149 | | * useful for testing purposes, and also allows for an escape hatch in case |
2150 | | * things go south. |
2151 | | */ |
2152 | | static void |
2153 | | CheckTablespaceDirectory(void) |
2154 | 0 | { |
2155 | 0 | DIR *dir; |
2156 | 0 | struct dirent *de; |
2157 | |
|
2158 | 0 | dir = AllocateDir(PG_TBLSPC_DIR); |
2159 | 0 | while ((de = ReadDir(dir, PG_TBLSPC_DIR)) != NULL) |
2160 | 0 | { |
2161 | 0 | char path[MAXPGPATH + sizeof(PG_TBLSPC_DIR)]; |
2162 | | |
2163 | | /* Skip entries of non-oid names */ |
2164 | 0 | if (strspn(de->d_name, "0123456789") != strlen(de->d_name)) |
2165 | 0 | continue; |
2166 | | |
2167 | 0 | snprintf(path, sizeof(path), "%s/%s", PG_TBLSPC_DIR, de->d_name); |
2168 | |
|
2169 | 0 | if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK) |
2170 | 0 | ereport(allow_in_place_tablespaces ? WARNING : PANIC, |
2171 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
2172 | 0 | errmsg("unexpected directory entry \"%s\" found in %s", |
2173 | 0 | de->d_name, PG_TBLSPC_DIR), |
2174 | 0 | errdetail("All directory entries in %s/ should be symbolic links.", |
2175 | 0 | PG_TBLSPC_DIR), |
2176 | 0 | errhint("Remove those directories, or set \"allow_in_place_tablespaces\" to ON transiently to let recovery complete."))); |
2177 | 0 | } |
2178 | 0 | } |
2179 | | |
2180 | | /* |
2181 | | * Checks if recovery has reached a consistent state. When consistency is |
2182 | | * reached and we have a valid starting standby snapshot, tell postmaster |
2183 | | * that it can start accepting read-only connections. |
2184 | | */ |
2185 | | static void |
2186 | | CheckRecoveryConsistency(void) |
2187 | 0 | { |
2188 | 0 | XLogRecPtr lastReplayedEndRecPtr; |
2189 | 0 | TimeLineID lastReplayedTLI; |
2190 | | |
2191 | | /* |
2192 | | * During crash recovery, we don't reach a consistent state until we've |
2193 | | * replayed all the WAL. |
2194 | | */ |
2195 | 0 | if (XLogRecPtrIsInvalid(minRecoveryPoint)) |
2196 | 0 | return; |
2197 | | |
2198 | 0 | Assert(InArchiveRecovery); |
2199 | | |
2200 | | /* |
2201 | | * assume that we are called in the startup process, and hence don't need |
2202 | | * a lock to read lastReplayedEndRecPtr |
2203 | | */ |
2204 | 0 | lastReplayedEndRecPtr = XLogRecoveryCtl->lastReplayedEndRecPtr; |
2205 | 0 | lastReplayedTLI = XLogRecoveryCtl->lastReplayedTLI; |
2206 | | |
2207 | | /* |
2208 | | * Have we reached the point where our base backup was completed? |
2209 | | */ |
2210 | 0 | if (!XLogRecPtrIsInvalid(backupEndPoint) && |
2211 | 0 | backupEndPoint <= lastReplayedEndRecPtr) |
2212 | 0 | { |
2213 | 0 | XLogRecPtr saveBackupStartPoint = backupStartPoint; |
2214 | 0 | XLogRecPtr saveBackupEndPoint = backupEndPoint; |
2215 | |
|
2216 | 0 | elog(DEBUG1, "end of backup reached"); |
2217 | | |
2218 | | /* |
2219 | | * We have reached the end of base backup, as indicated by pg_control. |
2220 | | * Update the control file accordingly. |
2221 | | */ |
2222 | 0 | ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI); |
2223 | 0 | backupStartPoint = InvalidXLogRecPtr; |
2224 | 0 | backupEndPoint = InvalidXLogRecPtr; |
2225 | 0 | backupEndRequired = false; |
2226 | |
|
2227 | 0 | ereport(LOG, |
2228 | 0 | errmsg("completed backup recovery with redo LSN %X/%08X and end LSN %X/%08X", |
2229 | 0 | LSN_FORMAT_ARGS(saveBackupStartPoint), |
2230 | 0 | LSN_FORMAT_ARGS(saveBackupEndPoint))); |
2231 | 0 | } |
2232 | | |
2233 | | /* |
2234 | | * Have we passed our safe starting point? Note that minRecoveryPoint is |
2235 | | * known to be incorrectly set if recovering from a backup, until the |
2236 | | * XLOG_BACKUP_END arrives to advise us of the correct minRecoveryPoint. |
2237 | | * All we know prior to that is that we're not consistent yet. |
2238 | | */ |
2239 | 0 | if (!reachedConsistency && !backupEndRequired && |
2240 | 0 | minRecoveryPoint <= lastReplayedEndRecPtr) |
2241 | 0 | { |
2242 | | /* |
2243 | | * Check to see if the XLOG sequence contained any unresolved |
2244 | | * references to uninitialized pages. |
2245 | | */ |
2246 | 0 | XLogCheckInvalidPages(); |
2247 | | |
2248 | | /* |
2249 | | * Check that pg_tblspc doesn't contain any real directories. Replay |
2250 | | * of Database/CREATE_* records may have created fictitious tablespace |
2251 | | * directories that should have been removed by the time consistency |
2252 | | * was reached. |
2253 | | */ |
2254 | 0 | CheckTablespaceDirectory(); |
2255 | |
|
2256 | 0 | reachedConsistency = true; |
2257 | 0 | SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT); |
2258 | 0 | ereport(LOG, |
2259 | 0 | errmsg("consistent recovery state reached at %X/%08X", |
2260 | 0 | LSN_FORMAT_ARGS(lastReplayedEndRecPtr))); |
2261 | 0 | } |
2262 | | |
2263 | | /* |
2264 | | * Have we got a valid starting snapshot that will allow queries to be |
2265 | | * run? If so, we can tell postmaster that the database is consistent now, |
2266 | | * enabling connections. |
2267 | | */ |
2268 | 0 | if (standbyState == STANDBY_SNAPSHOT_READY && |
2269 | 0 | !LocalHotStandbyActive && |
2270 | 0 | reachedConsistency && |
2271 | 0 | IsUnderPostmaster) |
2272 | 0 | { |
2273 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
2274 | 0 | XLogRecoveryCtl->SharedHotStandbyActive = true; |
2275 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
2276 | |
|
2277 | 0 | LocalHotStandbyActive = true; |
2278 | |
|
2279 | 0 | SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY); |
2280 | 0 | } |
2281 | 0 | } |
2282 | | |
2283 | | /* |
2284 | | * Error context callback for errors occurring during rm_redo(). |
2285 | | */ |
2286 | | static void |
2287 | | rm_redo_error_callback(void *arg) |
2288 | 0 | { |
2289 | 0 | XLogReaderState *record = (XLogReaderState *) arg; |
2290 | 0 | StringInfoData buf; |
2291 | |
|
2292 | 0 | initStringInfo(&buf); |
2293 | 0 | xlog_outdesc(&buf, record); |
2294 | 0 | xlog_block_info(&buf, record); |
2295 | | |
2296 | | /* translator: %s is a WAL record description */ |
2297 | 0 | errcontext("WAL redo at %X/%08X for %s", |
2298 | 0 | LSN_FORMAT_ARGS(record->ReadRecPtr), |
2299 | 0 | buf.data); |
2300 | |
|
2301 | 0 | pfree(buf.data); |
2302 | 0 | } |
2303 | | |
2304 | | /* |
2305 | | * Returns a string describing an XLogRecord, consisting of its identity |
2306 | | * optionally followed by a colon, a space, and a further description. |
2307 | | */ |
2308 | | void |
2309 | | xlog_outdesc(StringInfo buf, XLogReaderState *record) |
2310 | 0 | { |
2311 | 0 | RmgrData rmgr = GetRmgr(XLogRecGetRmid(record)); |
2312 | 0 | uint8 info = XLogRecGetInfo(record); |
2313 | 0 | const char *id; |
2314 | |
|
2315 | 0 | appendStringInfoString(buf, rmgr.rm_name); |
2316 | 0 | appendStringInfoChar(buf, '/'); |
2317 | |
|
2318 | 0 | id = rmgr.rm_identify(info); |
2319 | 0 | if (id == NULL) |
2320 | 0 | appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK); |
2321 | 0 | else |
2322 | 0 | appendStringInfo(buf, "%s: ", id); |
2323 | |
|
2324 | 0 | rmgr.rm_desc(buf, record); |
2325 | 0 | } |
2326 | | |
#ifdef WAL_DEBUG

/*
 * Append generic header information about an XLogRecord to 'buf': the
 * previous-record pointer, the xid, the data length, and finally the block
 * references.  Only compiled when WAL_DEBUG is defined.
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	XLogRecPtr	prev = XLogRecGetPrev(record);

	appendStringInfo(buf, "prev %X/%08X; xid %u",
					 LSN_FORMAT_ARGS(prev),
					 XLogRecGetXid(record));

	appendStringInfo(buf, "; len %u",
					 XLogRecGetDataLen(record));

	/* Block references go last. */
	xlog_block_info(buf, record);
}
#endif							/* WAL_DEBUG */
2342 | | |
2343 | | /* |
2344 | | * Returns a string giving information about all the blocks in an |
2345 | | * XLogRecord. |
2346 | | */ |
2347 | | static void |
2348 | | xlog_block_info(StringInfo buf, XLogReaderState *record) |
2349 | 0 | { |
2350 | 0 | int block_id; |
2351 | | |
2352 | | /* decode block references */ |
2353 | 0 | for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) |
2354 | 0 | { |
2355 | 0 | RelFileLocator rlocator; |
2356 | 0 | ForkNumber forknum; |
2357 | 0 | BlockNumber blk; |
2358 | |
|
2359 | 0 | if (!XLogRecGetBlockTagExtended(record, block_id, |
2360 | 0 | &rlocator, &forknum, &blk, NULL)) |
2361 | 0 | continue; |
2362 | | |
2363 | 0 | if (forknum != MAIN_FORKNUM) |
2364 | 0 | appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, fork %u, blk %u", |
2365 | 0 | block_id, |
2366 | 0 | rlocator.spcOid, rlocator.dbOid, |
2367 | 0 | rlocator.relNumber, |
2368 | 0 | forknum, |
2369 | 0 | blk); |
2370 | 0 | else |
2371 | 0 | appendStringInfo(buf, "; blkref #%d: rel %u/%u/%u, blk %u", |
2372 | 0 | block_id, |
2373 | 0 | rlocator.spcOid, rlocator.dbOid, |
2374 | 0 | rlocator.relNumber, |
2375 | 0 | blk); |
2376 | 0 | if (XLogRecHasBlockImage(record, block_id)) |
2377 | 0 | appendStringInfoString(buf, " FPW"); |
2378 | 0 | } |
2379 | 0 | } |
2380 | | |
2381 | | |
2382 | | /* |
2383 | | * Check that it's OK to switch to new timeline during recovery. |
2384 | | * |
2385 | | * 'lsn' is the address of the shutdown checkpoint record we're about to |
2386 | | * replay. (Currently, timeline can only change at a shutdown checkpoint). |
2387 | | */ |
2388 | | static void |
2389 | | checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI, |
2390 | | TimeLineID replayTLI) |
2391 | 0 | { |
2392 | | /* Check that the record agrees on what the current (old) timeline is */ |
2393 | 0 | if (prevTLI != replayTLI) |
2394 | 0 | ereport(PANIC, |
2395 | 0 | (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record", |
2396 | 0 | prevTLI, replayTLI))); |
2397 | | |
2398 | | /* |
2399 | | * The new timeline better be in the list of timelines we expect to see, |
2400 | | * according to the timeline history. It should also not decrease. |
2401 | | */ |
2402 | 0 | if (newTLI < replayTLI || !tliInHistory(newTLI, expectedTLEs)) |
2403 | 0 | ereport(PANIC, |
2404 | 0 | (errmsg("unexpected timeline ID %u (after %u) in checkpoint record", |
2405 | 0 | newTLI, replayTLI))); |
2406 | | |
2407 | | /* |
2408 | | * If we have not yet reached min recovery point, and we're about to |
2409 | | * switch to a timeline greater than the timeline of the min recovery |
2410 | | * point: trouble. After switching to the new timeline, we could not |
2411 | | * possibly visit the min recovery point on the correct timeline anymore. |
2412 | | * This can happen if there is a newer timeline in the archive that |
2413 | | * branched before the timeline the min recovery point is on, and you |
2414 | | * attempt to do PITR to the new timeline. |
2415 | | */ |
2416 | 0 | if (!XLogRecPtrIsInvalid(minRecoveryPoint) && |
2417 | 0 | lsn < minRecoveryPoint && |
2418 | 0 | newTLI > minRecoveryPointTLI) |
2419 | 0 | ereport(PANIC, |
2420 | 0 | errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%08X on timeline %u", |
2421 | 0 | newTLI, |
2422 | 0 | LSN_FORMAT_ARGS(minRecoveryPoint), |
2423 | 0 | minRecoveryPointTLI)); |
2424 | | |
2425 | | /* Looks good */ |
2426 | 0 | } |
2427 | | |
2428 | | |
2429 | | /* |
2430 | | * Extract timestamp from WAL record. |
2431 | | * |
2432 | | * If the record contains a timestamp, returns true, and saves the timestamp |
2433 | | * in *recordXtime. If the record type has no timestamp, returns false. |
2434 | | * Currently, only transaction commit/abort records and restore points contain |
2435 | | * timestamps. |
2436 | | */ |
2437 | | static bool |
2438 | | getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime) |
2439 | 0 | { |
2440 | 0 | uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; |
2441 | 0 | uint8 xact_info = info & XLOG_XACT_OPMASK; |
2442 | 0 | uint8 rmid = XLogRecGetRmid(record); |
2443 | |
|
2444 | 0 | if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) |
2445 | 0 | { |
2446 | 0 | *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time; |
2447 | 0 | return true; |
2448 | 0 | } |
2449 | 0 | if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT || |
2450 | 0 | xact_info == XLOG_XACT_COMMIT_PREPARED)) |
2451 | 0 | { |
2452 | 0 | *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time; |
2453 | 0 | return true; |
2454 | 0 | } |
2455 | 0 | if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT || |
2456 | 0 | xact_info == XLOG_XACT_ABORT_PREPARED)) |
2457 | 0 | { |
2458 | 0 | *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time; |
2459 | 0 | return true; |
2460 | 0 | } |
2461 | 0 | return false; |
2462 | 0 | } |
2463 | | |
2464 | | /* |
2465 | | * Checks whether the current buffer page and backup page stored in the |
2466 | | * WAL record are consistent or not. Before comparing the two pages, a |
2467 | | * masking can be applied to the pages to ignore certain areas like hint bits, |
2468 | | * unused space between pd_lower and pd_upper among other things. This |
2469 | | * function should be called once WAL replay has been completed for a |
2470 | | * given record. |
2471 | | */ |
2472 | | static void |
2473 | | verifyBackupPageConsistency(XLogReaderState *record) |
2474 | 0 | { |
2475 | 0 | RmgrData rmgr = GetRmgr(XLogRecGetRmid(record)); |
2476 | 0 | RelFileLocator rlocator; |
2477 | 0 | ForkNumber forknum; |
2478 | 0 | BlockNumber blkno; |
2479 | 0 | int block_id; |
2480 | | |
2481 | | /* Records with no backup blocks have no need for consistency checks. */ |
2482 | 0 | if (!XLogRecHasAnyBlockRefs(record)) |
2483 | 0 | return; |
2484 | | |
2485 | 0 | Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0); |
2486 | |
|
2487 | 0 | for (block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++) |
2488 | 0 | { |
2489 | 0 | Buffer buf; |
2490 | 0 | Page page; |
2491 | |
|
2492 | 0 | if (!XLogRecGetBlockTagExtended(record, block_id, |
2493 | 0 | &rlocator, &forknum, &blkno, NULL)) |
2494 | 0 | { |
2495 | | /* |
2496 | | * WAL record doesn't contain a block reference with the given id. |
2497 | | * Do nothing. |
2498 | | */ |
2499 | 0 | continue; |
2500 | 0 | } |
2501 | | |
2502 | 0 | Assert(XLogRecHasBlockImage(record, block_id)); |
2503 | |
|
2504 | 0 | if (XLogRecBlockImageApply(record, block_id)) |
2505 | 0 | { |
2506 | | /* |
2507 | | * WAL record has already applied the page, so bypass the |
2508 | | * consistency check as that would result in comparing the full |
2509 | | * page stored in the record with itself. |
2510 | | */ |
2511 | 0 | continue; |
2512 | 0 | } |
2513 | | |
2514 | | /* |
2515 | | * Read the contents from the current buffer and store it in a |
2516 | | * temporary page. |
2517 | | */ |
2518 | 0 | buf = XLogReadBufferExtended(rlocator, forknum, blkno, |
2519 | 0 | RBM_NORMAL_NO_LOG, |
2520 | 0 | InvalidBuffer); |
2521 | 0 | if (!BufferIsValid(buf)) |
2522 | 0 | continue; |
2523 | | |
2524 | 0 | LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); |
2525 | 0 | page = BufferGetPage(buf); |
2526 | | |
2527 | | /* |
2528 | | * Take a copy of the local page where WAL has been applied to have a |
2529 | | * comparison base before masking it... |
2530 | | */ |
2531 | 0 | memcpy(replay_image_masked, page, BLCKSZ); |
2532 | | |
2533 | | /* No need for this page anymore now that a copy is in. */ |
2534 | 0 | UnlockReleaseBuffer(buf); |
2535 | | |
2536 | | /* |
2537 | | * If the block LSN is already ahead of this WAL record, we can't |
2538 | | * expect contents to match. This can happen if recovery is |
2539 | | * restarted. |
2540 | | */ |
2541 | 0 | if (PageGetLSN(replay_image_masked) > record->EndRecPtr) |
2542 | 0 | continue; |
2543 | | |
2544 | | /* |
2545 | | * Read the contents from the backup copy, stored in WAL record and |
2546 | | * store it in a temporary page. There is no need to allocate a new |
2547 | | * page here, a local buffer is fine to hold its contents and a mask |
2548 | | * can be directly applied on it. |
2549 | | */ |
2550 | 0 | if (!RestoreBlockImage(record, block_id, primary_image_masked)) |
2551 | 0 | ereport(ERROR, |
2552 | 0 | (errcode(ERRCODE_INTERNAL_ERROR), |
2553 | 0 | errmsg_internal("%s", record->errormsg_buf))); |
2554 | | |
2555 | | /* |
2556 | | * If masking function is defined, mask both the primary and replay |
2557 | | * images |
2558 | | */ |
2559 | 0 | if (rmgr.rm_mask != NULL) |
2560 | 0 | { |
2561 | 0 | rmgr.rm_mask(replay_image_masked, blkno); |
2562 | 0 | rmgr.rm_mask(primary_image_masked, blkno); |
2563 | 0 | } |
2564 | | |
2565 | | /* Time to compare the primary and replay images. */ |
2566 | 0 | if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0) |
2567 | 0 | { |
2568 | 0 | elog(FATAL, |
2569 | 0 | "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u", |
2570 | 0 | rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, |
2571 | 0 | forknum, blkno); |
2572 | 0 | } |
2573 | 0 | } |
2574 | 0 | } |
2575 | | |
2576 | | /* |
2577 | | * For point-in-time recovery, this function decides whether we want to |
2578 | | * stop applying the XLOG before the current record. |
2579 | | * |
2580 | | * Returns true if we are stopping, false otherwise. If stopping, some |
2581 | | * information is saved in recoveryStopXid et al for use in annotating the |
2582 | | * new timeline's history file. |
2583 | | */ |
2584 | | static bool |
2585 | | recoveryStopsBefore(XLogReaderState *record) |
2586 | 0 | { |
2587 | 0 | bool stopsHere = false; |
2588 | 0 | uint8 xact_info; |
2589 | 0 | bool isCommit; |
2590 | 0 | TimestampTz recordXtime = 0; |
2591 | 0 | TransactionId recordXid; |
2592 | | |
2593 | | /* |
2594 | | * Ignore recovery target settings when not in archive recovery (meaning |
2595 | | * we are in crash recovery). |
2596 | | */ |
2597 | 0 | if (!ArchiveRecoveryRequested) |
2598 | 0 | return false; |
2599 | | |
2600 | | /* Check if we should stop as soon as reaching consistency */ |
2601 | 0 | if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency) |
2602 | 0 | { |
2603 | 0 | ereport(LOG, |
2604 | 0 | (errmsg("recovery stopping after reaching consistency"))); |
2605 | | |
2606 | 0 | recoveryStopAfter = false; |
2607 | 0 | recoveryStopXid = InvalidTransactionId; |
2608 | 0 | recoveryStopLSN = InvalidXLogRecPtr; |
2609 | 0 | recoveryStopTime = 0; |
2610 | 0 | recoveryStopName[0] = '\0'; |
2611 | 0 | return true; |
2612 | 0 | } |
2613 | | |
2614 | | /* Check if target LSN has been reached */ |
2615 | 0 | if (recoveryTarget == RECOVERY_TARGET_LSN && |
2616 | 0 | !recoveryTargetInclusive && |
2617 | 0 | record->ReadRecPtr >= recoveryTargetLSN) |
2618 | 0 | { |
2619 | 0 | recoveryStopAfter = false; |
2620 | 0 | recoveryStopXid = InvalidTransactionId; |
2621 | 0 | recoveryStopLSN = record->ReadRecPtr; |
2622 | 0 | recoveryStopTime = 0; |
2623 | 0 | recoveryStopName[0] = '\0'; |
2624 | 0 | ereport(LOG, |
2625 | 0 | errmsg("recovery stopping before WAL location (LSN) \"%X/%08X\"", |
2626 | 0 | LSN_FORMAT_ARGS(recoveryStopLSN))); |
2627 | 0 | return true; |
2628 | 0 | } |
2629 | | |
2630 | | /* Otherwise we only consider stopping before COMMIT or ABORT records. */ |
2631 | 0 | if (XLogRecGetRmid(record) != RM_XACT_ID) |
2632 | 0 | return false; |
2633 | | |
2634 | 0 | xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; |
2635 | |
|
2636 | 0 | if (xact_info == XLOG_XACT_COMMIT) |
2637 | 0 | { |
2638 | 0 | isCommit = true; |
2639 | 0 | recordXid = XLogRecGetXid(record); |
2640 | 0 | } |
2641 | 0 | else if (xact_info == XLOG_XACT_COMMIT_PREPARED) |
2642 | 0 | { |
2643 | 0 | xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); |
2644 | 0 | xl_xact_parsed_commit parsed; |
2645 | |
|
2646 | 0 | isCommit = true; |
2647 | 0 | ParseCommitRecord(XLogRecGetInfo(record), |
2648 | 0 | xlrec, |
2649 | 0 | &parsed); |
2650 | 0 | recordXid = parsed.twophase_xid; |
2651 | 0 | } |
2652 | 0 | else if (xact_info == XLOG_XACT_ABORT) |
2653 | 0 | { |
2654 | 0 | isCommit = false; |
2655 | 0 | recordXid = XLogRecGetXid(record); |
2656 | 0 | } |
2657 | 0 | else if (xact_info == XLOG_XACT_ABORT_PREPARED) |
2658 | 0 | { |
2659 | 0 | xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); |
2660 | 0 | xl_xact_parsed_abort parsed; |
2661 | |
|
2662 | 0 | isCommit = false; |
2663 | 0 | ParseAbortRecord(XLogRecGetInfo(record), |
2664 | 0 | xlrec, |
2665 | 0 | &parsed); |
2666 | 0 | recordXid = parsed.twophase_xid; |
2667 | 0 | } |
2668 | 0 | else |
2669 | 0 | return false; |
2670 | | |
2671 | 0 | if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive) |
2672 | 0 | { |
2673 | | /* |
2674 | | * There can be only one transaction end record with this exact |
2675 | | * transactionid |
2676 | | * |
2677 | | * when testing for an xid, we MUST test for equality only, since |
2678 | | * transactions are numbered in the order they start, not the order |
2679 | | * they complete. A higher numbered xid will complete before you about |
2680 | | * 50% of the time... |
2681 | | */ |
2682 | 0 | stopsHere = (recordXid == recoveryTargetXid); |
2683 | 0 | } |
2684 | | |
2685 | | /* |
2686 | | * Note: we must fetch recordXtime regardless of recoveryTarget setting. |
2687 | | * We don't expect getRecordTimestamp ever to fail, since we already know |
2688 | | * this is a commit or abort record; but test its result anyway. |
2689 | | */ |
2690 | 0 | if (getRecordTimestamp(record, &recordXtime) && |
2691 | 0 | recoveryTarget == RECOVERY_TARGET_TIME) |
2692 | 0 | { |
2693 | | /* |
2694 | | * There can be many transactions that share the same commit time, so |
2695 | | * we stop after the last one, if we are inclusive, or stop at the |
2696 | | * first one if we are exclusive |
2697 | | */ |
2698 | 0 | if (recoveryTargetInclusive) |
2699 | 0 | stopsHere = (recordXtime > recoveryTargetTime); |
2700 | 0 | else |
2701 | 0 | stopsHere = (recordXtime >= recoveryTargetTime); |
2702 | 0 | } |
2703 | |
|
2704 | 0 | if (stopsHere) |
2705 | 0 | { |
2706 | 0 | recoveryStopAfter = false; |
2707 | 0 | recoveryStopXid = recordXid; |
2708 | 0 | recoveryStopTime = recordXtime; |
2709 | 0 | recoveryStopLSN = InvalidXLogRecPtr; |
2710 | 0 | recoveryStopName[0] = '\0'; |
2711 | |
|
2712 | 0 | if (isCommit) |
2713 | 0 | { |
2714 | 0 | ereport(LOG, |
2715 | 0 | (errmsg("recovery stopping before commit of transaction %u, time %s", |
2716 | 0 | recoveryStopXid, |
2717 | 0 | timestamptz_to_str(recoveryStopTime)))); |
2718 | 0 | } |
2719 | 0 | else |
2720 | 0 | { |
2721 | 0 | ereport(LOG, |
2722 | 0 | (errmsg("recovery stopping before abort of transaction %u, time %s", |
2723 | 0 | recoveryStopXid, |
2724 | 0 | timestamptz_to_str(recoveryStopTime)))); |
2725 | 0 | } |
2726 | 0 | } |
2727 | | |
2728 | 0 | return stopsHere; |
2729 | 0 | } |
2730 | | |
2731 | | /* |
2732 | | * Same as recoveryStopsBefore, but called after applying the record. |
2733 | | * |
2734 | | * We also track the timestamp of the latest applied COMMIT/ABORT |
2735 | | * record in XLogRecoveryCtl->recoveryLastXTime. |
2736 | | */ |
2737 | | static bool |
2738 | | recoveryStopsAfter(XLogReaderState *record) |
2739 | 0 | { |
2740 | 0 | uint8 info; |
2741 | 0 | uint8 xact_info; |
2742 | 0 | uint8 rmid; |
2743 | 0 | TimestampTz recordXtime = 0; |
2744 | | |
2745 | | /* |
2746 | | * Ignore recovery target settings when not in archive recovery (meaning |
2747 | | * we are in crash recovery). |
2748 | | */ |
2749 | 0 | if (!ArchiveRecoveryRequested) |
2750 | 0 | return false; |
2751 | | |
2752 | 0 | info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; |
2753 | 0 | rmid = XLogRecGetRmid(record); |
2754 | | |
2755 | | /* |
2756 | | * There can be many restore points that share the same name; we stop at |
2757 | | * the first one. |
2758 | | */ |
2759 | 0 | if (recoveryTarget == RECOVERY_TARGET_NAME && |
2760 | 0 | rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) |
2761 | 0 | { |
2762 | 0 | xl_restore_point *recordRestorePointData; |
2763 | |
|
2764 | 0 | recordRestorePointData = (xl_restore_point *) XLogRecGetData(record); |
2765 | |
|
2766 | 0 | if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0) |
2767 | 0 | { |
2768 | 0 | recoveryStopAfter = true; |
2769 | 0 | recoveryStopXid = InvalidTransactionId; |
2770 | 0 | recoveryStopLSN = InvalidXLogRecPtr; |
2771 | 0 | (void) getRecordTimestamp(record, &recoveryStopTime); |
2772 | 0 | strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN); |
2773 | |
|
2774 | 0 | ereport(LOG, |
2775 | 0 | (errmsg("recovery stopping at restore point \"%s\", time %s", |
2776 | 0 | recoveryStopName, |
2777 | 0 | timestamptz_to_str(recoveryStopTime)))); |
2778 | 0 | return true; |
2779 | 0 | } |
2780 | 0 | } |
2781 | | |
2782 | | /* Check if the target LSN has been reached */ |
2783 | 0 | if (recoveryTarget == RECOVERY_TARGET_LSN && |
2784 | 0 | recoveryTargetInclusive && |
2785 | 0 | record->ReadRecPtr >= recoveryTargetLSN) |
2786 | 0 | { |
2787 | 0 | recoveryStopAfter = true; |
2788 | 0 | recoveryStopXid = InvalidTransactionId; |
2789 | 0 | recoveryStopLSN = record->ReadRecPtr; |
2790 | 0 | recoveryStopTime = 0; |
2791 | 0 | recoveryStopName[0] = '\0'; |
2792 | 0 | ereport(LOG, |
2793 | 0 | errmsg("recovery stopping after WAL location (LSN) \"%X/%08X\"", |
2794 | 0 | LSN_FORMAT_ARGS(recoveryStopLSN))); |
2795 | 0 | return true; |
2796 | 0 | } |
2797 | | |
2798 | 0 | if (rmid != RM_XACT_ID) |
2799 | 0 | return false; |
2800 | | |
2801 | 0 | xact_info = info & XLOG_XACT_OPMASK; |
2802 | |
|
2803 | 0 | if (xact_info == XLOG_XACT_COMMIT || |
2804 | 0 | xact_info == XLOG_XACT_COMMIT_PREPARED || |
2805 | 0 | xact_info == XLOG_XACT_ABORT || |
2806 | 0 | xact_info == XLOG_XACT_ABORT_PREPARED) |
2807 | 0 | { |
2808 | 0 | TransactionId recordXid; |
2809 | | |
2810 | | /* Update the last applied transaction timestamp */ |
2811 | 0 | if (getRecordTimestamp(record, &recordXtime)) |
2812 | 0 | SetLatestXTime(recordXtime); |
2813 | | |
2814 | | /* Extract the XID of the committed/aborted transaction */ |
2815 | 0 | if (xact_info == XLOG_XACT_COMMIT_PREPARED) |
2816 | 0 | { |
2817 | 0 | xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record); |
2818 | 0 | xl_xact_parsed_commit parsed; |
2819 | |
|
2820 | 0 | ParseCommitRecord(XLogRecGetInfo(record), |
2821 | 0 | xlrec, |
2822 | 0 | &parsed); |
2823 | 0 | recordXid = parsed.twophase_xid; |
2824 | 0 | } |
2825 | 0 | else if (xact_info == XLOG_XACT_ABORT_PREPARED) |
2826 | 0 | { |
2827 | 0 | xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record); |
2828 | 0 | xl_xact_parsed_abort parsed; |
2829 | |
|
2830 | 0 | ParseAbortRecord(XLogRecGetInfo(record), |
2831 | 0 | xlrec, |
2832 | 0 | &parsed); |
2833 | 0 | recordXid = parsed.twophase_xid; |
2834 | 0 | } |
2835 | 0 | else |
2836 | 0 | recordXid = XLogRecGetXid(record); |
2837 | | |
2838 | | /* |
2839 | | * There can be only one transaction end record with this exact |
2840 | | * transactionid |
2841 | | * |
2842 | | * when testing for an xid, we MUST test for equality only, since |
2843 | | * transactions are numbered in the order they start, not the order |
2844 | | * they complete. A higher numbered xid will complete before you about |
2845 | | * 50% of the time... |
2846 | | */ |
2847 | 0 | if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive && |
2848 | 0 | recordXid == recoveryTargetXid) |
2849 | 0 | { |
2850 | 0 | recoveryStopAfter = true; |
2851 | 0 | recoveryStopXid = recordXid; |
2852 | 0 | recoveryStopTime = recordXtime; |
2853 | 0 | recoveryStopLSN = InvalidXLogRecPtr; |
2854 | 0 | recoveryStopName[0] = '\0'; |
2855 | |
|
2856 | 0 | if (xact_info == XLOG_XACT_COMMIT || |
2857 | 0 | xact_info == XLOG_XACT_COMMIT_PREPARED) |
2858 | 0 | { |
2859 | 0 | ereport(LOG, |
2860 | 0 | (errmsg("recovery stopping after commit of transaction %u, time %s", |
2861 | 0 | recoveryStopXid, |
2862 | 0 | timestamptz_to_str(recoveryStopTime)))); |
2863 | 0 | } |
2864 | 0 | else if (xact_info == XLOG_XACT_ABORT || |
2865 | 0 | xact_info == XLOG_XACT_ABORT_PREPARED) |
2866 | 0 | { |
2867 | 0 | ereport(LOG, |
2868 | 0 | (errmsg("recovery stopping after abort of transaction %u, time %s", |
2869 | 0 | recoveryStopXid, |
2870 | 0 | timestamptz_to_str(recoveryStopTime)))); |
2871 | 0 | } |
2872 | 0 | return true; |
2873 | 0 | } |
2874 | 0 | } |
2875 | | |
2876 | | /* Check if we should stop as soon as reaching consistency */ |
2877 | 0 | if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency) |
2878 | 0 | { |
2879 | 0 | ereport(LOG, |
2880 | 0 | (errmsg("recovery stopping after reaching consistency"))); |
2881 | | |
2882 | 0 | recoveryStopAfter = true; |
2883 | 0 | recoveryStopXid = InvalidTransactionId; |
2884 | 0 | recoveryStopTime = 0; |
2885 | 0 | recoveryStopLSN = InvalidXLogRecPtr; |
2886 | 0 | recoveryStopName[0] = '\0'; |
2887 | 0 | return true; |
2888 | 0 | } |
2889 | | |
2890 | 0 | return false; |
2891 | 0 | } |
2892 | | |
2893 | | /* |
2894 | | * Create a comment for the history file to explain why and where |
2895 | | * timeline changed. |
2896 | | */ |
2897 | | static char * |
2898 | | getRecoveryStopReason(void) |
2899 | 0 | { |
2900 | 0 | char reason[200]; |
2901 | |
|
2902 | 0 | if (recoveryTarget == RECOVERY_TARGET_XID) |
2903 | 0 | snprintf(reason, sizeof(reason), |
2904 | 0 | "%s transaction %u", |
2905 | 0 | recoveryStopAfter ? "after" : "before", |
2906 | 0 | recoveryStopXid); |
2907 | 0 | else if (recoveryTarget == RECOVERY_TARGET_TIME) |
2908 | 0 | snprintf(reason, sizeof(reason), |
2909 | 0 | "%s %s\n", |
2910 | 0 | recoveryStopAfter ? "after" : "before", |
2911 | 0 | timestamptz_to_str(recoveryStopTime)); |
2912 | 0 | else if (recoveryTarget == RECOVERY_TARGET_LSN) |
2913 | 0 | snprintf(reason, sizeof(reason), |
2914 | 0 | "%s LSN %X/%08X\n", |
2915 | 0 | recoveryStopAfter ? "after" : "before", |
2916 | 0 | LSN_FORMAT_ARGS(recoveryStopLSN)); |
2917 | 0 | else if (recoveryTarget == RECOVERY_TARGET_NAME) |
2918 | 0 | snprintf(reason, sizeof(reason), |
2919 | 0 | "at restore point \"%s\"", |
2920 | 0 | recoveryStopName); |
2921 | 0 | else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) |
2922 | 0 | snprintf(reason, sizeof(reason), "reached consistency"); |
2923 | 0 | else |
2924 | 0 | snprintf(reason, sizeof(reason), "no recovery target specified"); |
2925 | |
|
2926 | 0 | return pstrdup(reason); |
2927 | 0 | } |
2928 | | |
2929 | | /* |
2930 | | * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED. |
2931 | | * |
2932 | | * endOfRecovery is true if the recovery target is reached and |
2933 | | * the paused state starts at the end of recovery because of |
2934 | | * recovery_target_action=pause, and false otherwise. |
2935 | | */ |
2936 | | static void |
2937 | | recoveryPausesHere(bool endOfRecovery) |
2938 | 0 | { |
2939 | | /* Don't pause unless users can connect! */ |
2940 | 0 | if (!LocalHotStandbyActive) |
2941 | 0 | return; |
2942 | | |
2943 | | /* Don't pause after standby promotion has been triggered */ |
2944 | 0 | if (LocalPromoteIsTriggered) |
2945 | 0 | return; |
2946 | | |
2947 | 0 | if (endOfRecovery) |
2948 | 0 | ereport(LOG, |
2949 | 0 | (errmsg("pausing at the end of recovery"), |
2950 | 0 | errhint("Execute pg_wal_replay_resume() to promote."))); |
2951 | 0 | else |
2952 | 0 | ereport(LOG, |
2953 | 0 | (errmsg("recovery has paused"), |
2954 | 0 | errhint("Execute pg_wal_replay_resume() to continue."))); |
2955 | | |
2956 | | /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */ |
2957 | 0 | while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED) |
2958 | 0 | { |
2959 | 0 | ProcessStartupProcInterrupts(); |
2960 | 0 | if (CheckForStandbyTrigger()) |
2961 | 0 | return; |
2962 | | |
2963 | | /* |
2964 | | * If recovery pause is requested then set it paused. While we are in |
2965 | | * the loop, user might resume and pause again so set this every time. |
2966 | | */ |
2967 | 0 | ConfirmRecoveryPaused(); |
2968 | | |
2969 | | /* |
2970 | | * We wait on a condition variable that will wake us as soon as the |
2971 | | * pause ends, but we use a timeout so we can check the above exit |
2972 | | * condition periodically too. |
2973 | | */ |
2974 | 0 | ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000, |
2975 | 0 | WAIT_EVENT_RECOVERY_PAUSE); |
2976 | 0 | } |
2977 | 0 | ConditionVariableCancelSleep(); |
2978 | 0 | } |
2979 | | |
2980 | | /* |
2981 | | * When recovery_min_apply_delay is set, we wait long enough to make sure |
2982 | | * certain record types are applied at least that interval behind the primary. |
2983 | | * |
2984 | | * Returns true if we waited. |
2985 | | * |
2986 | | * Note that the delay is calculated between the WAL record log time and |
2987 | | * the current time on standby. We would prefer to keep track of when this |
2988 | | * standby received each WAL record, which would allow a more consistent |
2989 | | * approach and one not affected by time synchronisation issues, but that |
2990 | | * is significantly more effort and complexity for little actual gain in |
2991 | | * usability. |
2992 | | */ |
2993 | | static bool |
2994 | | recoveryApplyDelay(XLogReaderState *record) |
2995 | 0 | { |
2996 | 0 | uint8 xact_info; |
2997 | 0 | TimestampTz xtime; |
2998 | 0 | TimestampTz delayUntil; |
2999 | 0 | long msecs; |
3000 | | |
3001 | | /* nothing to do if no delay configured */ |
3002 | 0 | if (recovery_min_apply_delay <= 0) |
3003 | 0 | return false; |
3004 | | |
3005 | | /* no delay is applied on a database not yet consistent */ |
3006 | 0 | if (!reachedConsistency) |
3007 | 0 | return false; |
3008 | | |
3009 | | /* nothing to do if crash recovery is requested */ |
3010 | 0 | if (!ArchiveRecoveryRequested) |
3011 | 0 | return false; |
3012 | | |
3013 | | /* |
3014 | | * Is it a COMMIT record? |
3015 | | * |
3016 | | * We deliberately choose not to delay aborts since they have no effect on |
3017 | | * MVCC. We already allow replay of records that don't have a timestamp, |
3018 | | * so there is already opportunity for issues caused by early conflicts on |
3019 | | * standbys. |
3020 | | */ |
3021 | 0 | if (XLogRecGetRmid(record) != RM_XACT_ID) |
3022 | 0 | return false; |
3023 | | |
3024 | 0 | xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK; |
3025 | |
|
3026 | 0 | if (xact_info != XLOG_XACT_COMMIT && |
3027 | 0 | xact_info != XLOG_XACT_COMMIT_PREPARED) |
3028 | 0 | return false; |
3029 | | |
3030 | 0 | if (!getRecordTimestamp(record, &xtime)) |
3031 | 0 | return false; |
3032 | | |
3033 | 0 | delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay); |
3034 | | |
3035 | | /* |
3036 | | * Exit without arming the latch if it's already past time to apply this |
3037 | | * record |
3038 | | */ |
3039 | 0 | msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil); |
3040 | 0 | if (msecs <= 0) |
3041 | 0 | return false; |
3042 | | |
3043 | 0 | while (true) |
3044 | 0 | { |
3045 | 0 | ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); |
3046 | | |
3047 | | /* This might change recovery_min_apply_delay. */ |
3048 | 0 | ProcessStartupProcInterrupts(); |
3049 | |
|
3050 | 0 | if (CheckForStandbyTrigger()) |
3051 | 0 | break; |
3052 | | |
3053 | | /* |
3054 | | * Recalculate delayUntil as recovery_min_apply_delay could have |
3055 | | * changed while waiting in this loop. |
3056 | | */ |
3057 | 0 | delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay); |
3058 | | |
3059 | | /* |
3060 | | * Wait for difference between GetCurrentTimestamp() and delayUntil. |
3061 | | */ |
3062 | 0 | msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), |
3063 | 0 | delayUntil); |
3064 | |
|
3065 | 0 | if (msecs <= 0) |
3066 | 0 | break; |
3067 | | |
3068 | 0 | elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs); |
3069 | | |
3070 | 0 | (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch, |
3071 | 0 | WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, |
3072 | 0 | msecs, |
3073 | 0 | WAIT_EVENT_RECOVERY_APPLY_DELAY); |
3074 | 0 | } |
3075 | 0 | return true; |
3076 | 0 | } |
3077 | | |
3078 | | /* |
3079 | | * Get the current state of the recovery pause request. |
3080 | | */ |
3081 | | RecoveryPauseState |
3082 | | GetRecoveryPauseState(void) |
3083 | 0 | { |
3084 | 0 | RecoveryPauseState state; |
3085 | |
|
3086 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
3087 | 0 | state = XLogRecoveryCtl->recoveryPauseState; |
3088 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
3089 | |
|
3090 | 0 | return state; |
3091 | 0 | } |
3092 | | |
3093 | | /* |
3094 | | * Set the recovery pause state. |
3095 | | * |
3096 | | * If recovery pause is requested then sets the recovery pause state to |
3097 | | * 'pause requested' if it is not already 'paused'. Otherwise, sets it |
3098 | | * to 'not paused' to resume the recovery. The recovery pause will be |
3099 | | * confirmed by the ConfirmRecoveryPaused. |
3100 | | */ |
3101 | | void |
3102 | | SetRecoveryPause(bool recoveryPause) |
3103 | 0 | { |
3104 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
3105 | |
|
3106 | 0 | if (!recoveryPause) |
3107 | 0 | XLogRecoveryCtl->recoveryPauseState = RECOVERY_NOT_PAUSED; |
3108 | 0 | else if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED) |
3109 | 0 | XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED; |
3110 | |
|
3111 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
3112 | |
|
3113 | 0 | if (!recoveryPause) |
3114 | 0 | ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV); |
3115 | 0 | } |
3116 | | |
3117 | | /* |
3118 | | * Confirm the recovery pause by setting the recovery pause state to |
3119 | | * RECOVERY_PAUSED. |
3120 | | */ |
3121 | | static void |
3122 | | ConfirmRecoveryPaused(void) |
3123 | 0 | { |
3124 | | /* If recovery pause is requested then set it paused */ |
3125 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
3126 | 0 | if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED) |
3127 | 0 | XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED; |
3128 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
3129 | 0 | } |
3130 | | |
3131 | | |
3132 | | /* |
3133 | | * Attempt to read the next XLOG record. |
3134 | | * |
3135 | | * Before first call, the reader needs to be positioned to the first record |
3136 | | * by calling XLogPrefetcherBeginRead(). |
3137 | | * |
3138 | | * If no valid record is available, returns NULL, or fails if emode is PANIC. |
3139 | | * (emode must be either PANIC, LOG). In standby mode, retries until a valid |
3140 | | * record is available. |
3141 | | */ |
3142 | | static XLogRecord * |
3143 | | ReadRecord(XLogPrefetcher *xlogprefetcher, int emode, |
3144 | | bool fetching_ckpt, TimeLineID replayTLI) |
3145 | 0 | { |
3146 | 0 | XLogRecord *record; |
3147 | 0 | XLogReaderState *xlogreader = XLogPrefetcherGetReader(xlogprefetcher); |
3148 | 0 | XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data; |
3149 | | |
3150 | | /* Pass through parameters to XLogPageRead */ |
3151 | 0 | private->fetching_ckpt = fetching_ckpt; |
3152 | 0 | private->emode = emode; |
3153 | 0 | private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr); |
3154 | 0 | private->replayTLI = replayTLI; |
3155 | | |
3156 | | /* This is the first attempt to read this page. */ |
3157 | 0 | lastSourceFailed = false; |
3158 | |
|
3159 | 0 | for (;;) |
3160 | 0 | { |
3161 | 0 | char *errormsg; |
3162 | |
|
3163 | 0 | record = XLogPrefetcherReadRecord(xlogprefetcher, &errormsg); |
3164 | 0 | if (record == NULL) |
3165 | 0 | { |
3166 | | /* |
3167 | | * When we find that WAL ends in an incomplete record, keep track |
3168 | | * of that record. After recovery is done, we'll write a record |
3169 | | * to indicate to downstream WAL readers that that portion is to |
3170 | | * be ignored. |
3171 | | * |
3172 | | * However, when ArchiveRecoveryRequested = true, we're going to |
3173 | | * switch to a new timeline at the end of recovery. We will only |
3174 | | * copy WAL over to the new timeline up to the end of the last |
3175 | | * complete record, so if we did this, we would later create an |
3176 | | * overwrite contrecord in the wrong place, breaking everything. |
3177 | | */ |
3178 | 0 | if (!ArchiveRecoveryRequested && |
3179 | 0 | !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr)) |
3180 | 0 | { |
3181 | 0 | abortedRecPtr = xlogreader->abortedRecPtr; |
3182 | 0 | missingContrecPtr = xlogreader->missingContrecPtr; |
3183 | 0 | } |
3184 | |
|
3185 | 0 | if (readFile >= 0) |
3186 | 0 | { |
3187 | 0 | close(readFile); |
3188 | 0 | readFile = -1; |
3189 | 0 | } |
3190 | | |
3191 | | /* |
3192 | | * We only end up here without a message when XLogPageRead() |
3193 | | * failed - in that case we already logged something. In |
3194 | | * StandbyMode that only happens if we have been triggered, so we |
3195 | | * shouldn't loop anymore in that case. |
3196 | | */ |
3197 | 0 | if (errormsg) |
3198 | 0 | ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), |
3199 | 0 | (errmsg_internal("%s", errormsg) /* already translated */ )); |
3200 | 0 | } |
3201 | | |
3202 | | /* |
3203 | | * Check page TLI is one of the expected values. |
3204 | | */ |
3205 | 0 | else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs)) |
3206 | 0 | { |
3207 | 0 | char fname[MAXFNAMELEN]; |
3208 | 0 | XLogSegNo segno; |
3209 | 0 | int32 offset; |
3210 | |
|
3211 | 0 | XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size); |
3212 | 0 | offset = XLogSegmentOffset(xlogreader->latestPagePtr, |
3213 | 0 | wal_segment_size); |
3214 | 0 | XLogFileName(fname, xlogreader->seg.ws_tli, segno, |
3215 | 0 | wal_segment_size); |
3216 | 0 | ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), |
3217 | 0 | errmsg("unexpected timeline ID %u in WAL segment %s, LSN %X/%08X, offset %u", |
3218 | 0 | xlogreader->latestPageTLI, |
3219 | 0 | fname, |
3220 | 0 | LSN_FORMAT_ARGS(xlogreader->latestPagePtr), |
3221 | 0 | offset)); |
3222 | 0 | record = NULL; |
3223 | 0 | } |
3224 | | |
3225 | 0 | if (record) |
3226 | 0 | { |
3227 | | /* Great, got a record */ |
3228 | 0 | return record; |
3229 | 0 | } |
3230 | 0 | else |
3231 | 0 | { |
3232 | | /* No valid record available from this source */ |
3233 | 0 | lastSourceFailed = true; |
3234 | | |
3235 | | /* |
3236 | | * If archive recovery was requested, but we were still doing |
3237 | | * crash recovery, switch to archive recovery and retry using the |
3238 | | * offline archive. We have now replayed all the valid WAL in |
3239 | | * pg_wal, so we are presumably now consistent. |
3240 | | * |
3241 | | * We require that there's at least some valid WAL present in |
3242 | | * pg_wal, however (!fetching_ckpt). We could recover using the |
3243 | | * WAL from the archive, even if pg_wal is completely empty, but |
3244 | | * we'd have no idea how far we'd have to replay to reach |
3245 | | * consistency. So err on the safe side and give up. |
3246 | | */ |
3247 | 0 | if (!InArchiveRecovery && ArchiveRecoveryRequested && |
3248 | 0 | !fetching_ckpt) |
3249 | 0 | { |
3250 | 0 | ereport(DEBUG1, |
3251 | 0 | (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery"))); |
3252 | 0 | InArchiveRecovery = true; |
3253 | 0 | if (StandbyModeRequested) |
3254 | 0 | EnableStandbyMode(); |
3255 | |
|
3256 | 0 | SwitchIntoArchiveRecovery(xlogreader->EndRecPtr, replayTLI); |
3257 | 0 | minRecoveryPoint = xlogreader->EndRecPtr; |
3258 | 0 | minRecoveryPointTLI = replayTLI; |
3259 | |
|
3260 | 0 | CheckRecoveryConsistency(); |
3261 | | |
3262 | | /* |
3263 | | * Before we retry, reset lastSourceFailed and currentSource |
3264 | | * so that we will check the archive next. |
3265 | | */ |
3266 | 0 | lastSourceFailed = false; |
3267 | 0 | currentSource = XLOG_FROM_ANY; |
3268 | |
|
3269 | 0 | continue; |
3270 | 0 | } |
3271 | | |
3272 | | /* In standby mode, loop back to retry. Otherwise, give up. */ |
3273 | 0 | if (StandbyMode && !CheckForStandbyTrigger()) |
3274 | 0 | continue; |
3275 | 0 | else |
3276 | 0 | return NULL; |
3277 | 0 | } |
3278 | 0 | } |
3279 | 0 | } |
3280 | | |
3281 | | /* |
3282 | | * Read the XLOG page containing targetPagePtr into readBuf (if not read |
3283 | | * already). Returns number of bytes read, if the page is read successfully, |
3284 | | * or XLREAD_FAIL in case of errors. When errors occur, they are ereport'ed, |
3285 | | * but only if they have not been previously reported. |
3286 | | * |
3287 | | * See XLogReaderRoutine.page_read for more details. |
3288 | | * |
3289 | | * While prefetching, xlogreader->nonblocking may be set. In that case, |
3290 | | * returns XLREAD_WOULDBLOCK if we'd otherwise have to wait for more WAL. |
3291 | | * |
3292 | | * This is responsible for restoring files from archive as needed, as well |
3293 | | * as for waiting for the requested WAL record to arrive in standby mode. |
3294 | | * |
3295 | | * xlogreader->private_data->emode specifies the log level used for reporting |
3296 | | * "file not found" or "end of WAL" situations in archive recovery, or in |
3297 | | * standby mode when promotion is triggered. If set to WARNING or below, |
3298 | | * XLogPageRead() returns XLREAD_FAIL in those situations, on higher log |
3299 | | * levels the ereport() won't return. |
3300 | | * |
3301 | | * In standby mode, if after a successful return of XLogPageRead() the |
3302 | | * caller finds the record it's interested in to be broken, it should |
3303 | | * ereport the error with the level determined by |
3304 | | * emode_for_corrupt_record(), and then set lastSourceFailed |
3305 | | * and call XLogPageRead() again with the same arguments. This lets |
3306 | | * XLogPageRead() to try fetching the record from another source, or to |
3307 | | * sleep and retry. |
3308 | | */ |
3309 | | static int |
3310 | | XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, |
3311 | | XLogRecPtr targetRecPtr, char *readBuf) |
3312 | 0 | { |
3313 | 0 | XLogPageReadPrivate *private = |
3314 | 0 | (XLogPageReadPrivate *) xlogreader->private_data; |
3315 | 0 | int emode = private->emode; |
3316 | 0 | uint32 targetPageOff; |
3317 | 0 | XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY; |
3318 | 0 | int r; |
3319 | 0 | instr_time io_start; |
3320 | |
|
3321 | 0 | XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size); |
3322 | 0 | targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size); |
3323 | | |
3324 | | /* |
3325 | | * See if we need to switch to a new segment because the requested record |
3326 | | * is not in the currently open one. |
3327 | | */ |
3328 | 0 | if (readFile >= 0 && |
3329 | 0 | !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size)) |
3330 | 0 | { |
3331 | | /* |
3332 | | * Request a restartpoint if we've replayed too much xlog since the |
3333 | | * last one. |
3334 | | */ |
3335 | 0 | if (ArchiveRecoveryRequested && IsUnderPostmaster) |
3336 | 0 | { |
3337 | 0 | if (XLogCheckpointNeeded(readSegNo)) |
3338 | 0 | { |
3339 | 0 | (void) GetRedoRecPtr(); |
3340 | 0 | if (XLogCheckpointNeeded(readSegNo)) |
3341 | 0 | RequestCheckpoint(CHECKPOINT_CAUSE_XLOG); |
3342 | 0 | } |
3343 | 0 | } |
3344 | |
|
3345 | 0 | close(readFile); |
3346 | 0 | readFile = -1; |
3347 | 0 | readSource = XLOG_FROM_ANY; |
3348 | 0 | } |
3349 | |
|
3350 | 0 | XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size); |
3351 | |
|
3352 | 0 | retry: |
3353 | | /* See if we need to retrieve more data */ |
3354 | 0 | if (readFile < 0 || |
3355 | 0 | (readSource == XLOG_FROM_STREAM && |
3356 | 0 | flushedUpto < targetPagePtr + reqLen)) |
3357 | 0 | { |
3358 | 0 | if (readFile >= 0 && |
3359 | 0 | xlogreader->nonblocking && |
3360 | 0 | readSource == XLOG_FROM_STREAM && |
3361 | 0 | flushedUpto < targetPagePtr + reqLen) |
3362 | 0 | return XLREAD_WOULDBLOCK; |
3363 | | |
3364 | 0 | switch (WaitForWALToBecomeAvailable(targetPagePtr + reqLen, |
3365 | 0 | private->randAccess, |
3366 | 0 | private->fetching_ckpt, |
3367 | 0 | targetRecPtr, |
3368 | 0 | private->replayTLI, |
3369 | 0 | xlogreader->EndRecPtr, |
3370 | 0 | xlogreader->nonblocking)) |
3371 | 0 | { |
3372 | 0 | case XLREAD_WOULDBLOCK: |
3373 | 0 | return XLREAD_WOULDBLOCK; |
3374 | 0 | case XLREAD_FAIL: |
3375 | 0 | if (readFile >= 0) |
3376 | 0 | close(readFile); |
3377 | 0 | readFile = -1; |
3378 | 0 | readLen = 0; |
3379 | 0 | readSource = XLOG_FROM_ANY; |
3380 | 0 | return XLREAD_FAIL; |
3381 | 0 | case XLREAD_SUCCESS: |
3382 | 0 | break; |
3383 | 0 | } |
3384 | 0 | } |
3385 | | |
3386 | | /* |
3387 | | * At this point, we have the right segment open and if we're streaming we |
3388 | | * know the requested record is in it. |
3389 | | */ |
3390 | 0 | Assert(readFile != -1); |
3391 | | |
3392 | | /* |
3393 | | * If the current segment is being streamed from the primary, calculate |
3394 | | * how much of the current page we have received already. We know the |
3395 | | * requested record has been received, but this is for the benefit of |
3396 | | * future calls, to allow quick exit at the top of this function. |
3397 | | */ |
3398 | 0 | if (readSource == XLOG_FROM_STREAM) |
3399 | 0 | { |
3400 | 0 | if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ)) |
3401 | 0 | readLen = XLOG_BLCKSZ; |
3402 | 0 | else |
3403 | 0 | readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) - |
3404 | 0 | targetPageOff; |
3405 | 0 | } |
3406 | 0 | else |
3407 | 0 | readLen = XLOG_BLCKSZ; |
3408 | | |
3409 | | /* Read the requested page */ |
3410 | 0 | readOff = targetPageOff; |
3411 | | |
3412 | | /* Measure I/O timing when reading segment */ |
3413 | 0 | io_start = pgstat_prepare_io_time(track_wal_io_timing); |
3414 | |
|
3415 | 0 | pgstat_report_wait_start(WAIT_EVENT_WAL_READ); |
3416 | 0 | r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff); |
3417 | 0 | if (r != XLOG_BLCKSZ) |
3418 | 0 | { |
3419 | 0 | char fname[MAXFNAMELEN]; |
3420 | 0 | int save_errno = errno; |
3421 | |
|
3422 | 0 | pgstat_report_wait_end(); |
3423 | |
|
3424 | 0 | pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ, |
3425 | 0 | io_start, 1, r); |
3426 | |
|
3427 | 0 | XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size); |
3428 | 0 | if (r < 0) |
3429 | 0 | { |
3430 | 0 | errno = save_errno; |
3431 | 0 | ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), |
3432 | 0 | (errcode_for_file_access(), |
3433 | 0 | errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: %m", |
3434 | 0 | fname, LSN_FORMAT_ARGS(targetPagePtr), |
3435 | 0 | readOff))); |
3436 | 0 | } |
3437 | 0 | else |
3438 | 0 | ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen), |
3439 | 0 | (errcode(ERRCODE_DATA_CORRUPTED), |
3440 | 0 | errmsg("could not read from WAL segment %s, LSN %X/%08X, offset %u: read %d of %zu", |
3441 | 0 | fname, LSN_FORMAT_ARGS(targetPagePtr), |
3442 | 0 | readOff, r, (Size) XLOG_BLCKSZ))); |
3443 | 0 | goto next_record_is_invalid; |
3444 | 0 | } |
3445 | 0 | pgstat_report_wait_end(); |
3446 | |
|
3447 | 0 | pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_READ, |
3448 | 0 | io_start, 1, r); |
3449 | |
|
3450 | 0 | Assert(targetSegNo == readSegNo); |
3451 | 0 | Assert(targetPageOff == readOff); |
3452 | 0 | Assert(reqLen <= readLen); |
3453 | |
|
3454 | 0 | xlogreader->seg.ws_tli = curFileTLI; |
3455 | | |
3456 | | /* |
3457 | | * Check the page header immediately, so that we can retry immediately if |
3458 | | * it's not valid. This may seem unnecessary, because ReadPageInternal() |
3459 | | * validates the page header anyway, and would propagate the failure up to |
3460 | | * ReadRecord(), which would retry. However, there's a corner case with |
3461 | | * continuation records, if a record is split across two pages such that |
3462 | | * we would need to read the two pages from different sources across two |
3463 | | * WAL segments. |
3464 | | * |
3465 | | * The first page is only available locally, in pg_wal, because it's |
3466 | | * already been recycled on the primary. The second page, however, is not |
3467 | | * present in pg_wal, and we should stream it from the primary. There is a |
3468 | | * recycled WAL segment present in pg_wal, with garbage contents, however. |
3469 | | * We would read the first page from the local WAL segment, but when |
3470 | | * reading the second page, we would read the bogus, recycled, WAL |
3471 | | * segment. If we didn't catch that case here, we would never recover, |
3472 | | * because ReadRecord() would retry reading the whole record from the |
3473 | | * beginning. |
3474 | | * |
3475 | | * Of course, this only catches errors in the page header, which is what |
3476 | | * happens in the case of a recycled WAL segment. Other kinds of errors or |
3477 | | * corruption still has the same problem. But this at least fixes the |
3478 | | * common case, which can happen as part of normal operation. |
3479 | | * |
3480 | | * Validating the page header is cheap enough that doing it twice |
3481 | | * shouldn't be a big deal from a performance point of view. |
3482 | | * |
3483 | | * When not in standby mode, an invalid page header should cause recovery |
3484 | | * to end, not retry reading the page, so we don't need to validate the |
3485 | | * page header here for the retry. Instead, ReadPageInternal() is |
3486 | | * responsible for the validation. |
3487 | | */ |
3488 | 0 | if (StandbyMode && |
3489 | 0 | (targetPagePtr % wal_segment_size) == 0 && |
3490 | 0 | !XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf)) |
3491 | 0 | { |
3492 | | /* |
3493 | | * Emit this error right now then retry this page immediately. Use |
3494 | | * errmsg_internal() because the message was already translated. |
3495 | | */ |
3496 | 0 | if (xlogreader->errormsg_buf[0]) |
3497 | 0 | ereport(emode_for_corrupt_record(emode, xlogreader->EndRecPtr), |
3498 | 0 | (errmsg_internal("%s", xlogreader->errormsg_buf))); |
3499 | | |
3500 | | /* reset any error XLogReaderValidatePageHeader() might have set */ |
3501 | 0 | XLogReaderResetError(xlogreader); |
3502 | 0 | goto next_record_is_invalid; |
3503 | 0 | } |
3504 | | |
3505 | 0 | return readLen; |
3506 | | |
3507 | 0 | next_record_is_invalid: |
3508 | | |
3509 | | /* |
3510 | | * If we're reading ahead, give up fast. Retries and error reporting will |
3511 | | * be handled by a later read when recovery catches up to this point. |
3512 | | */ |
3513 | 0 | if (xlogreader->nonblocking) |
3514 | 0 | return XLREAD_WOULDBLOCK; |
3515 | | |
3516 | 0 | lastSourceFailed = true; |
3517 | |
|
3518 | 0 | if (readFile >= 0) |
3519 | 0 | close(readFile); |
3520 | 0 | readFile = -1; |
3521 | 0 | readLen = 0; |
3522 | 0 | readSource = XLOG_FROM_ANY; |
3523 | | |
3524 | | /* In standby-mode, keep trying */ |
3525 | 0 | if (StandbyMode) |
3526 | 0 | goto retry; |
3527 | 0 | else |
3528 | 0 | return XLREAD_FAIL; |
3529 | 0 | } |
3530 | | |
3531 | | /* |
3532 | | * Open the WAL segment containing WAL location 'RecPtr'. |
3533 | | * |
3534 | | * The segment can be fetched via restore_command, or via walreceiver having |
3535 | | * streamed the record, or it can already be present in pg_wal. Checking |
3536 | | * pg_wal is mainly for crash recovery, but it will be polled in standby mode |
3537 | | * too, in case someone copies a new segment directly to pg_wal. That is not |
3538 | | * documented or recommended, though. |
3539 | | * |
3540 | | * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should |
3541 | | * prepare to read WAL starting from RedoStartLSN after this. |
3542 | | * |
3543 | | * 'RecPtr' might not point to the beginning of the record we're interested |
3544 | | * in, it might also point to the page or segment header. In that case, |
3545 | | * 'tliRecPtr' is the position of the WAL record we're interested in. It is |
3546 | | * used to decide which timeline to stream the requested WAL from. |
3547 | | * |
3548 | | * 'replayLSN' is the current replay LSN, so that if we scan for new |
3549 | | * timelines, we can reject a switch to a timeline that branched off before |
3550 | | * this point. |
3551 | | * |
3552 | | * If the record is not immediately available, the function returns
3553 | | * XLREAD_FAIL if we're not in standby mode. In standby mode, waits for it
3554 | | * to become available.
3555 | | * |
3556 | | * When the requested record becomes available, the function opens the file |
3557 | | * containing it (if not open already), and returns XLREAD_SUCCESS. When end |
3558 | | * of standby mode is triggered by the user, and there is no more WAL |
3559 | | * available, returns XLREAD_FAIL. |
3560 | | * |
3561 | | * If nonblocking is true, then give up immediately if we can't satisfy the |
3562 | | * request, returning XLREAD_WOULDBLOCK instead of waiting. |
3563 | | */ |
3564 | | static XLogPageReadResult |
3565 | | WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, |
3566 | | bool fetching_ckpt, XLogRecPtr tliRecPtr, |
3567 | | TimeLineID replayTLI, XLogRecPtr replayLSN, |
3568 | | bool nonblocking) |
3569 | 0 | { |
3570 | 0 | static TimestampTz last_fail_time = 0; |
3571 | 0 | TimestampTz now; |
3572 | 0 | bool streaming_reply_sent = false; |
3573 | | |
3574 | | /*------- |
3575 | | * Standby mode is implemented by a state machine: |
3576 | | * |
3577 | | * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just |
3578 | | * pg_wal (XLOG_FROM_PG_WAL) |
3579 | | * 2. Check for promotion trigger request |
3580 | | * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM) |
3581 | | * 4. Rescan timelines |
3582 | | * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1. |
3583 | | * |
3584 | | * Failure to read from the current source advances the state machine to |
3585 | | * the next state. |
3586 | | * |
3587 | | * 'currentSource' indicates the current state. There are no currentSource |
3588 | | * values for "check trigger", "rescan timelines", and "sleep" states, |
3589 | | * those actions are taken when reading from the previous source fails, as |
3590 | | * part of advancing to the next state. |
3591 | | * |
3592 | | * If standby mode is turned off while reading WAL from stream, we move |
3593 | | * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching |
3594 | | * the files (which would be required at end of recovery, e.g., timeline |
3595 | | * history file) from archive or pg_wal. We don't need to kill WAL receiver |
3596 | | * here because it's already stopped when standby mode is turned off at |
3597 | | * the end of recovery. |
3598 | | *------- |
3599 | | */ |
3600 | 0 | if (!InArchiveRecovery) |
3601 | 0 | currentSource = XLOG_FROM_PG_WAL; |
3602 | 0 | else if (currentSource == XLOG_FROM_ANY || |
3603 | 0 | (!StandbyMode && currentSource == XLOG_FROM_STREAM)) |
3604 | 0 | { |
3605 | 0 | lastSourceFailed = false; |
3606 | 0 | currentSource = XLOG_FROM_ARCHIVE; |
3607 | 0 | } |
3608 | |
|
3609 | 0 | for (;;) |
3610 | 0 | { |
3611 | 0 | XLogSource oldSource = currentSource; |
3612 | 0 | bool startWalReceiver = false; |
3613 | | |
3614 | | /* |
3615 | | * First check if we failed to read from the current source, and |
3616 | | * advance the state machine if so. The failure to read might've |
3617 | | * happened outside this function, e.g when a CRC check fails on a |
3618 | | * record, or within this loop. |
3619 | | */ |
3620 | 0 | if (lastSourceFailed) |
3621 | 0 | { |
3622 | | /* |
3623 | | * Don't allow any retry loops to occur during nonblocking |
3624 | | * readahead. Let the caller process everything that has been |
3625 | | * decoded already first. |
3626 | | */ |
3627 | 0 | if (nonblocking) |
3628 | 0 | return XLREAD_WOULDBLOCK; |
3629 | | |
3630 | 0 | switch (currentSource) |
3631 | 0 | { |
3632 | 0 | case XLOG_FROM_ARCHIVE: |
3633 | 0 | case XLOG_FROM_PG_WAL: |
3634 | | |
3635 | | /* |
3636 | | * Check to see if promotion is requested. Note that we do |
3637 | | * this only after failure, so when you promote, we still |
3638 | | * finish replaying as much as we can from archive and |
3639 | | * pg_wal before failover. |
3640 | | */ |
3641 | 0 | if (StandbyMode && CheckForStandbyTrigger()) |
3642 | 0 | { |
3643 | 0 | XLogShutdownWalRcv(); |
3644 | 0 | return XLREAD_FAIL; |
3645 | 0 | } |
3646 | | |
3647 | | /* |
3648 | | * Not in standby mode, and we've now tried the archive |
3649 | | * and pg_wal. |
3650 | | */ |
3651 | 0 | if (!StandbyMode) |
3652 | 0 | return XLREAD_FAIL; |
3653 | | |
3654 | | /* |
3655 | | * Move to XLOG_FROM_STREAM state, and set to start a |
3656 | | * walreceiver if necessary. |
3657 | | */ |
3658 | 0 | currentSource = XLOG_FROM_STREAM; |
3659 | 0 | startWalReceiver = true; |
3660 | 0 | break; |
3661 | | |
3662 | 0 | case XLOG_FROM_STREAM: |
3663 | | |
3664 | | /* |
3665 | | * Failure while streaming. Most likely, we got here |
3666 | | * because streaming replication was terminated, or |
3667 | | * promotion was triggered. But we also get here if we |
3668 | | * find an invalid record in the WAL streamed from the |
3669 | | * primary, in which case something is seriously wrong. |
3670 | | * There's little chance that the problem will just go |
3671 | | * away, but PANIC is not good for availability either, |
3672 | | * especially in hot standby mode. So, we treat that the |
3673 | | * same as disconnection, and retry from archive/pg_wal |
3674 | | * again. The WAL in the archive should be identical to |
3675 | | * what was streamed, so it's unlikely that it helps, but |
3676 | | * one can hope... |
3677 | | */ |
3678 | | |
3679 | | /* |
3680 | | * We should be able to move to XLOG_FROM_STREAM only in |
3681 | | * standby mode. |
3682 | | */ |
3683 | 0 | Assert(StandbyMode); |
3684 | | |
3685 | | /* |
3686 | | * Before we leave XLOG_FROM_STREAM state, make sure that |
3687 | | * walreceiver is not active, so that it won't overwrite |
3688 | | * WAL that we restore from archive. |
3689 | | */ |
3690 | 0 | XLogShutdownWalRcv(); |
3691 | | |
3692 | | /* |
3693 | | * Before we sleep, re-scan for possible new timelines if |
3694 | | * we were requested to recover to the latest timeline. |
3695 | | */ |
3696 | 0 | if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) |
3697 | 0 | { |
3698 | 0 | if (rescanLatestTimeLine(replayTLI, replayLSN)) |
3699 | 0 | { |
3700 | 0 | currentSource = XLOG_FROM_ARCHIVE; |
3701 | 0 | break; |
3702 | 0 | } |
3703 | 0 | } |
3704 | | |
3705 | | /* |
3706 | | * XLOG_FROM_STREAM is the last state in our state |
3707 | | * machine, so we've exhausted all the options for |
3708 | | * obtaining the requested WAL. We're going to loop back |
3709 | | * and retry from the archive, but if it hasn't been long |
3710 | | * since last attempt, sleep wal_retrieve_retry_interval |
3711 | | * milliseconds to avoid busy-waiting. |
3712 | | */ |
3713 | 0 | now = GetCurrentTimestamp(); |
3714 | 0 | if (!TimestampDifferenceExceeds(last_fail_time, now, |
3715 | 0 | wal_retrieve_retry_interval)) |
3716 | 0 | { |
3717 | 0 | long wait_time; |
3718 | |
|
3719 | 0 | wait_time = wal_retrieve_retry_interval - |
3720 | 0 | TimestampDifferenceMilliseconds(last_fail_time, now); |
3721 | |
|
3722 | 0 | elog(LOG, "waiting for WAL to become available at %X/%08X", |
3723 | 0 | LSN_FORMAT_ARGS(RecPtr)); |
3724 | | |
3725 | | /* Do background tasks that might benefit us later. */ |
3726 | 0 | KnownAssignedTransactionIdsIdleMaintenance(); |
3727 | |
|
3728 | 0 | (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch, |
3729 | 0 | WL_LATCH_SET | WL_TIMEOUT | |
3730 | 0 | WL_EXIT_ON_PM_DEATH, |
3731 | 0 | wait_time, |
3732 | 0 | WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL); |
3733 | 0 | ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); |
3734 | 0 | now = GetCurrentTimestamp(); |
3735 | | |
3736 | | /* Handle interrupt signals of startup process */ |
3737 | 0 | ProcessStartupProcInterrupts(); |
3738 | 0 | } |
3739 | 0 | last_fail_time = now; |
3740 | 0 | currentSource = XLOG_FROM_ARCHIVE; |
3741 | 0 | break; |
3742 | | |
3743 | 0 | default: |
3744 | 0 | elog(ERROR, "unexpected WAL source %d", currentSource); |
3745 | 0 | } |
3746 | 0 | } |
3747 | 0 | else if (currentSource == XLOG_FROM_PG_WAL) |
3748 | 0 | { |
3749 | | /* |
3750 | | * We just successfully read a file in pg_wal. We prefer files in |
3751 | | * the archive over ones in pg_wal, so try the next file again |
3752 | | * from the archive first. |
3753 | | */ |
3754 | 0 | if (InArchiveRecovery) |
3755 | 0 | currentSource = XLOG_FROM_ARCHIVE; |
3756 | 0 | } |
3757 | | |
3758 | 0 | if (currentSource != oldSource) |
3759 | 0 | elog(DEBUG2, "switched WAL source from %s to %s after %s", |
3760 | 0 | xlogSourceNames[oldSource], xlogSourceNames[currentSource], |
3761 | 0 | lastSourceFailed ? "failure" : "success"); |
3762 | | |
3763 | | /* |
3764 | | * We've now handled possible failure. Try to read from the chosen |
3765 | | * source. |
3766 | | */ |
3767 | 0 | lastSourceFailed = false; |
3768 | |
|
3769 | 0 | switch (currentSource) |
3770 | 0 | { |
3771 | 0 | case XLOG_FROM_ARCHIVE: |
3772 | 0 | case XLOG_FROM_PG_WAL: |
3773 | | |
3774 | | /* |
3775 | | * WAL receiver must not be running when reading WAL from |
3776 | | * archive or pg_wal. |
3777 | | */ |
3778 | 0 | Assert(!WalRcvStreaming()); |
3779 | | |
3780 | | /* Close any old file we might have open. */ |
3781 | 0 | if (readFile >= 0) |
3782 | 0 | { |
3783 | 0 | close(readFile); |
3784 | 0 | readFile = -1; |
3785 | 0 | } |
3786 | | /* Reset curFileTLI if random fetch. */ |
3787 | 0 | if (randAccess) |
3788 | 0 | curFileTLI = 0; |
3789 | | |
3790 | | /* |
3791 | | * Try to restore the file from archive, or read an existing |
3792 | | * file from pg_wal. |
3793 | | */ |
3794 | 0 | readFile = XLogFileReadAnyTLI(readSegNo, |
3795 | 0 | currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY : |
3796 | 0 | currentSource); |
3797 | 0 | if (readFile >= 0) |
3798 | 0 | return XLREAD_SUCCESS; /* success! */ |
3799 | | |
3800 | | /* |
3801 | | * Nope, not found in archive or pg_wal. |
3802 | | */ |
3803 | 0 | lastSourceFailed = true; |
3804 | 0 | break; |
3805 | | |
3806 | 0 | case XLOG_FROM_STREAM: |
3807 | 0 | { |
3808 | 0 | bool havedata; |
3809 | | |
3810 | | /* |
3811 | | * We should be able to move to XLOG_FROM_STREAM only in |
3812 | | * standby mode. |
3813 | | */ |
3814 | 0 | Assert(StandbyMode); |
3815 | | |
3816 | | /* |
3817 | | * First, shutdown walreceiver if its restart has been |
3818 | | * requested -- but no point if we're already slated for |
3819 | | * starting it. |
3820 | | */ |
3821 | 0 | if (pendingWalRcvRestart && !startWalReceiver) |
3822 | 0 | { |
3823 | 0 | XLogShutdownWalRcv(); |
3824 | | |
3825 | | /* |
3826 | | * Re-scan for possible new timelines if we were |
3827 | | * requested to recover to the latest timeline. |
3828 | | */ |
3829 | 0 | if (recoveryTargetTimeLineGoal == |
3830 | 0 | RECOVERY_TARGET_TIMELINE_LATEST) |
3831 | 0 | rescanLatestTimeLine(replayTLI, replayLSN); |
3832 | |
|
3833 | 0 | startWalReceiver = true; |
3834 | 0 | } |
3835 | 0 | pendingWalRcvRestart = false; |
3836 | | |
3837 | | /* |
3838 | | * Launch walreceiver if needed. |
3839 | | * |
3840 | | * If fetching_ckpt is true, RecPtr points to the initial |
3841 | | * checkpoint location. In that case, we use RedoStartLSN |
3842 | | * as the streaming start position instead of RecPtr, so |
3843 | | * that when we later jump backwards to start redo at |
3844 | | * RedoStartLSN, we will have the logs streamed already. |
3845 | | */ |
3846 | 0 | if (startWalReceiver && |
3847 | 0 | PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0) |
3848 | 0 | { |
3849 | 0 | XLogRecPtr ptr; |
3850 | 0 | TimeLineID tli; |
3851 | |
|
3852 | 0 | if (fetching_ckpt) |
3853 | 0 | { |
3854 | 0 | ptr = RedoStartLSN; |
3855 | 0 | tli = RedoStartTLI; |
3856 | 0 | } |
3857 | 0 | else |
3858 | 0 | { |
3859 | 0 | ptr = RecPtr; |
3860 | | |
3861 | | /* |
3862 | | * Use the record begin position to determine the |
3863 | | * TLI, rather than the position we're reading. |
3864 | | */ |
3865 | 0 | tli = tliOfPointInHistory(tliRecPtr, expectedTLEs); |
3866 | |
|
3867 | 0 | if (curFileTLI > 0 && tli < curFileTLI) |
3868 | 0 | elog(ERROR, "according to history file, WAL location %X/%08X belongs to timeline %u, but previous recovered WAL file came from timeline %u", |
3869 | 0 | LSN_FORMAT_ARGS(tliRecPtr), |
3870 | 0 | tli, curFileTLI); |
3871 | 0 | } |
3872 | 0 | curFileTLI = tli; |
3873 | 0 | SetInstallXLogFileSegmentActive(); |
3874 | 0 | RequestXLogStreaming(tli, ptr, PrimaryConnInfo, |
3875 | 0 | PrimarySlotName, |
3876 | 0 | wal_receiver_create_temp_slot); |
3877 | 0 | flushedUpto = 0; |
3878 | 0 | } |
3879 | | |
3880 | | /* |
3881 | | * Check if WAL receiver is active or wait to start up. |
3882 | | */ |
3883 | 0 | if (!WalRcvStreaming()) |
3884 | 0 | { |
3885 | 0 | lastSourceFailed = true; |
3886 | 0 | break; |
3887 | 0 | } |
3888 | | |
3889 | | /* |
3890 | | * Walreceiver is active, so see if new data has arrived. |
3891 | | * |
3892 | | * We only advance XLogReceiptTime when we obtain fresh |
3893 | | * WAL from walreceiver and observe that we had already |
3894 | | * processed everything before the most recent "chunk" |
3895 | | * that it flushed to disk. In steady state where we are |
3896 | | * keeping up with the incoming data, XLogReceiptTime will |
3897 | | * be updated on each cycle. When we are behind, |
3898 | | * XLogReceiptTime will not advance, so the grace time |
3899 | | * allotted to conflicting queries will decrease. |
3900 | | */ |
3901 | 0 | if (RecPtr < flushedUpto) |
3902 | 0 | havedata = true; |
3903 | 0 | else |
3904 | 0 | { |
3905 | 0 | XLogRecPtr latestChunkStart; |
3906 | |
|
3907 | 0 | flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI); |
3908 | 0 | if (RecPtr < flushedUpto && receiveTLI == curFileTLI) |
3909 | 0 | { |
3910 | 0 | havedata = true; |
3911 | 0 | if (latestChunkStart <= RecPtr) |
3912 | 0 | { |
3913 | 0 | XLogReceiptTime = GetCurrentTimestamp(); |
3914 | 0 | SetCurrentChunkStartTime(XLogReceiptTime); |
3915 | 0 | } |
3916 | 0 | } |
3917 | 0 | else |
3918 | 0 | havedata = false; |
3919 | 0 | } |
3920 | 0 | if (havedata) |
3921 | 0 | { |
3922 | | /* |
3923 | | * Great, streamed far enough. Open the file if it's |
3924 | | * not open already. Also read the timeline history |
3925 | | * file if we haven't initialized timeline history |
3926 | | * yet; it should be streamed over and present in |
3927 | | * pg_wal by now. Use XLOG_FROM_STREAM so that source |
3928 | | * info is set correctly and XLogReceiptTime isn't |
3929 | | * changed. |
3930 | | * |
3931 | | * NB: We must set readTimeLineHistory based on |
3932 | | * recoveryTargetTLI, not receiveTLI. Normally they'll |
3933 | | * be the same, but if recovery_target_timeline is |
3934 | | * 'latest' and archiving is configured, then it's |
3935 | | * possible that we managed to retrieve one or more |
3936 | | * new timeline history files from the archive, |
3937 | | * updating recoveryTargetTLI. |
3938 | | */ |
3939 | 0 | if (readFile < 0) |
3940 | 0 | { |
3941 | 0 | if (!expectedTLEs) |
3942 | 0 | expectedTLEs = readTimeLineHistory(recoveryTargetTLI); |
3943 | 0 | readFile = XLogFileRead(readSegNo, receiveTLI, |
3944 | 0 | XLOG_FROM_STREAM, false); |
3945 | 0 | Assert(readFile >= 0); |
3946 | 0 | } |
3947 | 0 | else |
3948 | 0 | { |
3949 | | /* just make sure source info is correct... */ |
3950 | 0 | readSource = XLOG_FROM_STREAM; |
3951 | 0 | XLogReceiptSource = XLOG_FROM_STREAM; |
3952 | 0 | return XLREAD_SUCCESS; |
3953 | 0 | } |
3954 | 0 | break; |
3955 | 0 | } |
3956 | | |
3957 | | /* In nonblocking mode, return rather than sleeping. */ |
3958 | 0 | if (nonblocking) |
3959 | 0 | return XLREAD_WOULDBLOCK; |
3960 | | |
3961 | | /* |
3962 | | * Data not here yet. Check for trigger, then wait for |
3963 | | * walreceiver to wake us up when new WAL arrives. |
3964 | | */ |
3965 | 0 | if (CheckForStandbyTrigger()) |
3966 | 0 | { |
3967 | | /* |
3968 | | * Note that we don't return XLREAD_FAIL immediately |
3969 | | * here. After being triggered, we still want to |
3970 | | * replay all the WAL that was already streamed. It's |
3971 | | * in pg_wal now, so we just treat this as a failure, |
3972 | | * and the state machine will move on to replay the |
3973 | | * streamed WAL from pg_wal, and then recheck the |
3974 | | * trigger and exit replay. |
3975 | | */ |
3976 | 0 | lastSourceFailed = true; |
3977 | 0 | break; |
3978 | 0 | } |
3979 | | |
3980 | | /* |
3981 | | * Since we have replayed everything we have received so |
3982 | | * far and are about to start waiting for more WAL, let's |
3983 | | * tell the upstream server our replay location now so |
3984 | | * that pg_stat_replication doesn't show stale |
3985 | | * information. |
3986 | | */ |
3987 | 0 | if (!streaming_reply_sent) |
3988 | 0 | { |
3989 | 0 | WalRcvForceReply(); |
3990 | 0 | streaming_reply_sent = true; |
3991 | 0 | } |
3992 | | |
3993 | | /* Do any background tasks that might benefit us later. */ |
3994 | 0 | KnownAssignedTransactionIdsIdleMaintenance(); |
3995 | | |
3996 | | /* Update pg_stat_recovery_prefetch before sleeping. */ |
3997 | 0 | XLogPrefetcherComputeStats(xlogprefetcher); |
3998 | | |
3999 | | /* |
4000 | | * Wait for more WAL to arrive, when we will be woken |
4001 | | * immediately by the WAL receiver. |
4002 | | */ |
4003 | 0 | (void) WaitLatch(&XLogRecoveryCtl->recoveryWakeupLatch, |
4004 | 0 | WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, |
4005 | 0 | -1L, |
4006 | 0 | WAIT_EVENT_RECOVERY_WAL_STREAM); |
4007 | 0 | ResetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); |
4008 | 0 | break; |
4009 | 0 | } |
4010 | | |
4011 | 0 | default: |
4012 | 0 | elog(ERROR, "unexpected WAL source %d", currentSource); |
4013 | 0 | } |
4014 | | |
4015 | | /* |
4016 | | * Check for recovery pause here so that we can confirm more quickly |
4017 | | * that a requested pause has actually taken effect. |
4018 | | */ |
4019 | 0 | if (((volatile XLogRecoveryCtlData *) XLogRecoveryCtl)->recoveryPauseState != |
4020 | 0 | RECOVERY_NOT_PAUSED) |
4021 | 0 | recoveryPausesHere(false); |
4022 | | |
4023 | | /* |
4024 | | * This possibly-long loop needs to handle interrupts of startup |
4025 | | * process. |
4026 | | */ |
4027 | 0 | ProcessStartupProcInterrupts(); |
4028 | 0 | } |
4029 | | |
4030 | 0 | return XLREAD_FAIL; /* not reached */ |
4031 | 0 | } |
4032 | | |
4033 | | |
4034 | | /* |
4035 | | * Determine what log level should be used to report a corrupt WAL record |
4036 | | * in the current WAL page, previously read by XLogPageRead(). |
4037 | | * |
4038 | | * 'emode' is the error mode that would be used to report a file-not-found |
4039 | | * or legitimate end-of-WAL situation. Generally, we use it as-is, but if |
4040 | | * we're retrying the exact same record that we've tried previously, only |
4041 | | * complain the first time to keep the noise down. However, we only do when |
4042 | | * reading from pg_wal, because we don't expect any invalid records in archive |
4043 | | * or in records streamed from the primary. Files in the archive should be complete, |
4044 | | * and we should never hit the end of WAL because we stop and wait for more WAL |
4045 | | * to arrive before replaying it. |
4046 | | * |
4047 | | * NOTE: This function remembers the RecPtr value it was last called with, |
4048 | | * to suppress repeated messages about the same record. Only call this when |
4049 | | * you are about to ereport(), or you might cause a later message to be |
4050 | | * erroneously suppressed. |
4051 | | */ |
4052 | | static int |
4053 | | emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) |
4054 | 0 | { |
4055 | 0 | static XLogRecPtr lastComplaint = 0; |
4056 | |
|
4057 | 0 | if (readSource == XLOG_FROM_PG_WAL && emode == LOG) |
4058 | 0 | { |
4059 | 0 | if (RecPtr == lastComplaint) |
4060 | 0 | emode = DEBUG1; |
4061 | 0 | else |
4062 | 0 | lastComplaint = RecPtr; |
4063 | 0 | } |
4064 | 0 | return emode; |
4065 | 0 | } |
4066 | | |
4067 | | |
4068 | | /* |
4069 | | * Subroutine to try to fetch and validate a prior checkpoint record. |
4070 | | */ |
4071 | | static XLogRecord * |
4072 | | ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr, |
4073 | | TimeLineID replayTLI) |
4074 | 0 | { |
4075 | 0 | XLogRecord *record; |
4076 | 0 | uint8 info; |
4077 | |
|
4078 | 0 | Assert(xlogreader != NULL); |
4079 | |
|
4080 | 0 | if (!XRecOffIsValid(RecPtr)) |
4081 | 0 | { |
4082 | 0 | ereport(LOG, |
4083 | 0 | (errmsg("invalid checkpoint location"))); |
4084 | 0 | return NULL; |
4085 | 0 | } |
4086 | | |
4087 | 0 | XLogPrefetcherBeginRead(xlogprefetcher, RecPtr); |
4088 | 0 | record = ReadRecord(xlogprefetcher, LOG, true, replayTLI); |
4089 | |
|
4090 | 0 | if (record == NULL) |
4091 | 0 | { |
4092 | 0 | ereport(LOG, |
4093 | 0 | (errmsg("invalid checkpoint record"))); |
4094 | 0 | return NULL; |
4095 | 0 | } |
4096 | 0 | if (record->xl_rmid != RM_XLOG_ID) |
4097 | 0 | { |
4098 | 0 | ereport(LOG, |
4099 | 0 | (errmsg("invalid resource manager ID in checkpoint record"))); |
4100 | 0 | return NULL; |
4101 | 0 | } |
4102 | 0 | info = record->xl_info & ~XLR_INFO_MASK; |
4103 | 0 | if (info != XLOG_CHECKPOINT_SHUTDOWN && |
4104 | 0 | info != XLOG_CHECKPOINT_ONLINE) |
4105 | 0 | { |
4106 | 0 | ereport(LOG, |
4107 | 0 | (errmsg("invalid xl_info in checkpoint record"))); |
4108 | 0 | return NULL; |
4109 | 0 | } |
4110 | 0 | if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint)) |
4111 | 0 | { |
4112 | 0 | ereport(LOG, |
4113 | 0 | (errmsg("invalid length of checkpoint record"))); |
4114 | 0 | return NULL; |
4115 | 0 | } |
4116 | 0 | return record; |
4117 | 0 | } |
4118 | | |
4119 | | /* |
4120 | | * Scan for new timelines that might have appeared in the archive since we |
4121 | | * started recovery. |
4122 | | * |
4123 | | * If there are any, the function changes recovery target TLI to the latest |
4124 | | * one and returns 'true'. |
4125 | | */ |
4126 | | static bool |
4127 | | rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN) |
4128 | 0 | { |
4129 | 0 | List *newExpectedTLEs; |
4130 | 0 | bool found; |
4131 | 0 | ListCell *cell; |
4132 | 0 | TimeLineID newtarget; |
4133 | 0 | TimeLineID oldtarget = recoveryTargetTLI; |
4134 | 0 | TimeLineHistoryEntry *currentTle = NULL; |
4135 | |
|
4136 | 0 | newtarget = findNewestTimeLine(recoveryTargetTLI); |
4137 | 0 | if (newtarget == recoveryTargetTLI) |
4138 | 0 | { |
4139 | | /* No new timelines found */ |
4140 | 0 | return false; |
4141 | 0 | } |
4142 | | |
4143 | | /* |
4144 | | * Determine the list of expected TLIs for the new TLI |
4145 | | */ |
4146 | | |
4147 | 0 | newExpectedTLEs = readTimeLineHistory(newtarget); |
4148 | | |
4149 | | /* |
4150 | | * If the current timeline is not part of the history of the new timeline, |
4151 | | * we cannot proceed to it. |
4152 | | */ |
4153 | 0 | found = false; |
4154 | 0 | foreach(cell, newExpectedTLEs) |
4155 | 0 | { |
4156 | 0 | currentTle = (TimeLineHistoryEntry *) lfirst(cell); |
4157 | |
|
4158 | 0 | if (currentTle->tli == recoveryTargetTLI) |
4159 | 0 | { |
4160 | 0 | found = true; |
4161 | 0 | break; |
4162 | 0 | } |
4163 | 0 | } |
4164 | 0 | if (!found) |
4165 | 0 | { |
4166 | 0 | ereport(LOG, |
4167 | 0 | (errmsg("new timeline %u is not a child of database system timeline %u", |
4168 | 0 | newtarget, |
4169 | 0 | replayTLI))); |
4170 | 0 | return false; |
4171 | 0 | } |
4172 | | |
4173 | | /* |
4174 | | * The current timeline was found in the history file, but check that the |
4175 | | * next timeline was forked off from it *after* the current recovery |
4176 | | * location. |
4177 | | */ |
4178 | 0 | if (currentTle->end < replayLSN) |
4179 | 0 | { |
4180 | 0 | ereport(LOG, |
4181 | 0 | errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%08X", |
4182 | 0 | newtarget, |
4183 | 0 | replayTLI, |
4184 | 0 | LSN_FORMAT_ARGS(replayLSN))); |
4185 | 0 | return false; |
4186 | 0 | } |
4187 | | |
4188 | | /* The new timeline history seems valid. Switch target */ |
4189 | 0 | recoveryTargetTLI = newtarget; |
4190 | 0 | list_free_deep(expectedTLEs); |
4191 | 0 | expectedTLEs = newExpectedTLEs; |
4192 | | |
4193 | | /* |
4194 | | * As in StartupXLOG(), try to ensure we have all the history files |
4195 | | * between the old target and new target in pg_wal. |
4196 | | */ |
4197 | 0 | restoreTimeLineHistoryFiles(oldtarget + 1, newtarget); |
4198 | |
|
4199 | 0 | ereport(LOG, |
4200 | 0 | (errmsg("new target timeline is %u", |
4201 | 0 | recoveryTargetTLI))); |
4202 | | |
4203 | 0 | return true; |
4204 | 0 | } |
4205 | | |
4206 | | |
4207 | | /* |
4208 | | * Open a logfile segment for reading (during recovery). |
4209 | | * |
4210 | | * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive. |
4211 | | * Otherwise, it's assumed to be already available in pg_wal. |
4212 | | */ |
4213 | | static int |
4214 | | XLogFileRead(XLogSegNo segno, TimeLineID tli, |
4215 | | XLogSource source, bool notfoundOk) |
4216 | 0 | { |
4217 | 0 | char xlogfname[MAXFNAMELEN]; |
4218 | 0 | char activitymsg[MAXFNAMELEN + 16]; |
4219 | 0 | char path[MAXPGPATH]; |
4220 | 0 | int fd; |
4221 | |
|
4222 | 0 | XLogFileName(xlogfname, tli, segno, wal_segment_size); |
4223 | |
|
4224 | 0 | switch (source) |
4225 | 0 | { |
4226 | 0 | case XLOG_FROM_ARCHIVE: |
4227 | | /* Report recovery progress in PS display */ |
4228 | 0 | snprintf(activitymsg, sizeof(activitymsg), "waiting for %s", |
4229 | 0 | xlogfname); |
4230 | 0 | set_ps_display(activitymsg); |
4231 | |
|
4232 | 0 | if (!RestoreArchivedFile(path, xlogfname, |
4233 | 0 | "RECOVERYXLOG", |
4234 | 0 | wal_segment_size, |
4235 | 0 | InRedo)) |
4236 | 0 | return -1; |
4237 | 0 | break; |
4238 | | |
4239 | 0 | case XLOG_FROM_PG_WAL: |
4240 | 0 | case XLOG_FROM_STREAM: |
4241 | 0 | XLogFilePath(path, tli, segno, wal_segment_size); |
4242 | 0 | break; |
4243 | | |
4244 | 0 | default: |
4245 | 0 | elog(ERROR, "invalid XLogFileRead source %d", source); |
4246 | 0 | } |
4247 | | |
4248 | | /* |
4249 | | * If the segment was fetched from archival storage, replace the existing |
4250 | | * xlog segment (if any) with the archival version. |
4251 | | */ |
4252 | 0 | if (source == XLOG_FROM_ARCHIVE) |
4253 | 0 | { |
4254 | 0 | Assert(!IsInstallXLogFileSegmentActive()); |
4255 | 0 | KeepFileRestoredFromArchive(path, xlogfname); |
4256 | | |
4257 | | /* |
4258 | | * Set path to point at the new file in pg_wal. |
4259 | | */ |
4260 | 0 | snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname); |
4261 | 0 | } |
4262 | |
|
4263 | 0 | fd = BasicOpenFile(path, O_RDONLY | PG_BINARY); |
4264 | 0 | if (fd >= 0) |
4265 | 0 | { |
4266 | | /* Success! */ |
4267 | 0 | curFileTLI = tli; |
4268 | | |
4269 | | /* Report recovery progress in PS display */ |
4270 | 0 | snprintf(activitymsg, sizeof(activitymsg), "recovering %s", |
4271 | 0 | xlogfname); |
4272 | 0 | set_ps_display(activitymsg); |
4273 | | |
4274 | | /* Track source of data in assorted state variables */ |
4275 | 0 | readSource = source; |
4276 | 0 | XLogReceiptSource = source; |
4277 | | /* In FROM_STREAM case, caller tracks receipt time, not me */ |
4278 | 0 | if (source != XLOG_FROM_STREAM) |
4279 | 0 | XLogReceiptTime = GetCurrentTimestamp(); |
4280 | |
|
4281 | 0 | return fd; |
4282 | 0 | } |
4283 | 0 | if (errno != ENOENT || !notfoundOk) /* unexpected failure? */ |
4284 | 0 | ereport(PANIC, |
4285 | 0 | (errcode_for_file_access(), |
4286 | 0 | errmsg("could not open file \"%s\": %m", path))); |
4287 | 0 | return -1; |
4288 | 0 | } |
4289 | | |
4290 | | /* |
4291 | | * Open a logfile segment for reading (during recovery). |
4292 | | * |
4293 | | * This version searches for the segment with any TLI listed in expectedTLEs. |
4294 | | */ |
4295 | | static int |
4296 | | XLogFileReadAnyTLI(XLogSegNo segno, XLogSource source) |
4297 | 0 | { |
4298 | 0 | char path[MAXPGPATH]; |
4299 | 0 | ListCell *cell; |
4300 | 0 | int fd; |
4301 | 0 | List *tles; |
4302 | | |
4303 | | /* |
4304 | | * Loop looking for a suitable timeline ID: we might need to read any of |
4305 | | * the timelines listed in expectedTLEs. |
4306 | | * |
4307 | | * We expect curFileTLI on entry to be the TLI of the preceding file in |
4308 | | * sequence, or 0 if there was no predecessor. We do not allow curFileTLI |
4309 | | * to go backwards; this prevents us from picking up the wrong file when a |
4310 | | * parent timeline extends to higher segment numbers than the child we |
4311 | | * want to read. |
4312 | | * |
4313 | | * If we haven't read the timeline history file yet, read it now, so that |
4314 | | * we know which TLIs to scan. We don't save the list in expectedTLEs, |
4315 | | * however, unless we actually find a valid segment. That way if there is |
4316 | | * neither a timeline history file nor a WAL segment in the archive, and |
4317 | | * streaming replication is set up, we'll read the timeline history file |
4318 | | * streamed from the primary when we start streaming, instead of |
4319 | | * recovering with a dummy history generated here. |
4320 | | */ |
4321 | 0 | if (expectedTLEs) |
4322 | 0 | tles = expectedTLEs; |
4323 | 0 | else |
4324 | 0 | tles = readTimeLineHistory(recoveryTargetTLI); |
4325 | |
|
4326 | 0 | foreach(cell, tles) |
4327 | 0 | { |
4328 | 0 | TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell); |
4329 | 0 | TimeLineID tli = hent->tli; |
4330 | |
|
4331 | 0 | if (tli < curFileTLI) |
4332 | 0 | break; /* don't bother looking at too-old TLIs */ |
4333 | | |
4334 | | /* |
4335 | | * Skip scanning the timeline ID that the logfile segment to read |
4336 | | * doesn't belong to |
4337 | | */ |
4338 | 0 | if (hent->begin != InvalidXLogRecPtr) |
4339 | 0 | { |
4340 | 0 | XLogSegNo beginseg = 0; |
4341 | |
|
4342 | 0 | XLByteToSeg(hent->begin, beginseg, wal_segment_size); |
4343 | | |
4344 | | /* |
4345 | | * The logfile segment that doesn't belong to the timeline is |
4346 | | * older or newer than the segment that the timeline started or |
4347 | | * ended at, respectively. It's sufficient to check only the |
4348 | | * starting segment of the timeline here. Since the timelines are |
4349 | | * scanned in descending order in this loop, any segments newer |
4350 | | * than the ending segment should belong to newer timeline and |
4351 | | * have already been read before. So it's not necessary to check |
4352 | | * the ending segment of the timeline here. |
4353 | | */ |
4354 | 0 | if (segno < beginseg) |
4355 | 0 | continue; |
4356 | 0 | } |
4357 | | |
4358 | 0 | if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE) |
4359 | 0 | { |
4360 | 0 | fd = XLogFileRead(segno, tli, XLOG_FROM_ARCHIVE, true); |
4361 | 0 | if (fd != -1) |
4362 | 0 | { |
4363 | 0 | elog(DEBUG1, "got WAL segment from archive"); |
4364 | 0 | if (!expectedTLEs) |
4365 | 0 | expectedTLEs = tles; |
4366 | 0 | return fd; |
4367 | 0 | } |
4368 | 0 | } |
4369 | | |
4370 | 0 | if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL) |
4371 | 0 | { |
4372 | 0 | fd = XLogFileRead(segno, tli, XLOG_FROM_PG_WAL, true); |
4373 | 0 | if (fd != -1) |
4374 | 0 | { |
4375 | 0 | if (!expectedTLEs) |
4376 | 0 | expectedTLEs = tles; |
4377 | 0 | return fd; |
4378 | 0 | } |
4379 | 0 | } |
4380 | 0 | } |
4381 | | |
4382 | | /* Couldn't find it. For simplicity, complain about front timeline */ |
4383 | 0 | XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size); |
4384 | 0 | errno = ENOENT; |
4385 | 0 | ereport(DEBUG2, |
4386 | 0 | (errcode_for_file_access(), |
4387 | 0 | errmsg("could not open file \"%s\": %m", path))); |
4388 | 0 | return -1; |
4389 | 0 | } |
4390 | | |
4391 | | /* |
4392 | | * Set flag to signal the walreceiver to restart. (The startup process calls |
4393 | | * this on noticing a relevant configuration change.) |
4394 | | */ |
4395 | | void |
4396 | | StartupRequestWalReceiverRestart(void) |
4397 | 0 | { |
4398 | 0 | if (currentSource == XLOG_FROM_STREAM && WalRcvRunning()) |
4399 | 0 | { |
4400 | 0 | ereport(LOG, |
4401 | 0 | (errmsg("WAL receiver process shutdown requested"))); |
4402 | | |
4403 | 0 | pendingWalRcvRestart = true; |
4404 | 0 | } |
4405 | 0 | } |
4406 | | |
4407 | | |
4408 | | /* |
4409 | | * Has a standby promotion already been triggered? |
4410 | | * |
4411 | | * Unlike CheckForStandbyTrigger(), this works in any process |
4412 | | * that's connected to shared memory. |
4413 | | */ |
4414 | | bool |
4415 | | PromoteIsTriggered(void) |
4416 | 0 | { |
4417 | | /* |
4418 | | * We check shared state each time only until a standby promotion is |
4419 | | * triggered. We can't trigger a promotion again, so there's no need to |
4420 | | * keep checking after the shared variable has once been seen true. |
4421 | | */ |
4422 | 0 | if (LocalPromoteIsTriggered) |
4423 | 0 | return true; |
4424 | | |
4425 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
4426 | 0 | LocalPromoteIsTriggered = XLogRecoveryCtl->SharedPromoteIsTriggered; |
4427 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
4428 | |
|
4429 | 0 | return LocalPromoteIsTriggered; |
4430 | 0 | } |
4431 | | |
4432 | | static void |
4433 | | SetPromoteIsTriggered(void) |
4434 | 0 | { |
4435 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
4436 | 0 | XLogRecoveryCtl->SharedPromoteIsTriggered = true; |
4437 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
4438 | | |
4439 | | /* |
4440 | | * Mark the recovery pause state as 'not paused' because the paused state |
4441 | | * ends and promotion continues if a promotion is triggered while recovery |
4442 | | * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly |
4443 | | * return 'paused' while a promotion is ongoing. |
4444 | | */ |
4445 | 0 | SetRecoveryPause(false); |
4446 | |
|
4447 | 0 | LocalPromoteIsTriggered = true; |
4448 | 0 | } |
4449 | | |
4450 | | /* |
4451 | | * Check whether a promote request has arrived. |
4452 | | */ |
4453 | | static bool |
4454 | | CheckForStandbyTrigger(void) |
4455 | 0 | { |
4456 | 0 | if (LocalPromoteIsTriggered) |
4457 | 0 | return true; |
4458 | | |
4459 | 0 | if (IsPromoteSignaled() && CheckPromoteSignal()) |
4460 | 0 | { |
4461 | 0 | ereport(LOG, (errmsg("received promote request"))); |
4462 | 0 | RemovePromoteSignalFiles(); |
4463 | 0 | ResetPromoteSignaled(); |
4464 | 0 | SetPromoteIsTriggered(); |
4465 | 0 | return true; |
4466 | 0 | } |
4467 | | |
4468 | 0 | return false; |
4469 | 0 | } |
4470 | | |
4471 | | /* |
4472 | | * Remove the files signaling a standby promotion request. |
4473 | | */ |
4474 | | void |
4475 | | RemovePromoteSignalFiles(void) |
4476 | 0 | { |
4477 | 0 | unlink(PROMOTE_SIGNAL_FILE); |
4478 | 0 | } |
4479 | | |
4480 | | /* |
4481 | | * Check to see if a promote request has arrived. |
4482 | | */ |
4483 | | bool |
4484 | | CheckPromoteSignal(void) |
4485 | 0 | { |
4486 | 0 | struct stat stat_buf; |
4487 | |
|
4488 | 0 | if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0) |
4489 | 0 | return true; |
4490 | | |
4491 | 0 | return false; |
4492 | 0 | } |
4493 | | |
4494 | | /* |
4495 | | * Wake up startup process to replay newly arrived WAL, or to notice that |
4496 | | * failover has been requested. |
4497 | | */ |
4498 | | void |
4499 | | WakeupRecovery(void) |
4500 | 0 | { |
4501 | 0 | SetLatch(&XLogRecoveryCtl->recoveryWakeupLatch); |
4502 | 0 | } |
4503 | | |
4504 | | /* |
4505 | | * Schedule a walreceiver wakeup in the main recovery loop. |
4506 | | */ |
4507 | | void |
4508 | | XLogRequestWalReceiverReply(void) |
4509 | 0 | { |
4510 | 0 | doRequestWalReceiverReply = true; |
4511 | 0 | } |
4512 | | |
4513 | | /* |
4514 | | * Is HotStandby active yet? This is only important in special backends |
4515 | | * since normal backends won't ever be able to connect until this returns |
4516 | | * true. Postmaster knows this by way of signal, not via shared memory. |
4517 | | * |
4518 | | * Unlike testing standbyState, this works in any process that's connected to |
4519 | | * shared memory. (And note that standbyState alone doesn't tell the truth |
4520 | | * anyway.) |
4521 | | */ |
4522 | | bool |
4523 | | HotStandbyActive(void) |
4524 | 0 | { |
4525 | | /* |
4526 | | * We check shared state each time only until Hot Standby is active. We |
4527 | | * can't de-activate Hot Standby, so there's no need to keep checking |
4528 | | * after the shared variable has once been seen true. |
4529 | | */ |
4530 | 0 | if (LocalHotStandbyActive) |
4531 | 0 | return true; |
4532 | 0 | else |
4533 | 0 | { |
4534 | | /* spinlock is essential on machines with weak memory ordering! */ |
4535 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
4536 | 0 | LocalHotStandbyActive = XLogRecoveryCtl->SharedHotStandbyActive; |
4537 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
4538 | |
|
4539 | 0 | return LocalHotStandbyActive; |
4540 | 0 | } |
4541 | 0 | } |
4542 | | |
4543 | | /* |
4544 | | * Like HotStandbyActive(), but to be used only in WAL replay code, |
4545 | | * where we don't need to ask any other process what the state is. |
4546 | | */ |
4547 | | static bool |
4548 | | HotStandbyActiveInReplay(void) |
4549 | 0 | { |
4550 | 0 | Assert(AmStartupProcess() || !IsPostmasterEnvironment); |
4551 | 0 | return LocalHotStandbyActive; |
4552 | 0 | } |
4553 | | |
4554 | | /* |
4555 | | * Get latest redo apply position. |
4556 | | * |
4557 | | * Exported to allow WALReceiver to read the pointer directly. |
4558 | | */ |
4559 | | XLogRecPtr |
4560 | | GetXLogReplayRecPtr(TimeLineID *replayTLI) |
4561 | 0 | { |
4562 | 0 | XLogRecPtr recptr; |
4563 | 0 | TimeLineID tli; |
4564 | |
|
4565 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
4566 | 0 | recptr = XLogRecoveryCtl->lastReplayedEndRecPtr; |
4567 | 0 | tli = XLogRecoveryCtl->lastReplayedTLI; |
4568 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
4569 | |
|
4570 | 0 | if (replayTLI) |
4571 | 0 | *replayTLI = tli; |
4572 | 0 | return recptr; |
4573 | 0 | } |
4574 | | |
4575 | | |
4576 | | /* |
4577 | | * Get position of last applied, or the record being applied. |
4578 | | * |
4579 | | * This is different from GetXLogReplayRecPtr() in that if a WAL |
4580 | | * record is currently being applied, this includes that record. |
4581 | | */ |
4582 | | XLogRecPtr |
4583 | | GetCurrentReplayRecPtr(TimeLineID *replayEndTLI) |
4584 | 0 | { |
4585 | 0 | XLogRecPtr recptr; |
4586 | 0 | TimeLineID tli; |
4587 | |
|
4588 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
4589 | 0 | recptr = XLogRecoveryCtl->replayEndRecPtr; |
4590 | 0 | tli = XLogRecoveryCtl->replayEndTLI; |
4591 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
4592 | |
|
4593 | 0 | if (replayEndTLI) |
4594 | 0 | *replayEndTLI = tli; |
4595 | 0 | return recptr; |
4596 | 0 | } |
4597 | | |
4598 | | /* |
4599 | | * Save timestamp of latest processed commit/abort record. |
4600 | | * |
4601 | | * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be |
4602 | | * seen by processes other than the startup process. Note in particular |
4603 | | * that CreateRestartPoint is executed in the checkpointer. |
4604 | | */ |
4605 | | static void |
4606 | | SetLatestXTime(TimestampTz xtime) |
4607 | 0 | { |
4608 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
4609 | 0 | XLogRecoveryCtl->recoveryLastXTime = xtime; |
4610 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
4611 | 0 | } |
4612 | | |
4613 | | /* |
4614 | | * Fetch timestamp of latest processed commit/abort record. |
4615 | | */ |
4616 | | TimestampTz |
4617 | | GetLatestXTime(void) |
4618 | 0 | { |
4619 | 0 | TimestampTz xtime; |
4620 | |
|
4621 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
4622 | 0 | xtime = XLogRecoveryCtl->recoveryLastXTime; |
4623 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
4624 | |
|
4625 | 0 | return xtime; |
4626 | 0 | } |
4627 | | |
4628 | | /* |
4629 | | * Save timestamp of the next chunk of WAL records to apply. |
4630 | | * |
4631 | | * We keep this in XLogRecoveryCtl, not a simple static variable, so that it can be |
4632 | | * seen by all backends. |
4633 | | */ |
4634 | | static void |
4635 | | SetCurrentChunkStartTime(TimestampTz xtime) |
4636 | 0 | { |
4637 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
4638 | 0 | XLogRecoveryCtl->currentChunkStartTime = xtime; |
4639 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
4640 | 0 | } |
4641 | | |
4642 | | /* |
4643 | | * Fetch timestamp of latest processed commit/abort record. |
4644 | | * Startup process maintains an accurate local copy in XLogReceiptTime |
4645 | | */ |
4646 | | TimestampTz |
4647 | | GetCurrentChunkReplayStartTime(void) |
4648 | 0 | { |
4649 | 0 | TimestampTz xtime; |
4650 | |
|
4651 | 0 | SpinLockAcquire(&XLogRecoveryCtl->info_lck); |
4652 | 0 | xtime = XLogRecoveryCtl->currentChunkStartTime; |
4653 | 0 | SpinLockRelease(&XLogRecoveryCtl->info_lck); |
4654 | |
|
4655 | 0 | return xtime; |
4656 | 0 | } |
4657 | | |
4658 | | /* |
4659 | | * Returns time of receipt of current chunk of XLOG data, as well as |
4660 | | * whether it was received from streaming replication or from archives. |
4661 | | */ |
4662 | | void |
4663 | | GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream) |
4664 | 0 | { |
4665 | | /* |
4666 | | * This must be executed in the startup process, since we don't export the |
4667 | | * relevant state to shared memory. |
4668 | | */ |
4669 | 0 | Assert(InRecovery); |
4670 | |
|
4671 | 0 | *rtime = XLogReceiptTime; |
4672 | 0 | *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM); |
4673 | 0 | } |
4674 | | |
4675 | | /* |
4676 | | * Note that text field supplied is a parameter name and does not require |
4677 | | * translation |
4678 | | */ |
4679 | | void |
4680 | | RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue) |
4681 | 0 | { |
4682 | 0 | if (currValue < minValue) |
4683 | 0 | { |
4684 | 0 | if (HotStandbyActiveInReplay()) |
4685 | 0 | { |
4686 | 0 | bool warned_for_promote = false; |
4687 | |
|
4688 | 0 | ereport(WARNING, |
4689 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
4690 | 0 | errmsg("hot standby is not possible because of insufficient parameter settings"), |
4691 | 0 | errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", |
4692 | 0 | param_name, |
4693 | 0 | currValue, |
4694 | 0 | minValue))); |
4695 | | |
4696 | 0 | SetRecoveryPause(true); |
4697 | |
|
4698 | 0 | ereport(LOG, |
4699 | 0 | (errmsg("recovery has paused"), |
4700 | 0 | errdetail("If recovery is unpaused, the server will shut down."), |
4701 | 0 | errhint("You can then restart the server after making the necessary configuration changes."))); |
4702 | | |
4703 | 0 | while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED) |
4704 | 0 | { |
4705 | 0 | ProcessStartupProcInterrupts(); |
4706 | |
|
4707 | 0 | if (CheckForStandbyTrigger()) |
4708 | 0 | { |
4709 | 0 | if (!warned_for_promote) |
4710 | 0 | ereport(WARNING, |
4711 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
4712 | 0 | errmsg("promotion is not possible because of insufficient parameter settings"), |
4713 | | |
4714 | | /* |
4715 | | * Repeat the detail from above so it's easy to find |
4716 | | * in the log. |
4717 | | */ |
4718 | 0 | errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", |
4719 | 0 | param_name, |
4720 | 0 | currValue, |
4721 | 0 | minValue), |
4722 | 0 | errhint("Restart the server after making the necessary configuration changes."))); |
4723 | 0 | warned_for_promote = true; |
4724 | 0 | } |
4725 | | |
4726 | | /* |
4727 | | * If recovery pause is requested then set it paused. While |
4728 | | * we are in the loop, user might resume and pause again so |
4729 | | * set this every time. |
4730 | | */ |
4731 | 0 | ConfirmRecoveryPaused(); |
4732 | | |
4733 | | /* |
4734 | | * We wait on a condition variable that will wake us as soon |
4735 | | * as the pause ends, but we use a timeout so we can check the |
4736 | | * above conditions periodically too. |
4737 | | */ |
4738 | 0 | ConditionVariableTimedSleep(&XLogRecoveryCtl->recoveryNotPausedCV, 1000, |
4739 | 0 | WAIT_EVENT_RECOVERY_PAUSE); |
4740 | 0 | } |
4741 | 0 | ConditionVariableCancelSleep(); |
4742 | 0 | } |
4743 | | |
4744 | 0 | ereport(FATAL, |
4745 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
4746 | 0 | errmsg("recovery aborted because of insufficient parameter settings"), |
4747 | | /* Repeat the detail from above so it's easy to find in the log. */ |
4748 | 0 | errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.", |
4749 | 0 | param_name, |
4750 | 0 | currValue, |
4751 | 0 | minValue), |
4752 | 0 | errhint("You can restart the server after making the necessary configuration changes."))); |
4753 | 0 | } |
4754 | 0 | } |
4755 | | |
4756 | | |
4757 | | /* |
4758 | | * GUC check_hook for primary_slot_name |
4759 | | */ |
4760 | | bool |
4761 | | check_primary_slot_name(char **newval, void **extra, GucSource source) |
4762 | 2 | { |
4763 | 2 | if (*newval && strcmp(*newval, "") != 0 && |
4764 | 2 | !ReplicationSlotValidateName(*newval, false, WARNING)) |
4765 | 0 | return false; |
4766 | | |
4767 | 2 | return true; |
4768 | 2 | } |
4769 | | |
4770 | | /* |
4771 | | * Recovery target settings: Only one of the several recovery_target* settings |
4772 | | * may be set. Setting a second one results in an error. The global variable |
4773 | | * recoveryTarget tracks which kind of recovery target was chosen. Other |
4774 | | * variables store the actual target value (for example a string or a xid). |
4775 | | * The assign functions of the parameters check whether a competing parameter |
4776 | | * was already set. But we want to allow setting the same parameter multiple |
4777 | | * times. We also want to allow unsetting a parameter and setting a different |
4778 | | * one, so we unset recoveryTarget when the parameter is set to an empty |
4779 | | * string. |
4780 | | * |
4781 | | * XXX this code is broken by design. Throwing an error from a GUC assign |
4782 | | * hook breaks fundamental assumptions of guc.c. So long as all the variables |
4783 | | * for which this can happen are PGC_POSTMASTER, the consequences are limited, |
4784 | | * since we'd just abort postmaster startup anyway. Nonetheless it's likely |
4785 | | * that we have odd behaviors such as unexpected GUC ordering dependencies. |
4786 | | */ |
4787 | | |
4788 | | pg_noreturn static void |
4789 | | error_multiple_recovery_targets(void) |
4790 | 0 | { |
4791 | 0 | ereport(ERROR, |
4792 | 0 | (errcode(ERRCODE_INVALID_PARAMETER_VALUE), |
4793 | 0 | errmsg("multiple recovery targets specified"), |
4794 | 0 | errdetail("At most one of \"recovery_target\", \"recovery_target_lsn\", \"recovery_target_name\", \"recovery_target_time\", \"recovery_target_xid\" may be set."))); |
4795 | 0 | } |
4796 | | |
4797 | | /* |
4798 | | * GUC check_hook for recovery_target |
4799 | | */ |
4800 | | bool |
4801 | | check_recovery_target(char **newval, void **extra, GucSource source) |
4802 | 2 | { |
4803 | 2 | if (strcmp(*newval, "immediate") != 0 && strcmp(*newval, "") != 0) |
4804 | 0 | { |
4805 | 0 | GUC_check_errdetail("The only allowed value is \"immediate\"."); |
4806 | 0 | return false; |
4807 | 0 | } |
4808 | 2 | return true; |
4809 | 2 | } |
4810 | | |
4811 | | /* |
4812 | | * GUC assign_hook for recovery_target |
4813 | | */ |
4814 | | void |
4815 | | assign_recovery_target(const char *newval, void *extra) |
4816 | 2 | { |
4817 | 2 | if (recoveryTarget != RECOVERY_TARGET_UNSET && |
4818 | 2 | recoveryTarget != RECOVERY_TARGET_IMMEDIATE) |
4819 | 0 | error_multiple_recovery_targets(); |
4820 | | |
4821 | 2 | if (newval && strcmp(newval, "") != 0) |
4822 | 0 | recoveryTarget = RECOVERY_TARGET_IMMEDIATE; |
4823 | 2 | else |
4824 | 2 | recoveryTarget = RECOVERY_TARGET_UNSET; |
4825 | 2 | } |
4826 | | |
4827 | | /* |
4828 | | * GUC check_hook for recovery_target_lsn |
4829 | | */ |
4830 | | bool |
4831 | | check_recovery_target_lsn(char **newval, void **extra, GucSource source) |
4832 | 2 | { |
4833 | 2 | if (strcmp(*newval, "") != 0) |
4834 | 0 | { |
4835 | 0 | XLogRecPtr lsn; |
4836 | 0 | XLogRecPtr *myextra; |
4837 | 0 | bool have_error = false; |
4838 | |
|
4839 | 0 | lsn = pg_lsn_in_internal(*newval, &have_error); |
4840 | 0 | if (have_error) |
4841 | 0 | return false; |
4842 | | |
4843 | 0 | myextra = (XLogRecPtr *) guc_malloc(LOG, sizeof(XLogRecPtr)); |
4844 | 0 | if (!myextra) |
4845 | 0 | return false; |
4846 | 0 | *myextra = lsn; |
4847 | 0 | *extra = myextra; |
4848 | 0 | } |
4849 | 2 | return true; |
4850 | 2 | } |
4851 | | |
4852 | | /* |
4853 | | * GUC assign_hook for recovery_target_lsn |
4854 | | */ |
4855 | | void |
4856 | | assign_recovery_target_lsn(const char *newval, void *extra) |
4857 | 2 | { |
4858 | 2 | if (recoveryTarget != RECOVERY_TARGET_UNSET && |
4859 | 2 | recoveryTarget != RECOVERY_TARGET_LSN) |
4860 | 0 | error_multiple_recovery_targets(); |
4861 | | |
4862 | 2 | if (newval && strcmp(newval, "") != 0) |
4863 | 0 | { |
4864 | 0 | recoveryTarget = RECOVERY_TARGET_LSN; |
4865 | 0 | recoveryTargetLSN = *((XLogRecPtr *) extra); |
4866 | 0 | } |
4867 | 2 | else |
4868 | 2 | recoveryTarget = RECOVERY_TARGET_UNSET; |
4869 | 2 | } |
4870 | | |
4871 | | /* |
4872 | | * GUC check_hook for recovery_target_name |
4873 | | */ |
4874 | | bool |
4875 | | check_recovery_target_name(char **newval, void **extra, GucSource source) |
4876 | 2 | { |
4877 | | /* Use the value of newval directly */ |
4878 | 2 | if (strlen(*newval) >= MAXFNAMELEN) |
4879 | 0 | { |
4880 | 0 | GUC_check_errdetail("\"%s\" is too long (maximum %d characters).", |
4881 | 0 | "recovery_target_name", MAXFNAMELEN - 1); |
4882 | 0 | return false; |
4883 | 0 | } |
4884 | 2 | return true; |
4885 | 2 | } |
4886 | | |
4887 | | /* |
4888 | | * GUC assign_hook for recovery_target_name |
4889 | | */ |
4890 | | void |
4891 | | assign_recovery_target_name(const char *newval, void *extra) |
4892 | 2 | { |
4893 | 2 | if (recoveryTarget != RECOVERY_TARGET_UNSET && |
4894 | 2 | recoveryTarget != RECOVERY_TARGET_NAME) |
4895 | 0 | error_multiple_recovery_targets(); |
4896 | | |
4897 | 2 | if (newval && strcmp(newval, "") != 0) |
4898 | 0 | { |
4899 | 0 | recoveryTarget = RECOVERY_TARGET_NAME; |
4900 | 0 | recoveryTargetName = newval; |
4901 | 0 | } |
4902 | 2 | else |
4903 | 2 | recoveryTarget = RECOVERY_TARGET_UNSET; |
4904 | 2 | } |
4905 | | |
4906 | | /* |
4907 | | * GUC check_hook for recovery_target_time |
4908 | | * |
4909 | | * The interpretation of the recovery_target_time string can depend on the |
4910 | | * time zone setting, so we need to wait until after all GUC processing is |
4911 | | * done before we can do the final parsing of the string. This check function |
4912 | | * only does a parsing pass to catch syntax errors, but we store the string |
4913 | | * and parse it again when we need to use it. |
4914 | | */ |
4915 | | bool |
4916 | | check_recovery_target_time(char **newval, void **extra, GucSource source) |
4917 | 2 | { |
4918 | 2 | if (strcmp(*newval, "") != 0) |
4919 | 0 | { |
4920 | | /* reject some special values */ |
4921 | 0 | if (strcmp(*newval, "now") == 0 || |
4922 | 0 | strcmp(*newval, "today") == 0 || |
4923 | 0 | strcmp(*newval, "tomorrow") == 0 || |
4924 | 0 | strcmp(*newval, "yesterday") == 0) |
4925 | 0 | { |
4926 | 0 | return false; |
4927 | 0 | } |
4928 | | |
4929 | | /* |
4930 | | * parse timestamp value (see also timestamptz_in()) |
4931 | | */ |
4932 | 0 | { |
4933 | 0 | char *str = *newval; |
4934 | 0 | fsec_t fsec; |
4935 | 0 | struct pg_tm tt, |
4936 | 0 | *tm = &tt; |
4937 | 0 | int tz; |
4938 | 0 | int dtype; |
4939 | 0 | int nf; |
4940 | 0 | int dterr; |
4941 | 0 | char *field[MAXDATEFIELDS]; |
4942 | 0 | int ftype[MAXDATEFIELDS]; |
4943 | 0 | char workbuf[MAXDATELEN + MAXDATEFIELDS]; |
4944 | 0 | DateTimeErrorExtra dtextra; |
4945 | 0 | TimestampTz timestamp; |
4946 | |
|
4947 | 0 | dterr = ParseDateTime(str, workbuf, sizeof(workbuf), |
4948 | 0 | field, ftype, MAXDATEFIELDS, &nf); |
4949 | 0 | if (dterr == 0) |
4950 | 0 | dterr = DecodeDateTime(field, ftype, nf, |
4951 | 0 | &dtype, tm, &fsec, &tz, &dtextra); |
4952 | 0 | if (dterr != 0) |
4953 | 0 | return false; |
4954 | 0 | if (dtype != DTK_DATE) |
4955 | 0 | return false; |
4956 | | |
4957 | 0 | if (tm2timestamp(tm, fsec, &tz, ×tamp) != 0) |
4958 | 0 | { |
4959 | 0 | GUC_check_errdetail("Timestamp out of range: \"%s\".", str); |
4960 | 0 | return false; |
4961 | 0 | } |
4962 | 0 | } |
4963 | 0 | } |
4964 | 2 | return true; |
4965 | 2 | } |
4966 | | |
4967 | | /* |
4968 | | * GUC assign_hook for recovery_target_time |
4969 | | */ |
4970 | | void |
4971 | | assign_recovery_target_time(const char *newval, void *extra) |
4972 | 2 | { |
4973 | 2 | if (recoveryTarget != RECOVERY_TARGET_UNSET && |
4974 | 2 | recoveryTarget != RECOVERY_TARGET_TIME) |
4975 | 0 | error_multiple_recovery_targets(); |
4976 | | |
4977 | 2 | if (newval && strcmp(newval, "") != 0) |
4978 | 0 | recoveryTarget = RECOVERY_TARGET_TIME; |
4979 | 2 | else |
4980 | 2 | recoveryTarget = RECOVERY_TARGET_UNSET; |
4981 | 2 | } |
4982 | | |
4983 | | /* |
4984 | | * GUC check_hook for recovery_target_timeline |
4985 | | */ |
4986 | | bool |
4987 | | check_recovery_target_timeline(char **newval, void **extra, GucSource source) |
4988 | 2 | { |
4989 | 2 | RecoveryTargetTimeLineGoal rttg; |
4990 | 2 | RecoveryTargetTimeLineGoal *myextra; |
4991 | | |
4992 | 2 | if (strcmp(*newval, "current") == 0) |
4993 | 0 | rttg = RECOVERY_TARGET_TIMELINE_CONTROLFILE; |
4994 | 2 | else if (strcmp(*newval, "latest") == 0) |
4995 | 2 | rttg = RECOVERY_TARGET_TIMELINE_LATEST; |
4996 | 0 | else |
4997 | 0 | { |
4998 | 0 | char *endp; |
4999 | 0 | uint64 timeline; |
5000 | |
|
5001 | 0 | rttg = RECOVERY_TARGET_TIMELINE_NUMERIC; |
5002 | |
|
5003 | 0 | errno = 0; |
5004 | 0 | timeline = strtou64(*newval, &endp, 0); |
5005 | |
|
5006 | 0 | if (*endp != '\0' || errno == EINVAL || errno == ERANGE) |
5007 | 0 | { |
5008 | 0 | GUC_check_errdetail("\"%s\" is not a valid number.", |
5009 | 0 | "recovery_target_timeline"); |
5010 | 0 | return false; |
5011 | 0 | } |
5012 | | |
5013 | 0 | if (timeline < 1 || timeline > PG_UINT32_MAX) |
5014 | 0 | { |
5015 | 0 | GUC_check_errdetail("\"%s\" must be between %u and %u.", |
5016 | 0 | "recovery_target_timeline", 1, UINT_MAX); |
5017 | 0 | return false; |
5018 | 0 | } |
5019 | 0 | } |
5020 | | |
5021 | 2 | myextra = (RecoveryTargetTimeLineGoal *) guc_malloc(LOG, sizeof(RecoveryTargetTimeLineGoal)); |
5022 | 2 | if (!myextra) |
5023 | 0 | return false; |
5024 | 2 | *myextra = rttg; |
5025 | 2 | *extra = myextra; |
5026 | | |
5027 | 2 | return true; |
5028 | 2 | } |
5029 | | |
5030 | | /* |
5031 | | * GUC assign_hook for recovery_target_timeline |
5032 | | */ |
5033 | | void |
5034 | | assign_recovery_target_timeline(const char *newval, void *extra) |
5035 | 2 | { |
5036 | 2 | recoveryTargetTimeLineGoal = *((RecoveryTargetTimeLineGoal *) extra); |
5037 | 2 | if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC) |
5038 | 0 | recoveryTargetTLIRequested = (TimeLineID) strtoul(newval, NULL, 0); |
5039 | 2 | else |
5040 | 2 | recoveryTargetTLIRequested = 0; |
5041 | 2 | } |
5042 | | |
5043 | | /* |
5044 | | * GUC check_hook for recovery_target_xid |
5045 | | */ |
5046 | | bool |
5047 | | check_recovery_target_xid(char **newval, void **extra, GucSource source) |
5048 | 2 | { |
5049 | 2 | if (strcmp(*newval, "") != 0) |
5050 | 0 | { |
5051 | 0 | TransactionId xid; |
5052 | 0 | TransactionId *myextra; |
5053 | |
|
5054 | 0 | errno = 0; |
5055 | 0 | xid = (TransactionId) strtou64(*newval, NULL, 0); |
5056 | 0 | if (errno == EINVAL || errno == ERANGE) |
5057 | 0 | return false; |
5058 | | |
5059 | 0 | myextra = (TransactionId *) guc_malloc(LOG, sizeof(TransactionId)); |
5060 | 0 | if (!myextra) |
5061 | 0 | return false; |
5062 | 0 | *myextra = xid; |
5063 | 0 | *extra = myextra; |
5064 | 0 | } |
5065 | 2 | return true; |
5066 | 2 | } |
5067 | | |
5068 | | /* |
5069 | | * GUC assign_hook for recovery_target_xid |
5070 | | */ |
5071 | | void |
5072 | | assign_recovery_target_xid(const char *newval, void *extra) |
5073 | 2 | { |
5074 | 2 | if (recoveryTarget != RECOVERY_TARGET_UNSET && |
5075 | 2 | recoveryTarget != RECOVERY_TARGET_XID) |
5076 | 0 | error_multiple_recovery_targets(); |
5077 | | |
5078 | 2 | if (newval && strcmp(newval, "") != 0) |
5079 | 0 | { |
5080 | 0 | recoveryTarget = RECOVERY_TARGET_XID; |
5081 | 0 | recoveryTargetXid = *((TransactionId *) extra); |
5082 | 0 | } |
5083 | 2 | else |
5084 | 2 | recoveryTarget = RECOVERY_TARGET_UNSET; |
5085 | 2 | } |