Coverage Report

Created: 2025-06-13 06:06

/src/postgres/src/backend/access/transam/xlog.c
(Line and execution-count columns omitted below; every executable line in this listing has a recorded count of 0.)
/*-------------------------------------------------------------------------
 *
 * xlog.c
 *    PostgreSQL write-ahead log manager
 *
 * The Write-Ahead Log (WAL) functionality is split into several source
 * files, in addition to this one:
 *
 * xloginsert.c - Functions for constructing WAL records
 * xlogrecovery.c - WAL recovery and standby code
 * xlogreader.c - Facility for reading WAL files and parsing WAL records
 * xlogutils.c - Helper functions for WAL redo routines
 *
 * This file contains functions for coordinating database startup and
 * checkpointing, and managing the write-ahead log buffers when the
 * system is running.
 *
 * StartupXLOG() is the main entry point of the startup process.  It
 * coordinates database startup, performing WAL recovery, and the
 * transition from WAL recovery into normal operations.
 *
 * XLogInsertRecord() inserts a WAL record into the WAL buffers.  Most
 * callers should not call this directly, but use the functions in
 * xloginsert.c to construct the WAL record.  XLogFlush() can be used
 * to force the WAL to disk.
 *
 * In addition to those, there are many other functions for interrogating
 * the current system state, and for starting/stopping backups.
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/backend/access/transam/xlog.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <ctype.h>
#include <math.h>
#include <time.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>

#include "access/clog.h"
#include "access/commit_ts.h"
#include "access/heaptoast.h"
#include "access/multixact.h"
#include "access/rewriteheap.h"
#include "access/subtrans.h"
#include "access/timeline.h"
#include "access/transam.h"
#include "access/twophase.h"
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "access/xlogarchive.h"
#include "access/xloginsert.h"
#include "access/xlogreader.h"
#include "access/xlogrecovery.h"
#include "access/xlogutils.h"
#include "backup/basebackup.h"
#include "catalog/catversion.h"
#include "catalog/pg_control.h"
#include "catalog/pg_database.h"
#include "common/controldata_utils.h"
#include "common/file_utils.h"
#include "executor/instrument.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "port/atomics.h"
#include "postmaster/bgwriter.h"
#include "postmaster/startup.h"
#include "postmaster/walsummarizer.h"
#include "postmaster/walwriter.h"
#include "replication/origin.h"
#include "replication/slot.h"
#include "replication/snapbuild.h"
#include "replication/walreceiver.h"
#include "replication/walsender.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/large_object.h"
#include "storage/latch.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/reinit.h"
#include "storage/spin.h"
#include "storage/sync.h"
#include "utils/guc_hooks.h"
#include "utils/guc_tables.h"
#include "utils/injection_point.h"
#include "utils/ps_status.h"
#include "utils/relmapper.h"
#include "utils/snapmgr.h"
#include "utils/timeout.h"
#include "utils/timestamp.h"
#include "utils/varlena.h"

#ifdef WAL_DEBUG
#include "utils/memutils.h"
#endif

/* timeline ID to be used when bootstrapping */
#define BootstrapTimeLineID   1

/* User-settable parameters */
int     max_wal_size_mb = 1024; /* 1 GB */
int     min_wal_size_mb = 80; /* 80 MB */
int     wal_keep_size_mb = 0;
int     XLOGbuffers = -1;
int     XLogArchiveTimeout = 0;
int     XLogArchiveMode = ARCHIVE_MODE_OFF;
char     *XLogArchiveCommand = NULL;
bool    EnableHotStandby = false;
bool    fullPageWrites = true;
bool    wal_log_hints = false;
int     wal_compression = WAL_COMPRESSION_NONE;
char     *wal_consistency_checking_string = NULL;
bool     *wal_consistency_checking = NULL;
bool    wal_init_zero = true;
bool    wal_recycle = true;
bool    log_checkpoints = true;
int     wal_sync_method = DEFAULT_WAL_SYNC_METHOD;
int     wal_level = WAL_LEVEL_REPLICA;
int     CommitDelay = 0;  /* precommit delay in microseconds */
int     CommitSiblings = 5; /* # concurrent xacts needed to sleep */
int     wal_retrieve_retry_interval = 5000;
int     max_slot_wal_keep_size_mb = -1;
int     wal_decode_buffer_size = 512 * 1024;
bool    track_wal_io_timing = false;

#ifdef WAL_DEBUG
bool    XLOG_DEBUG = false;
#endif

int     wal_segment_size = DEFAULT_XLOG_SEG_SIZE;

/*
 * Number of WAL insertion locks to use. A higher value allows more insertions
 * to happen concurrently, but adds some CPU overhead to flushing the WAL,
 * which needs to iterate all the locks.
 */
#define NUM_XLOGINSERT_LOCKS  8

/*
 * Max distance from last checkpoint, before triggering a new xlog-based
 * checkpoint.
 */
int     CheckPointSegments;

/* Estimated distance between checkpoints, in bytes */
static double CheckPointDistanceEstimate = 0;
static double PrevCheckPointDistance = 0;

/*
 * Track whether there were any deferred checks for custom resource managers
 * specified in wal_consistency_checking.
 */
static bool check_wal_consistency_checking_deferred = false;

/*
 * GUC support
 */
const struct config_enum_entry wal_sync_method_options[] = {
  {"fsync", WAL_SYNC_METHOD_FSYNC, false},
#ifdef HAVE_FSYNC_WRITETHROUGH
  {"fsync_writethrough", WAL_SYNC_METHOD_FSYNC_WRITETHROUGH, false},
#endif
  {"fdatasync", WAL_SYNC_METHOD_FDATASYNC, false},
#ifdef O_SYNC
  {"open_sync", WAL_SYNC_METHOD_OPEN, false},
#endif
#ifdef O_DSYNC
  {"open_datasync", WAL_SYNC_METHOD_OPEN_DSYNC, false},
#endif
  {NULL, 0, false}
};


/*
 * Although only "on", "off", and "always" are documented,
 * we accept all the likely variants of "on" and "off".
 */
const struct config_enum_entry archive_mode_options[] = {
  {"always", ARCHIVE_MODE_ALWAYS, false},
  {"on", ARCHIVE_MODE_ON, false},
  {"off", ARCHIVE_MODE_OFF, false},
  {"true", ARCHIVE_MODE_ON, true},
  {"false", ARCHIVE_MODE_OFF, true},
  {"yes", ARCHIVE_MODE_ON, true},
  {"no", ARCHIVE_MODE_OFF, true},
  {"1", ARCHIVE_MODE_ON, true},
  {"0", ARCHIVE_MODE_OFF, true},
  {NULL, 0, false}
};

/*
 * Statistics for current checkpoint are collected in this global struct.
 * Because only the checkpointer or a stand-alone backend can perform
 * checkpoints, this will be unused in normal backends.
 */
CheckpointStatsData CheckpointStats;

/*
 * During recovery, lastFullPageWrites keeps track of full_page_writes that
 * the replayed WAL records indicate. It's initialized with full_page_writes
 * that the recovery starting checkpoint record indicates, and then updated
 * each time an XLOG_FPW_CHANGE record is replayed.
 */
static bool lastFullPageWrites;

/*
 * Local copy of the state tracked by SharedRecoveryState in shared memory.
 * It is false if SharedRecoveryState is RECOVERY_STATE_DONE.  True actually
 * means "not known, need to check the shared state".
 */
static bool LocalRecoveryInProgress = true;

/*
 * Local state for XLogInsertAllowed():
 *    1: unconditionally allowed to insert XLOG
 *    0: unconditionally not allowed to insert XLOG
 *    -1: must check RecoveryInProgress(); disallow until it is false
 * Most processes start with -1 and transition to 1 after seeing that recovery
 * is not in progress.  But we can also force the value for special cases.
 * The coding in XLogInsertAllowed() depends on the first two of these states
 * being numerically the same as bool true and false.
 */
static int  LocalXLogInsertAllowed = -1;

/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 * end+1 of the last record, and is reset when we end a top-level transaction,
 * or start a new one; so it can be used to tell if the current transaction has
 * created any XLOG records.
 *
 * While in parallel mode, this may not be fully up to date.  When committing,
 * a transaction can assume this covers all xlog records written either by the
 * user backend or by any parallel worker which was present at any point during
 * the transaction.  But when aborting, or when still in parallel mode, other
 * parallel backends may have written WAL records at later LSNs than the value
 * stored here.  The parallel leader advances its own copy, when necessary,
 * in WaitForParallelWorkersToFinish.
 */
XLogRecPtr  ProcLastRecPtr = InvalidXLogRecPtr;
XLogRecPtr  XactLastRecEnd = InvalidXLogRecPtr;
XLogRecPtr  XactLastCommitEnd = InvalidXLogRecPtr;

/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
 * CHECKPOINT record).  We update this from the shared-memory copy,
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 * hold an insertion lock).  See XLogInsertRecord for details.  We are also
 * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 * see GetRedoRecPtr.
 *
 * NB: Code that uses this variable must be prepared not only for the
 * possibility that it may be arbitrarily out of date, but also for the
 * possibility that it might be set to InvalidXLogRecPtr. We used to
 * initialize it as a side effect of the first call to RecoveryInProgress(),
 * which meant that most code that might use it could assume that it had a
 * real if perhaps stale value. That's no longer the case.
 */
static XLogRecPtr RedoRecPtr;

/*
 * doPageWrites is this backend's local copy of (fullPageWrites ||
 * runningBackups > 0).  It is used together with RedoRecPtr to decide whether
 * a full-page image of a page needs to be taken.
 *
 * NB: Initially this is false, and there's no guarantee that it will be
 * initialized to any other value before it is first used. Any code that
 * makes use of it must recheck the value after obtaining a WALInsertLock,
 * and respond appropriately if it turns out that the previous value wasn't
 * accurate.
 */
static bool doPageWrites;

/*----------
 * Shared-memory data structures for XLOG control
 *
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
 * the log up to (all records before that point must be written or fsynced).
 * The positions already written/fsynced are maintained in logWriteResult
 * and logFlushResult using atomic access.
 * In addition to the shared variable, each backend has a private copy of
 * both in LogwrtResult, which is updated when convenient.
 *
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 * (protected by info_lck), but we don't need to cache any copies of it.
 *
 * info_lck is only held long enough to read/update the protected variables,
 * so it's a plain spinlock.  The other locks are held longer (potentially
 * over I/O operations), so we use LWLocks for them.  These locks are:
 *
 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 * XLogFlush).
 *
 * ControlFileLock: must be held to read/update control file or create
 * new log file.
 *
 *----------
 */

typedef struct XLogwrtRqst
{
  XLogRecPtr  Write;      /* last byte + 1 to write out */
  XLogRecPtr  Flush;      /* last byte + 1 to flush */
} XLogwrtRqst;

typedef struct XLogwrtResult
{
  XLogRecPtr  Write;      /* last byte + 1 written out */
  XLogRecPtr  Flush;      /* last byte + 1 flushed */
} XLogwrtResult;

/*
 * Inserting to WAL is protected by a small fixed number of WAL insertion
 * locks. To insert to the WAL, you must hold one of the locks - it doesn't
 * matter which one. To lock out other concurrent insertions, you must hold
 * all of them. Each WAL insertion lock consists of a lightweight lock, plus
 * an indicator of how far the insertion has progressed (insertingAt).
 *
 * The insertingAt values are read when a process wants to flush WAL from
 * the in-memory buffers to disk, to check that all the insertions to the
 * region the process is about to write out have finished. You could simply
 * wait for all currently in-progress insertions to finish, but the
 * insertingAt indicator allows you to ignore insertions to later in the WAL,
 * so that you only wait for the insertions that are modifying the buffers
 * you're about to write out.
 *
 * This isn't just an optimization. If all the WAL buffers are dirty, an
 * inserter that's holding a WAL insert lock might need to evict an old WAL
 * buffer, which requires flushing the WAL. If it's possible for an inserter
 * to block on another inserter unnecessarily, deadlock can arise when two
 * inserters holding a WAL insert lock wait for each other to finish their
 * insertion.
 *
 * Small WAL records that don't cross a page boundary never update the value;
 * the WAL record is just copied to the page and the lock is released. But
 * to avoid the deadlock scenario explained above, the indicator is always
 * updated before sleeping while holding an insertion lock.
 *
 * lastImportantAt contains the LSN of the last important WAL record inserted
 * using a given lock. This value is used to detect if there has been
 * important WAL activity since the last time some action, like a checkpoint,
 * was performed - allowing the action to be skipped if there has not.  The
 * LSN is updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag
 * was set. lastImportantAt is never cleared, only overwritten by the LSN of
 * newer records.  Tracking the WAL activity directly in WALInsertLock has
 * the advantage of not needing any additional locks to update the value.
 */
typedef struct
{
  LWLock    lock;
  pg_atomic_uint64 insertingAt;
  XLogRecPtr  lastImportantAt;
} WALInsertLock;

/*
 * All the WAL insertion locks are allocated as an array in shared memory. We
 * force the array stride to be a power of 2, which saves a few cycles in
 * indexing, but more importantly also ensures that individual slots don't
 * cross cache line boundaries. (Of course, we have to also ensure that the
 * array start address is suitably aligned.)
 */
typedef union WALInsertLockPadded
{
  WALInsertLock l;
  char    pad[PG_CACHE_LINE_SIZE];
} WALInsertLockPadded;
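
/*
 * Illustrative sizing note (an addition for this report, not part of xlog.c;
 * assumes PG_CACHE_LINE_SIZE = 128, the default in pg_config_manual.h):
 * padding makes each WALInsertLockPadded slot occupy exactly 128 bytes, so
 * provided the array base is 128-byte aligned, slot i and slot i + 1 never
 * share a cache line, and contention on one insertion lock cannot falsely
 * invalidate its neighbors' cache lines.
 */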

/*
 * Session status of running backup, used for sanity checks in SQL-callable
 * functions to start and stop backups.
 */
static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;

/*
 * Shared state data for WAL insertion.
 */
typedef struct XLogCtlInsert
{
  slock_t   insertpos_lck;  /* protects CurrBytePos and PrevBytePos */

  /*
   * CurrBytePos is the end of reserved WAL. The next record will be
   * inserted at that position. PrevBytePos is the start position of the
   * previously inserted (or rather, reserved) record - it is copied to the
   * prev-link of the next record. These are stored as "usable byte
   * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
   */
  uint64    CurrBytePos;
  uint64    PrevBytePos;

  /*
   * Make sure the above heavily-contended spinlock and byte positions are
   * on their own cache line. In particular, the RedoRecPtr and full page
   * write variables below should be on a different cache line. They are
   * read on every WAL insertion, but updated rarely, and we don't want
   * those reads to steal the cache line containing Curr/PrevBytePos.
   */
  char    pad[PG_CACHE_LINE_SIZE];

  /*
   * fullPageWrites is the authoritative value used by all backends to
   * determine whether to write a full-page image to WAL. This shared value,
   * instead of the process-local fullPageWrites, is required because, when
   * full_page_writes is changed by SIGHUP, we must WAL-log it before it
   * actually affects WAL-logging by backends.  The checkpointer sets it at
   * startup or after SIGHUP.
   *
   * To read these fields, you must hold an insertion lock. To modify them,
   * you must hold ALL the locks.
   */
  XLogRecPtr  RedoRecPtr;   /* current redo point for insertions */
  bool    fullPageWrites;

  /*
   * runningBackups is a counter indicating the number of backups currently
   * in progress. lastBackupStart is the latest checkpoint redo location
   * used as a starting point for an online backup.
   */
  int     runningBackups;
  XLogRecPtr  lastBackupStart;

  /*
   * WAL insertion locks.
   */
  WALInsertLockPadded *WALInsertLocks;
} XLogCtlInsert;

/*
 * Total shared-memory state for XLOG.
 */
typedef struct XLogCtlData
{
  XLogCtlInsert Insert;

  /* Protected by info_lck: */
  XLogwrtRqst LogwrtRqst;
  XLogRecPtr  RedoRecPtr;   /* a recent copy of Insert->RedoRecPtr */
  FullTransactionId ckptFullXid;  /* nextXid of latest checkpoint */
  XLogRecPtr  asyncXactLSN; /* LSN of newest async commit/abort */
  XLogRecPtr  replicationSlotMinLSN;  /* oldest LSN needed by any slot */

  XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */

  /* Fake LSN counter, for unlogged relations. */
  pg_atomic_uint64 unloggedLSN;

  /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
  pg_time_t lastSegSwitchTime;
  XLogRecPtr  lastSegSwitchLSN;

  /* These are accessed using atomics -- info_lck not needed */
  pg_atomic_uint64 logInsertResult; /* last byte + 1 inserted to buffers */
  pg_atomic_uint64 logWriteResult;  /* last byte + 1 written out */
  pg_atomic_uint64 logFlushResult;  /* last byte + 1 flushed */

  /*
   * First initialized page in the cache (first byte position).
   */
  XLogRecPtr  InitializedFrom;

  /*
   * Latest page reserved for initialization in the cache (last byte
   * position + 1).
   *
   * To change the identity of a buffer, you need to advance
   * InitializeReserved first.  To change the identity of a buffer that's
   * still dirty, the old page needs to be written out first, and for that
   * you need WALWriteLock, and you need to ensure that there are no
   * in-progress insertions to the page by calling
   * WaitXLogInsertionsToFinish().
   */
  pg_atomic_uint64 InitializeReserved;

  /*
   * Latest initialized page in the cache (last byte position + 1).
   *
   * InitializedUpTo is updated after the buffer initialization.  After the
   * update, waiters are notified using InitializedUpToCondVar.
   */
  pg_atomic_uint64 InitializedUpTo;
  ConditionVariable InitializedUpToCondVar;

  /*
   * These values do not change after startup, although the pointed-to pages
   * and xlblocks values certainly do.  xlblocks values are changed
   * lock-free according to the check for the xlog write position and are
   * accompanied by changes of InitializeReserved and InitializedUpTo.
   */
  char     *pages;      /* buffers for unwritten XLOG pages */
  pg_atomic_uint64 *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
  int     XLogCacheBlck;  /* highest allocated xlog buffer index */

  /*
   * InsertTimeLineID is the timeline into which new WAL is being inserted
   * and flushed. It is zero during recovery, and does not change once set.
   *
   * If we created a new timeline when the system was started up,
   * PrevTimeLineID is the old timeline's ID that we forked off from.
   * Otherwise it's equal to InsertTimeLineID.
   *
   * We set these fields while holding info_lck. Most code that reads these
   * values knows that recovery is no longer in progress and so can safely
   * read the value without a lock, but code that could be run either during
   * or after recovery can take info_lck while reading these values.
   */
  TimeLineID  InsertTimeLineID;
  TimeLineID  PrevTimeLineID;

  /*
   * SharedRecoveryState indicates if we're still in crash or archive
   * recovery.  Protected by info_lck.
   */
  RecoveryState SharedRecoveryState;

  /*
   * InstallXLogFileSegmentActive indicates whether the checkpointer should
   * arrange for future segments by recycling and/or PreallocXlogFiles().
   * Protected by ControlFileLock.  Only the startup process changes it.  If
   * true, anyone can use InstallXLogFileSegment().  If false, the startup
   * process owns the exclusive right to install segments, by reading from
   * the archive and possibly replacing existing files.
   */
  bool    InstallXLogFileSegmentActive;

  /*
   * WalWriterSleeping indicates whether the WAL writer is currently in
   * low-power mode (and hence should be nudged if an async commit occurs).
   * Protected by info_lck.
   */
  bool    WalWriterSleeping;

  /*
   * During recovery, we keep a copy of the latest checkpoint record here.
   * lastCheckPointRecPtr points to start of checkpoint record and
   * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by the
   * checkpointer when it wants to create a restartpoint.
   *
   * Protected by info_lck.
   */
  XLogRecPtr  lastCheckPointRecPtr;
  XLogRecPtr  lastCheckPointEndPtr;
  CheckPoint  lastCheckPoint;

  /*
   * lastFpwDisableRecPtr points to the start of the last replayed
   * XLOG_FPW_CHANGE record that indicates that full_page_writes is
   * disabled.
   */
  XLogRecPtr  lastFpwDisableRecPtr;

  slock_t   info_lck;   /* locks shared variables shown above */
} XLogCtlData;

/*
 * Classification of XLogInsertRecord operations.
 */
typedef enum
{
  WALINSERT_NORMAL,
  WALINSERT_SPECIAL_SWITCH,
  WALINSERT_SPECIAL_CHECKPOINT
} WalInsertClass;

static XLogCtlData *XLogCtl = NULL;

/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
static WALInsertLockPadded *WALInsertLocks = NULL;

/*
 * We maintain an image of pg_control in shared memory.
 */
static ControlFileData *ControlFile = NULL;

/*
 * Calculate the amount of space left on the page after 'endptr'. Beware
 * multiple evaluation!
 */
#define INSERT_FREESPACE(endptr)  \
  (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))

/* Macro to advance to next buffer index. */
#define NextBufIdx(idx)   \
    (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))

/*
 * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 * would hold if it was in cache, the page containing 'recptr'.
 */
#define XLogRecPtrToBufIdx(recptr)  \
  (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))

/*
 * The number of bytes in a WAL page usable for WAL data.
 */
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)

/*
 * Convert values of GUCs measured in megabytes to equiv. segment count.
 * Rounds down.
 */
#define ConvertToXSegs(x, segsize)  XLogMBVarToSegs((x), (segsize))

/* The number of bytes in a WAL segment usable for WAL data. */
static int  UsableBytesInSegment;
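
/*
 * Worked example for the macros above (an addition for this report, not part
 * of xlog.c; assumes the default XLOG_BLCKSZ = 8192 and a 24-byte short page
 * header, typical of 64-bit builds):
 *
 *   INSERT_FREESPACE(0x2010) = 8192 - 16 = 8176 bytes left on the page
 *   INSERT_FREESPACE(0x4000) = 0 (the position is exactly a page boundary)
 *   UsableBytesInPage        = 8192 - 24 = 8168
 *
 * With 64 WAL buffers (XLogCacheBlck = 63), XLogRecPtrToBufIdx maps the page
 * starting at 0x2000 to index (0x2000 / 8192) % 64 = 1; the mapping wraps
 * modulo the buffer count, so the page at 0x80000 maps to (64 % 64) = 0.
 */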

/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
static XLogwrtResult LogwrtResult = {0, 0};

/*
 * Update local copy of shared XLogCtl->log{Write,Flush}Result
 *
 * It's critical that Flush always trails Write, so the order of the reads is
 * important, as is the barrier.  See also XLogWrite.
 */
#define RefreshXLogWriteResult(_target) \
  do { \
    _target.Flush = pg_atomic_read_u64(&XLogCtl->logFlushResult); \
    pg_read_barrier(); \
    _target.Write = pg_atomic_read_u64(&XLogCtl->logWriteResult); \
  } while (0)
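
/*
 * Illustrative interleaving (an addition for this report, not part of
 * xlog.c) showing why the read order above matters.  A writer advances
 * logWriteResult before logFlushResult, so Flush never leads Write in
 * shared memory.  Suppose both start at 100 and a concurrent writer moves
 * them to 200:
 *
 *   reader: Flush = read logFlushResult  -> 100
 *   writer: logWriteResult = 200
 *   writer: logFlushResult = 200
 *   reader: Write = read logWriteResult  -> 200
 *
 * The result (Write = 200, Flush = 100) still satisfies Flush <= Write.
 * Reading Write first could instead yield Write = 100, Flush = 200, a state
 * that never existed.  The pg_read_barrier() keeps the compiler and CPU
 * from reordering the two loads.
 */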

/*
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * openLogSegNo identifies the segment, and openLogTLI the corresponding TLI.
 * These variables are only used to write the XLOG, and so will normally refer
 * to the active segment.
 *
 * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
 */
static int  openLogFile = -1;
static XLogSegNo openLogSegNo = 0;
static TimeLineID openLogTLI = 0;

/*
 * Local copies of equivalent fields in the control file.  When running
 * crash recovery, LocalMinRecoveryPoint is set to InvalidXLogRecPtr as we
 * expect to replay all the WAL available, and updateMinRecoveryPoint is
 * switched to false to prevent any updates while replaying records.
 * Those values are kept consistent as long as crash recovery runs.
 */
static XLogRecPtr LocalMinRecoveryPoint;
static TimeLineID LocalMinRecoveryPointTLI;
static bool updateMinRecoveryPoint = true;

/* For WALInsertLockAcquire/Release functions */
static int  MyLockNo = 0;
static bool holdingAllLocks = false;

#ifdef WAL_DEBUG
static MemoryContext walDebugCxt = NULL;
#endif

static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI,
                    XLogRecPtr EndOfLog,
                    TimeLineID newTLI);
static void CheckRequiredParameterValues(void);
static void XLogReportParameters(void);
static int  LocalSetXLogInsertAllowed(void);
static void CreateEndOfRecoveryRecord(void);
static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn,
                          XLogRecPtr pagePtr,
                          TimeLineID newTLI);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);

static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli,
                  bool opportunistic);
static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible);
static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
                   bool find_free, XLogSegNo max_segno,
                   TimeLineID tli);
static void XLogFileClose(void);
static void PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli);
static void RemoveTempXlogFiles(void);
static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr,
                 XLogRecPtr endptr, TimeLineID insertTLI);
static void RemoveXlogFile(const struct dirent *segment_de,
               XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
               TimeLineID insertTLI);
static void UpdateLastRemovedPtr(char *filename);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
static bool PerformRecoveryXLogAction(void);
static void InitControlFile(uint64 sysidentifier, uint32 data_checksum_version);
static void WriteControlFile(void);
static void ReadControlFile(void);
static void UpdateControlFile(void);
static char *str_time(pg_time_t tnow);

static int  get_sync_bit(int method);

static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
                XLogRecData *rdata,
                XLogRecPtr StartPos, XLogRecPtr EndPos,
                TimeLineID tli);
static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
                    XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
                XLogRecPtr *PrevPtr);
static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli);
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);

static void WALInsertLockAcquire(void);
static void WALInsertLockAcquireExclusive(void);
static void WALInsertLockRelease(void);
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);

/*
 * Insert an XLOG record represented by an already-constructed chain of data
 * chunks.  This is a low-level routine; to construct the WAL record header
 * and data, use the higher-level routines in xloginsert.c.
 *
 * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
 * WAL record applies to, that were not included in the record as full page
 * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
 * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
 * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
 * record is always inserted.
 *
 * 'flags' gives more in-depth control on the record being inserted. See
 * XLogSetRecordFlags() for details.
 *
 * 'topxid_included' tells whether the top-transaction id is logged along with
 * current subtransaction. See XLogRecordAssemble().
 *
 * The first XLogRecData in the chain must be for the record header, and its
 * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
 * xl_crc fields in the header; the rest of the header must already be filled
 * in by the caller.
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 */
XLogRecPtr
XLogInsertRecord(XLogRecData *rdata,
         XLogRecPtr fpw_lsn,
         uint8 flags,
         int num_fpi,
         bool topxid_included)
{
  XLogCtlInsert *Insert = &XLogCtl->Insert;
  pg_crc32c rdata_crc;
  bool    inserted;
  XLogRecord *rechdr = (XLogRecord *) rdata->data;
  uint8   info = rechdr->xl_info & ~XLR_INFO_MASK;
  WalInsertClass class = WALINSERT_NORMAL;
  XLogRecPtr  StartPos;
  XLogRecPtr  EndPos;
  bool    prevDoPageWrites = doPageWrites;
  TimeLineID  insertTLI;

  /* Does this record type require special handling? */
  if (unlikely(rechdr->xl_rmid == RM_XLOG_ID))
  {
    if (info == XLOG_SWITCH)
      class = WALINSERT_SPECIAL_SWITCH;
    else if (info == XLOG_CHECKPOINT_REDO)
      class = WALINSERT_SPECIAL_CHECKPOINT;
  }

  /* we assume that all of the record header is in the first chunk */
  Assert(rdata->len >= SizeOfXLogRecord);

  /* cross-check on whether we should be here or not */
  if (!XLogInsertAllowed())
    elog(ERROR, "cannot make new WAL entries during recovery");

  /*
   * Given that we're not in recovery, InsertTimeLineID is set and can't
   * change, so we can read it without a lock.
   */
  insertTLI = XLogCtl->InsertTimeLineID;

  /*----------
   *
   * We have now done all the preparatory work we can without holding a
   * lock or modifying shared state. From here on, inserting the new WAL
   * record to the shared WAL buffer cache is a two-step process:
   *
   * 1. Reserve the right amount of space from the WAL. The current head of
   *    reserved space is kept in Insert->CurrBytePos, and is protected by
   *    insertpos_lck.
   *
   * 2. Copy the record to the reserved WAL space. This involves finding the
   *    correct WAL buffer containing the reserved space, and copying the
   *    record in place. This can be done concurrently in multiple processes.
   *
   * To keep track of which insertions are still in-progress, each concurrent
   * inserter acquires an insertion lock. In addition to just indicating that
   * an insertion is in progress, the lock tells others how far the inserter
   * has progressed. There is a small fixed number of insertion locks,
   * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
   * boundary, it updates the value stored in the lock to how far it has
   * inserted, to allow the previous buffer to be flushed.
   *
   * Holding onto an insertion lock also protects RedoRecPtr and
   * fullPageWrites from changing until the insertion is finished.
   *
   * Step 2 can usually be done completely in parallel. If the required WAL
   * page is not initialized yet, you have to go through AdvanceXLInsertBuffer,
   * which will ensure it is initialized. But the WAL writer tries to do that
   * ahead of insertions, to keep that work out of the critical path.
   *
   *----------
   */
  START_CRIT_SECTION();

  if (likely(class == WALINSERT_NORMAL))
  {
    WALInsertLockAcquire();

    /*
     * Check to see if my copy of RedoRecPtr is out of date. If so, we may
     * have to go back and have the caller recompute everything. This can
     * only happen just after a checkpoint, so it's better to be slow in
     * this case and fast otherwise.
     *
     * Also check to see if fullPageWrites was just turned on or there's a
     * running backup (which forces full-page writes); if we weren't
     * already doing full-page writes then go back and recompute.
     *
     * If we aren't doing full-page writes then RedoRecPtr doesn't
     * actually affect the contents of the XLOG record, so we'll update
     * our local copy but not force a recomputation.  (If doPageWrites was
     * just turned off, we could recompute the record without full pages,
     * but we choose not to bother.)
     */
    if (RedoRecPtr != Insert->RedoRecPtr)
    {
      Assert(RedoRecPtr < Insert->RedoRecPtr);
      RedoRecPtr = Insert->RedoRecPtr;
    }
    doPageWrites = (Insert->fullPageWrites || Insert->runningBackups > 0);

    if (doPageWrites &&
      (!prevDoPageWrites ||
       (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
    {
      /*
       * Oops, some buffer now needs to be backed up that the caller
       * didn't back up.  Start over.
       */
      WALInsertLockRelease();
      END_CRIT_SECTION();
      return InvalidXLogRecPtr;
    }

    /*
     * Reserve space for the record in the WAL. This also sets the xl_prev
     * pointer.
     */
    ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
                  &rechdr->xl_prev);

    /* Normal records are always inserted. */
    inserted = true;
  }
  else if (class == WALINSERT_SPECIAL_SWITCH)
  {
    /*
     * In order to insert an XLOG_SWITCH record, we need to hold all of
     * the WAL insertion locks, not just one, so that no one else can
     * begin inserting a record until we've figured out how much space
     * remains in the current WAL segment and claimed all of it.
     *
     * Nonetheless, this case is simpler than the normal cases handled
     * below, which must check for changes in doPageWrites and RedoRecPtr.
     * Those checks are only needed for records that can contain buffer
     * references, and an XLOG_SWITCH record never does.
     */
    Assert(fpw_lsn == InvalidXLogRecPtr);
    WALInsertLockAcquireExclusive();
    inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
  }
  else
  {
    Assert(class == WALINSERT_SPECIAL_CHECKPOINT);

    /*
     * We need to update both the local and shared copies of RedoRecPtr,
     * which means that we need to hold all the WAL insertion locks.
     * However, there can't be any buffer references, so as above, we need
     * not check RedoRecPtr before inserting the record; we just need to
     * update it afterwards.
     */
    Assert(fpw_lsn == InvalidXLogRecPtr);
    WALInsertLockAcquireExclusive();
    ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
                  &rechdr->xl_prev);
    RedoRecPtr = Insert->RedoRecPtr = StartPos;
    inserted = true;
  }

  if (inserted)
  {
    /*
     * Now that xl_prev has been filled in, calculate CRC of the record
     * header.
     */
    rdata_crc = rechdr->xl_crc;
    COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
    FIN_CRC32C(rdata_crc);
    rechdr->xl_crc = rdata_crc;

    /*
     * All the record data, including the header, is now ready to be
     * inserted. Copy the record in the space reserved.
     */
    CopyXLogRecordToWAL(rechdr->xl_tot_len,
              class == WALINSERT_SPECIAL_SWITCH, rdata,
              StartPos, EndPos, insertTLI);

    /*
     * Unless record is flagged as not important, update LSN of last
     * important record in the current slot. When holding all locks, just
     * update the first one.
     */
    if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
    {
      int     lockno = holdingAllLocks ? 0 : MyLockNo;

      WALInsertLocks[lockno].l.lastImportantAt = StartPos;
    }
  }
  else
  {
    /*
     * This was an xlog-switch record, but the current insert location was
     * already exactly at the beginning of a segment, so there was no need
     * to do anything.
     */
  }

  /*
   * Done! Let others know that we're finished.
   */
  WALInsertLockRelease();

  END_CRIT_SECTION();

  MarkCurrentTransactionIdLoggedIfAny();

  /*
   * Mark the top transaction id as logged (if needed) so that we do not try
   * to log it again with the next WAL record in the current subtransaction.
   */
  if (topxid_included)
    MarkSubxactTopXidLogged();

  /*
   * Update shared LogwrtRqst.Write, if we crossed page boundary.
   */
  if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
  {
    SpinLockAcquire(&XLogCtl->info_lck);
    /* advance global request to include new block(s) */
    if (XLogCtl->LogwrtRqst.Write < EndPos)
      XLogCtl->LogwrtRqst.Write = EndPos;
    SpinLockRelease(&XLogCtl->info_lck);
    RefreshXLogWriteResult(LogwrtResult);
  }

  /*
   * If this was an XLOG_SWITCH record, flush the record and the empty
   * padding space that fills the rest of the segment, and perform
   * end-of-segment actions (eg, notifying archiver).
   */
  if (class == WALINSERT_SPECIAL_SWITCH)
  {
    TRACE_POSTGRESQL_WAL_SWITCH();
    XLogFlush(EndPos);

    /*
     * Even though we reserved the rest of the segment for us, which is
     * reflected in EndPos, we return a pointer to just the end of the
     * xlog-switch record.
     */
    if (inserted)
    {
      EndPos = StartPos + SizeOfXLogRecord;
      if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
      {
        uint64    offset = XLogSegmentOffset(EndPos, wal_segment_size);

        if (offset == EndPos % XLOG_BLCKSZ)
          EndPos += SizeOfXLogLongPHD;
        else
          EndPos += SizeOfXLogShortPHD;
      }
    }
  }

#ifdef WAL_DEBUG
  if (XLOG_DEBUG)
  {
    static XLogReaderState *debug_reader = NULL;
    XLogRecord *record;
    DecodedXLogRecord *decoded;
    StringInfoData buf;
    StringInfoData recordBuf;
    char     *errormsg = NULL;
    MemoryContext oldCxt;

    oldCxt = MemoryContextSwitchTo(walDebugCxt);

    initStringInfo(&buf);
    appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos));

    /*
     * We have to piece together the WAL record data from the XLogRecData
     * entries, so that we can pass it to the rm_desc function as one
     * contiguous chunk.
     */
    initStringInfo(&recordBuf);
    for (; rdata != NULL; rdata = rdata->next)
      appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);

    /* We also need temporary space to decode the record. */
    record = (XLogRecord *) recordBuf.data;
    decoded = (DecodedXLogRecord *)
      palloc(DecodeXLogRecordRequiredSpace(record->xl_tot_len));

    if (!debug_reader)
      debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
                        XL_ROUTINE(.page_read = NULL,
                             .segment_open = NULL,
                             .segment_close = NULL),
                        NULL);
    if (!debug_reader)
    {
      appendStringInfoString(&buf, "error decoding record: out of memory while allocating a WAL reading processor");
    }
    else if (!DecodeXLogRecord(debug_reader,
                   decoded,
                   record,
                   EndPos,
                   &errormsg))
    {
      appendStringInfo(&buf, "error decoding record: %s",
               errormsg ? errormsg : "no error message");
    }
    else
    {
      appendStringInfoString(&buf, " - ");

      debug_reader->record = decoded;
      xlog_outdesc(&buf, debug_reader);
      debug_reader->record = NULL;
    }
    elog(LOG, "%s", buf.data);

    pfree(decoded);
    pfree(buf.data);
    pfree(recordBuf.data);
    MemoryContextSwitchTo(oldCxt);
  }
#endif

  /*
   * Update our global variables
   */
  ProcLastRecPtr = StartPos;
  XactLastRecEnd = EndPos;

  /* Report WAL traffic to the instrumentation. */
  if (inserted)
  {
    pgWalUsage.wal_bytes += rechdr->xl_tot_len;
    pgWalUsage.wal_records++;
    pgWalUsage.wal_fpi += num_fpi;
  }

  return EndPos;
}
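
/*
 * Illustrative caller-side sketch (an addition for this report, not part of
 * xlog.c): as the header comment above says, most callers should go through
 * xloginsert.c rather than calling XLogInsertRecord() directly.  The record
 * struct xl_example, info code XLOG_EXAMPLE_INFO, and rmgr id RM_EXAMPLE_ID
 * below are hypothetical; XLogBeginInsert(), XLogRegisterData(), XLogInsert()
 * and XLogFlush() are the real xloginsert.c/xlog.c entry points.
 */
#ifdef NOT_USED              /* sketch only, never compiled */
static XLogRecPtr
log_example_change(xl_example *xlrec)
{
  XLogRecPtr  lsn;

  XLogBeginInsert();
  XLogRegisterData((char *) xlrec, sizeof(xl_example));

  /* assembles the record and ends up in XLogInsertRecord() above */
  lsn = XLogInsert(RM_EXAMPLE_ID, XLOG_EXAMPLE_INFO);

  /* force the record to disk before any dependent data page is written */
  XLogFlush(lsn);

  return lsn;
}
#endif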

/*
 * Reserves the right amount of space for a record of given size from the WAL.
 * *StartPos is set to the beginning of the reserved section, *EndPos to
 * its end+1. *PrevPtr is set to the beginning of the previous record; it is
 * used to set the xl_prev of this record.
 *
 * This is the performance critical part of XLogInsert that must be serialized
 * across backends. The rest can happen mostly in parallel. Try to keep this
 * section as short as possible; insertpos_lck can be heavily contended on a
 * busy system.
 *
 * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
 * where we actually copy the record to the reserved space.
 *
 * NB: Testing shows that XLogInsertRecord runs faster if this code is inlined;
 * however, because there are two call sites, the compiler is reluctant to
 * inline. We use pg_attribute_always_inline here to try to convince it.
 */
static pg_attribute_always_inline void
ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
              XLogRecPtr *PrevPtr)
{
  XLogCtlInsert *Insert = &XLogCtl->Insert;
  uint64    startbytepos;
  uint64    endbytepos;
  uint64    prevbytepos;

  size = MAXALIGN(size);

  /* All (non xlog-switch) records should contain data. */
  Assert(size > SizeOfXLogRecord);

  /*
   * The duration the spinlock needs to be held is minimized by minimizing
   * the calculations that have to be done while holding the lock. The
   * current tip of reserved WAL is kept in CurrBytePos, as a byte position
   * that only counts "usable" bytes in WAL, that is, it excludes all WAL
   * page headers. The mapping between "usable" byte positions and physical
   * positions (XLogRecPtrs) can be done outside the locked region, and
   * because the usable byte position doesn't include any headers, reserving
   * X bytes from WAL is almost as simple as "CurrBytePos += X".
   */
  SpinLockAcquire(&Insert->insertpos_lck);

  startbytepos = Insert->CurrBytePos;
  endbytepos = startbytepos + size;
  prevbytepos = Insert->PrevBytePos;
  Insert->CurrBytePos = endbytepos;
  Insert->PrevBytePos = startbytepos;

  SpinLockRelease(&Insert->insertpos_lck);

  *StartPos = XLogBytePosToRecPtr(startbytepos);
  *EndPos = XLogBytePosToEndRecPtr(endbytepos);
  *PrevPtr = XLogBytePosToRecPtr(prevbytepos);

  /*
   * Check that the conversions between "usable byte positions" and
   * XLogRecPtrs work consistently in both directions.
   */
  Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
  Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
  Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
}
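
/*
 * Worked example (an addition for this report, not part of xlog.c): suppose
 * Insert->CurrBytePos = 1000 and Insert->PrevBytePos = 950 when a backend
 * reserves a 56-byte record (the size already MAXALIGNed).  Under the
 * spinlock:
 *
 *   startbytepos = 1000  -> *StartPos
 *   endbytepos   = 1056  -> *EndPos
 *   prevbytepos  = 950   -> *PrevPtr (becomes the record's xl_prev)
 *   CurrBytePos  = 1056, PrevBytePos = 1000
 *
 * A concurrent reservation arriving immediately afterwards therefore starts
 * at usable byte position 1056, and its xl_prev will point at 1000.
 */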

/*
 * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
 *
 * A log-switch record is handled slightly differently. The rest of the
 * segment will be reserved for this insertion, as indicated by the returned
 * *EndPos value. However, if we are already at the beginning of the current
 * segment, *StartPos and *EndPos are set to the current location without
 * reserving any space, and the function returns false.
 */
static bool
ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
{
  XLogCtlInsert *Insert = &XLogCtl->Insert;
  uint64    startbytepos;
  uint64    endbytepos;
  uint64    prevbytepos;
  uint32    size = MAXALIGN(SizeOfXLogRecord);
  XLogRecPtr  ptr;
  uint32    segleft;

  /*
   * These calculations are a bit heavy-weight to be done while holding a
   * spinlock, but since we're holding all the WAL insertion locks, there
   * are no other inserters competing for it. GetXLogInsertRecPtr() does
   * compete for it, but that's not called very frequently.
   */
  SpinLockAcquire(&Insert->insertpos_lck);

  startbytepos = Insert->CurrBytePos;

  ptr = XLogBytePosToEndRecPtr(startbytepos);
  if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
  {
    SpinLockRelease(&Insert->insertpos_lck);
    *EndPos = *StartPos = ptr;
    return false;
  }

  endbytepos = startbytepos + size;
  prevbytepos = Insert->PrevBytePos;

  *StartPos = XLogBytePosToRecPtr(startbytepos);
  *EndPos = XLogBytePosToEndRecPtr(endbytepos);

  segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
  if (segleft != wal_segment_size)
  {
    /* consume the rest of the segment */
    *EndPos += segleft;
    endbytepos = XLogRecPtrToBytePos(*EndPos);
  }
  Insert->CurrBytePos = endbytepos;
  Insert->PrevBytePos = startbytepos;

  SpinLockRelease(&Insert->insertpos_lck);

  *PrevPtr = XLogBytePosToRecPtr(prevbytepos);

  Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
  Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
  Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
  Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);

  return true;
}
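
/*
 * Worked example (an addition for this report, not part of xlog.c; assumes
 * wal_segment_size = 16 MB): if the current insert position maps to byte
 * offset 0x500000 (5 MB) within a segment, ReserveXLogSwitch() reserves the
 * switch record itself plus the remaining ~11 MB of the segment, so *EndPos
 * lands exactly on the next segment boundary (as the Asserts above verify).
 * If the position is already at offset 0 of a segment, nothing is reserved
 * and the function returns false.
 */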

/*
 * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
 * area in the WAL.
 */
static void
CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
          XLogRecPtr StartPos, XLogRecPtr EndPos, TimeLineID tli)
{
  char     *currpos;
  int     freespace;
  int     written;
  XLogRecPtr  CurrPos;
  XLogPageHeader pagehdr;

  /*
   * Get a pointer to the right place in the right WAL buffer to start
   * inserting to.
   */
  CurrPos = StartPos;
  currpos = GetXLogBuffer(CurrPos, tli);
  freespace = INSERT_FREESPACE(CurrPos);

  /*
   * There should be enough space for at least the first field (xl_tot_len)
   * on this page.
   */
  Assert(freespace >= sizeof(uint32));

  /* Copy record data */
  written = 0;
  while (rdata != NULL)
  {
    const char *rdata_data = rdata->data;
    int     rdata_len = rdata->len;

    while (rdata_len > freespace)
    {
      /*
       * Write what fits on this page, and continue on the next page.
       */
      Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
      memcpy(currpos, rdata_data, freespace);
      rdata_data += freespace;
      rdata_len -= freespace;
      written += freespace;
      CurrPos += freespace;

      /*
       * Get pointer to beginning of next page, and set the xlp_rem_len
       * in the page header. Set XLP_FIRST_IS_CONTRECORD.
       *
       * It's safe to set the contrecord flag and xlp_rem_len without a
       * lock on the page. All the other flags were already set when the
       * page was initialized, in AdvanceXLInsertBuffer, and we're the
       * only backend that needs to set the contrecord flag.
       */
      currpos = GetXLogBuffer(CurrPos, tli);
      pagehdr = (XLogPageHeader) currpos;
      pagehdr->xlp_rem_len = write_len - written;
      pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;

      /* skip over the page header */
      if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
      {
        CurrPos += SizeOfXLogLongPHD;
        currpos += SizeOfXLogLongPHD;
      }
      else
      {
        CurrPos += SizeOfXLogShortPHD;
        currpos += SizeOfXLogShortPHD;
      }
      freespace = INSERT_FREESPACE(CurrPos);
    }

    Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
    memcpy(currpos, rdata_data, rdata_len);
    currpos += rdata_len;
    CurrPos += rdata_len;
    freespace -= rdata_len;
    written += rdata_len;

    rdata = rdata->next;
  }
  Assert(written == write_len);

  /*
   * If this was an xlog-switch, it's not enough to write the switch record;
   * we also have to consume all the remaining space in the WAL segment.  We
   * have already reserved that space, but we need to actually fill it.
   */
  if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
  {
    /* An xlog-switch record doesn't contain any data besides the header */
    Assert(write_len == SizeOfXLogRecord);

    /* Assert that we did reserve the right amount of space */
    Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);

    /* Use up all the remaining space on the current page */
    CurrPos += freespace;

    /*
     * Cause all remaining pages in the segment to be flushed, leaving the
     * XLog position where it should be, at the start of the next segment.
     * We do this one page at a time, to make sure we don't deadlock
     * against ourselves if wal_buffers < wal_segment_size.
     */
    while (CurrPos < EndPos)
    {
      /*
       * The minimal action to flush the page would be to call
       * WALInsertLockUpdateInsertingAt(CurrPos) followed by
       * AdvanceXLInsertBuffer(...).  The page would be left initialized
       * mostly to zeros, except for the page header (always the short
       * variant, as this is never a segment's first page).
       *
       * The large vistas of zeros are good for compressibility, but the
       * headers interrupting them every XLOG_BLCKSZ (with values that
       * differ from page to page) are not.  The effect varies with
       * compression tool, but bzip2 for instance compresses about an
       * order of magnitude worse if those headers are left in place.
       *
       * Rather than complicating AdvanceXLInsertBuffer itself (which is
       * called in heavily-loaded circumstances as well as this lightly-
       * loaded one) with variant behavior, we just use GetXLogBuffer
       * (which itself calls the two methods we need) to get the pointer
       * and zero most of the page.  Then we just zero the page header.
       */
      currpos = GetXLogBuffer(CurrPos, tli);
      MemSet(currpos, 0, SizeOfXLogShortPHD);

      CurrPos += XLOG_BLCKSZ;
    }
  }
  else
  {
    /* Align the end position, so that the next record starts aligned */
    CurrPos = MAXALIGN64(CurrPos);
  }

  if (CurrPos != EndPos)
    ereport(PANIC,
        errcode(ERRCODE_DATA_CORRUPTED),
        errmsg_internal("space reserved for WAL record does not match what was written"));
}
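
/*
 * Worked example of the page-crossing loop above (an addition for this
 * report, not part of xlog.c; assumes XLOG_BLCKSZ = 8192 and a 24-byte
 * short page header): copying a single 100-byte record chunk with only
 * 40 bytes of freespace left writes 40 bytes on the current page, then
 * stamps the next page's header with xlp_rem_len = 100 - 40 = 60 and
 * XLP_FIRST_IS_CONTRECORD, skips the 24-byte short header, and copies the
 * remaining 60 bytes there, leaving freespace = 8168 - 60 = 8108.
 */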

/*
 * Acquire a WAL insertion lock, for inserting to WAL.
 */
static void
WALInsertLockAcquire(void)
{
  bool    immed;

  /*
   * It doesn't matter which of the WAL insertion locks we acquire, so try
   * the one we used last time.  If the system isn't particularly busy, it's
   * a good bet that it's still available, and it's good to have some
   * affinity to a particular lock so that you don't unnecessarily bounce
   * cache lines between processes when there's no contention.
   *
   * If this is the first time through in this backend, pick a lock
   * (semi-)randomly.  This allows the locks to be used evenly if you have a
   * lot of very short connections.
   */
  static int  lockToTry = -1;

  if (lockToTry == -1)
    lockToTry = MyProcNumber % NUM_XLOGINSERT_LOCKS;
  MyLockNo = lockToTry;

  /*
   * The insertingAt value is initially set to 0, as we don't know our
   * insert location yet.
   */
  immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
  if (!immed)
  {
    /*
     * If we couldn't get the lock immediately, try another lock next
     * time.  On a system with more insertion locks than concurrent
     * inserters, this causes all the inserters to eventually migrate to a
     * lock that no-one else is using.  On a system with more inserters
     * than locks, it still helps to distribute the inserters evenly
     * across the locks.
     */
    lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
  }
}
1421
1422
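
The affinity-then-migrate policy described above is worth seeing in isolation: keep the lock you last used while it stays uncontended, and shift to the next slot whenever you had to wait. A minimal self-contained sketch of the same idea using POSIX threads (N_LOCKS, init_locks, and pick_lock are hypothetical names, not part of xlog.c):

#include <pthread.h>

#define N_LOCKS 8                       /* stands in for NUM_XLOGINSERT_LOCKS */

static pthread_mutex_t locks[N_LOCKS];
static _Thread_local int lock_to_try = -1;

static void
init_locks(void)                        /* call once at startup */
{
    for (int i = 0; i < N_LOCKS; i++)
        pthread_mutex_init(&locks[i], NULL);
}

static int
pick_lock(int my_id)
{
    int     slot;

    if (lock_to_try == -1)
        lock_to_try = my_id % N_LOCKS;  /* semi-random first pick */
    slot = lock_to_try;

    if (pthread_mutex_trylock(&locks[slot]) != 0)
    {
        /* Contended: wait for this slot anyway, but migrate next time. */
        pthread_mutex_lock(&locks[slot]);
        lock_to_try = (lock_to_try + 1) % N_LOCKS;
    }
    return slot;                        /* caller unlocks locks[slot] */
}
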
/*
1423
 * Acquire all WAL insertion locks, to prevent other backends from inserting
1424
 * to WAL.
1425
 */
1426
static void
1427
WALInsertLockAcquireExclusive(void)
1428
0
{
1429
0
  int     i;
1430
1431
  /*
1432
   * When holding all the locks, all but the last lock's insertingAt
1433
   * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1434
   * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1435
   */
1436
0
  for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1437
0
  {
1438
0
    LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1439
0
    LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1440
0
            &WALInsertLocks[i].l.insertingAt,
1441
0
            PG_UINT64_MAX);
1442
0
  }
1443
  /* Variable value reset to 0 at release */
1444
0
  LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1445
1446
0
  holdingAllLocks = true;
1447
0
}
1448
1449
/*
1450
 * Release our insertion lock (or locks, if we're holding them all).
1451
 *
1452
 * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1453
 * next time the lock is acquired.
1454
 */
1455
static void
1456
WALInsertLockRelease(void)
1457
0
{
1458
0
  if (holdingAllLocks)
1459
0
  {
1460
0
    int     i;
1461
1462
0
    for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1463
0
      LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1464
0
                  &WALInsertLocks[i].l.insertingAt,
1465
0
                  0);
1466
1467
0
    holdingAllLocks = false;
1468
0
  }
1469
0
  else
1470
0
  {
1471
0
    LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1472
0
                &WALInsertLocks[MyLockNo].l.insertingAt,
1473
0
                0);
1474
0
  }
1475
0
}
1476
1477
/*
1478
 * Update our insertingAt value, to let others know that we've finished
1479
 * inserting up to that point.
1480
 */
1481
static void
1482
WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1483
0
{
1484
0
  if (holdingAllLocks)
1485
0
  {
1486
    /*
1487
     * We use the last lock to mark our actual position; see comments in
1488
     * WALInsertLockAcquireExclusive.
1489
     */
1490
0
    LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1491
0
            &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1492
0
            insertingAt);
1493
0
  }
1494
0
  else
1495
0
    LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1496
0
            &WALInsertLocks[MyLockNo].l.insertingAt,
1497
0
            insertingAt);
1498
0
}
1499
1500
/*
1501
 * Wait for any WAL insertions < upto to finish.
1502
 *
1503
 * Returns the location of the oldest insertion that is still in-progress.
1504
 * Any WAL prior to that point has been fully copied into WAL buffers, and
1505
 * can be flushed out to disk. Because this waits for any insertions older
1506
 * than 'upto' to finish, the return value is always >= 'upto'.
1507
 *
1508
 * Note: When you are about to write out WAL, you must call this function
1509
 * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1510
 * need to wait for an insertion to finish (or at least advance to the next
1511
 * uninitialized page), and the inserter might need to evict an old WAL buffer
1512
 * to make room for a new one, which in turn requires WALWriteLock.
1513
 */
1514
static XLogRecPtr
1515
WaitXLogInsertionsToFinish(XLogRecPtr upto)
1516
0
{
1517
0
  uint64    bytepos;
1518
0
  XLogRecPtr  inserted;
1519
0
  XLogRecPtr  reservedUpto;
1520
0
  XLogRecPtr  finishedUpto;
1521
0
  XLogCtlInsert *Insert = &XLogCtl->Insert;
1522
0
  int     i;
1523
1524
0
  if (MyProc == NULL)
1525
0
    elog(PANIC, "cannot wait without a PGPROC structure");
1526
1527
  /*
1528
   * Check if there's any work to do.  Use a barrier to ensure we get the
1529
   * freshest value.
1530
   */
1531
0
  inserted = pg_atomic_read_membarrier_u64(&XLogCtl->logInsertResult);
1532
0
  if (upto <= inserted)
1533
0
    return inserted;
1534
1535
  /* Read the current insert position */
1536
0
  SpinLockAcquire(&Insert->insertpos_lck);
1537
0
  bytepos = Insert->CurrBytePos;
1538
0
  SpinLockRelease(&Insert->insertpos_lck);
1539
0
  reservedUpto = XLogBytePosToEndRecPtr(bytepos);
1540
1541
  /*
1542
   * No-one should request to flush a piece of WAL that hasn't even been
1543
   * reserved yet. However, it can happen if there is a block with a bogus
1544
   * LSN on disk, for example. XLogFlush checks for that situation and
1545
   * complains, but only after the flush. Here we just assume that to mean
1546
   * that all WAL that has been reserved needs to be finished. In this
1547
   * corner-case, the return value can be smaller than 'upto' argument.
1548
   */
1549
0
  if (upto > reservedUpto)
1550
0
  {
1551
0
    ereport(LOG,
1552
0
        (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X",
1553
0
            LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))));
1554
0
    upto = reservedUpto;
1555
0
  }
1556
1557
  /*
1558
   * Loop through all the locks, sleeping on any in-progress insert older
1559
   * than 'upto'.
1560
   *
1561
   * finishedUpto is our return value, indicating the point up to which all
1562
   * the WAL insertions have been finished. Initialize it to the head of
1563
   * reserved WAL, and as we iterate through the insertion locks, back it
1564
   * out for any insertion that's still in progress.
1565
   */
1566
0
  finishedUpto = reservedUpto;
1567
0
  for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1568
0
  {
1569
0
    XLogRecPtr  insertingat = InvalidXLogRecPtr;
1570
1571
0
    do
1572
0
    {
1573
      /*
1574
       * See if this insertion is in progress.  LWLockWaitForVar will
1575
       * wait for the lock to be released, or for the 'value' to be set
1576
       * by a LWLockUpdateVar call.  When a lock is initially acquired,
1577
       * its value is 0 (InvalidXLogRecPtr), which means that we don't
1578
       * know where it's inserting yet.  We will have to wait for it. If
1579
       * it's a small insertion, the record will most likely fit on the
1580
       * same page and the inserter will release the lock without ever
1581
       * calling LWLockUpdateVar.  But if it has to sleep, it will
1582
       * advertise the insertion point with LWLockUpdateVar before
1583
       * sleeping.
1584
       *
1585
       * In this loop we are only waiting for insertions that started
1586
       * before WaitXLogInsertionsToFinish was called.  The lack of
1587
       * memory barriers in the loop means that we might see locks as
1588
       * "unused" that have since become used.  This is fine because
1589
       * they can only be used for later insertions that we would not
1590
       * want to wait on anyway.  Not taking a lock to acquire the
1591
       * current insertingAt value means that we might see older
1592
       * insertingAt values.  This is also fine, because if we read a
1593
       * value too old, we will add ourselves to the wait queue, which
1594
       * contains atomic operations.
1595
       */
1596
0
      if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
1597
0
                 &WALInsertLocks[i].l.insertingAt,
1598
0
                 insertingat, &insertingat))
1599
0
      {
1600
        /* the lock was free, so no insertion in progress */
1601
0
        insertingat = InvalidXLogRecPtr;
1602
0
        break;
1603
0
      }
1604
1605
      /*
1606
       * This insertion is still in progress. Have to wait, unless the
1607
       * inserter has proceeded past 'upto'.
1608
       */
1609
0
    } while (insertingat < upto);
1610
1611
0
    if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
1612
0
      finishedUpto = insertingat;
1613
0
  }
1614
1615
  /*
1616
   * Advance the limit we know to have been inserted and return the freshest
1617
   * value we know of, which might be beyond what we requested if somebody
1618
   * is concurrently doing this with an 'upto' pointer ahead of us.
1619
   */
1620
0
  finishedUpto = pg_atomic_monotonic_advance_u64(&XLogCtl->logInsertResult,
1621
0
                           finishedUpto);
1622
1623
0
  return finishedUpto;
1624
0
}
1625
1626
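
Setting the waiting aside, the loop above computes a simple reduction: the minimum over all advertised insertingAt positions, seeded with the reserved head. A toy restatement (NLOCKS and the array contents are illustrative; 0 plays the role of InvalidXLogRecPtr, meaning nothing is in progress on that lock):

#include <stdint.h>
#include <stdio.h>

#define NLOCKS 8

/* Oldest still-in-progress insertion, or reserved_upto if none. */
static uint64_t
oldest_in_progress(const uint64_t inserting_at[NLOCKS], uint64_t reserved_upto)
{
    uint64_t    finished_upto = reserved_upto;

    for (int i = 0; i < NLOCKS; i++)
        if (inserting_at[i] != 0 && inserting_at[i] < finished_upto)
            finished_upto = inserting_at[i];
    return finished_upto;
}

int
main(void)
{
    uint64_t    pos[NLOCKS] = {0, 5000, 0, 7200, 0, 0, 0, 0};

    /* Everything below 5000 is fully in the buffers and safe to write. */
    printf("%llu\n", (unsigned long long) oldest_in_progress(pos, 9000));
    return 0;
}

The real function additionally sleeps on each lock until its advertised position passes 'upto', which is why the returned value is normally >= 'upto'.
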
/*
1627
 * Get a pointer to the right location in the WAL buffer containing the
1628
 * given XLogRecPtr.
1629
 *
1630
 * If the page is not initialized yet, it is initialized. That might require
1631
 * evicting an old dirty buffer from the buffer cache, which means I/O.
1632
 *
1633
 * The caller must ensure that the page containing the requested location
1634
 * isn't evicted yet, and won't be evicted. The way to ensure that is to
1635
 * hold onto a WAL insertion lock with the insertingAt position set to
1636
 * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1637
 * to evict an old page from the buffer. (This means that once you call
1638
 * GetXLogBuffer() with a given 'ptr', you must not access anything before
1639
 * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1640
 * later, because older buffers might be recycled already)
1641
 */
1642
static char *
1643
GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
1644
0
{
1645
0
  int     idx;
1646
0
  XLogRecPtr  endptr;
1647
0
  static uint64 cachedPage = 0;
1648
0
  static char *cachedPos = NULL;
1649
0
  XLogRecPtr  expectedEndPtr;
1650
1651
  /*
1652
   * Fast path for the common case that we need to access the same page
1653
   * as last time.
1654
   */
1655
0
  if (ptr / XLOG_BLCKSZ == cachedPage)
1656
0
  {
1657
0
    Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1658
0
    Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1659
0
    return cachedPos + ptr % XLOG_BLCKSZ;
1660
0
  }
1661
1662
  /*
1663
   * The XLog buffer cache is organized so that a page is always loaded to a
1664
   * particular buffer.  That way we can easily calculate the buffer a given
1665
   * page must be loaded into, from the XLogRecPtr alone.
1666
   */
1667
0
  idx = XLogRecPtrToBufIdx(ptr);
1668
1669
  /*
1670
   * See what page is loaded in the buffer at the moment. It could be the
1671
   * page we're looking for, or something older. It can't be anything newer
1672
   * - that would imply the page we're looking for has already been written
1673
   * out to disk and evicted, and the caller is responsible for making sure
1674
   * that doesn't happen.
1675
   *
1676
   * We don't hold a lock while we read the value. If someone is just about
1677
   * to initialize or has just initialized the page, it's possible that we
1678
   * get InvalidXLogRecPtr. That's ok, we'll grab the mapping lock (in
1679
   * AdvanceXLInsertBuffer) and retry if we see anything other than the page
1680
   * we're looking for.
1681
   */
1682
0
  expectedEndPtr = ptr;
1683
0
  expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
1684
1685
0
  endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
1686
0
  if (expectedEndPtr != endptr)
1687
0
  {
1688
0
    XLogRecPtr  initializedUpto;
1689
1690
    /*
1691
     * Before calling AdvanceXLInsertBuffer(), which can block, let others
1692
     * know how far we're finished with inserting the record.
1693
     *
1694
     * NB: If 'ptr' points to just after the page header, advertise a
1695
     * position at the beginning of the page rather than 'ptr' itself. If
1696
     * there are no other insertions running, someone might try to flush
1697
     * up to our advertised location. If we advertised a position after
1698
     * the page header, someone might try to flush the page header, even
1699
     * though the page might actually not be initialized yet. As the first
1700
     * inserter on the page, we are effectively responsible for making
1701
     * sure that it's initialized, before we let insertingAt move past
1702
     * the page header.
1703
     */
1704
0
    if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
1705
0
      XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
1706
0
      initializedUpto = ptr - SizeOfXLogShortPHD;
1707
0
    else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
1708
0
         XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
1709
0
      initializedUpto = ptr - SizeOfXLogLongPHD;
1710
0
    else
1711
0
      initializedUpto = ptr;
1712
1713
0
    WALInsertLockUpdateInsertingAt(initializedUpto);
1714
1715
0
    AdvanceXLInsertBuffer(ptr, tli, false);
1716
0
    endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
1717
1718
0
    if (expectedEndPtr != endptr)
1719
0
      elog(PANIC, "could not find WAL buffer for %X/%X",
1720
0
         LSN_FORMAT_ARGS(ptr));
1721
0
  }
1722
0
  else
1723
0
  {
1724
    /*
1725
     * Make sure the initialization of the page is visible to us, and
1726
     * won't arrive later to overwrite the WAL data we write on the page.
1727
     */
1728
0
    pg_memory_barrier();
1729
0
  }
1730
1731
  /*
1732
   * Found the buffer holding this page. Return a pointer to the right
1733
   * offset within the page.
1734
   */
1735
0
  cachedPage = ptr / XLOG_BLCKSZ;
1736
0
  cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1737
1738
0
  Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1739
0
  Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1740
1741
0
  return cachedPos + ptr % XLOG_BLCKSZ;
1742
0
}
1743
1744
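
The cachedPage/cachedPos pair above is a one-entry, per-backend memoization of the page lookup. The same fast path, stripped to its essentials (PAGE_SZ, get_buffer, and slow_lookup are hypothetical; the real miss path is the xlblocks check plus AdvanceXLInsertBuffer):

#include <stdint.h>

#define PAGE_SZ 8192                    /* stands in for XLOG_BLCKSZ */

static char page_store[PAGE_SZ];        /* toy stand-in for the buffer pool */

static char *
slow_lookup(uint64_t pageno)
{
    (void) pageno;                      /* real code validates/initializes */
    return page_store;
}

static char *
get_buffer(uint64_t ptr)
{
    static uint64_t cached_pageno = UINT64_MAX; /* sentinel: nothing cached */
    static char *cached_page = NULL;

    if (ptr / PAGE_SZ != cached_pageno)
    {
        cached_page = slow_lookup(ptr / PAGE_SZ);
        cached_pageno = ptr / PAGE_SZ;
    }
    return cached_page + ptr % PAGE_SZ; /* offset within the cached page */
}
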
/*
1745
 * Read WAL data directly from WAL buffers, if available. Returns the number
1746
 * of bytes read successfully.
1747
 *
1748
 * Fewer than 'count' bytes may be read if some of the requested WAL data has
1749
 * already been evicted.
1750
 *
1751
 * No locks are taken.
1752
 *
1753
 * Caller should ensure that it reads no further than LogwrtResult.Write
1754
 * (which should have been updated by the caller when determining how far to
1755
 * read). The 'tli' argument is only used as a convenient safety check so that
1756
 * callers do not read from WAL buffers on a historical timeline.
1757
 */
1758
Size
1759
WALReadFromBuffers(char *dstbuf, XLogRecPtr startptr, Size count,
1760
           TimeLineID tli)
1761
0
{
1762
0
  char     *pdst = dstbuf;
1763
0
  XLogRecPtr  recptr = startptr;
1764
0
  XLogRecPtr  inserted;
1765
0
  Size    nbytes = count;
1766
1767
0
  if (RecoveryInProgress() || tli != GetWALInsertionTimeLine())
1768
0
    return 0;
1769
1770
0
  Assert(!XLogRecPtrIsInvalid(startptr));
1771
1772
  /*
1773
   * Caller should ensure that the requested data has been inserted into WAL
1774
   * buffers before we try to read it.
1775
   */
1776
0
  inserted = pg_atomic_read_u64(&XLogCtl->logInsertResult);
1777
0
  if (startptr + count > inserted)
1778
0
    ereport(ERROR,
1779
0
        errmsg("cannot read past end of generated WAL: requested %X/%X, current position %X/%X",
1780
0
             LSN_FORMAT_ARGS(startptr + count),
1781
0
             LSN_FORMAT_ARGS(inserted)));
1782
1783
  /*
1784
   * Loop through the buffers without a lock. For each buffer, atomically
1785
   * read and verify the end pointer, then copy the data out, and finally
1786
   * re-read and re-verify the end pointer.
1787
   *
1788
   * Once a page is evicted, it never returns to the WAL buffers, so if the
1789
   * end pointer matches the expected end pointer before and after we copy
1790
   * the data, then the right page must have been present during the data
1791
   * copy. Read barriers are necessary to ensure that the data copy actually
1792
   * happens between the two verification steps.
1793
   *
1794
   * If either verification fails, we simply terminate the loop and return
1795
   * with the data that had already been copied out successfully.
1796
   */
1797
0
  while (nbytes > 0)
1798
0
  {
1799
0
    uint32    offset = recptr % XLOG_BLCKSZ;
1800
0
    int     idx = XLogRecPtrToBufIdx(recptr);
1801
0
    XLogRecPtr  expectedEndPtr;
1802
0
    XLogRecPtr  endptr;
1803
0
    const char *page;
1804
0
    const char *psrc;
1805
0
    Size    npagebytes;
1806
1807
    /*
1808
     * Calculate the end pointer we expect in the xlblocks array if the
1809
     * correct page is present.
1810
     */
1811
0
    expectedEndPtr = recptr + (XLOG_BLCKSZ - offset);
1812
1813
    /*
1814
     * First verification step: check that the correct page is present in
1815
     * the WAL buffers.
1816
     */
1817
0
    endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
1818
0
    if (expectedEndPtr != endptr)
1819
0
      break;
1820
1821
    /*
1822
     * The correct page is present (or was at the time the endptr was
1823
     * read; must re-verify later). Calculate pointer to source data and
1824
     * determine how much data to read from this page.
1825
     */
1826
0
    page = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1827
0
    psrc = page + offset;
1828
0
    npagebytes = Min(nbytes, XLOG_BLCKSZ - offset);
1829
1830
    /*
1831
     * Ensure that the data copy and the first verification step are not
1832
     * reordered.
1833
     */
1834
0
    pg_read_barrier();
1835
1836
    /* data copy */
1837
0
    memcpy(pdst, psrc, npagebytes);
1838
1839
    /*
1840
     * Ensure that the data copy and the second verification step are not
1841
     * reordered.
1842
     */
1843
0
    pg_read_barrier();
1844
1845
    /*
1846
     * Second verification step: check that the page we read from wasn't
1847
     * evicted while we were copying the data.
1848
     */
1849
0
    endptr = pg_atomic_read_u64(&XLogCtl->xlblocks[idx]);
1850
0
    if (expectedEndPtr != endptr)
1851
0
      break;
1852
1853
0
    pdst += npagebytes;
1854
0
    recptr += npagebytes;
1855
0
    nbytes -= npagebytes;
1856
0
  }
1857
1858
0
  Assert(pdst - dstbuf <= count);
1859
1860
0
  return pdst - dstbuf;
1861
0
}
1862
1863
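
The verify / copy / re-verify protocol above is a lock-free optimistic read, similar in spirit to a seqlock. A self-contained sketch with C11 atomics (wal_page and read_page_if_resident are hypothetical names; the two fences play the role of pg_read_barrier()):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define PAGE_SZ 8192

typedef struct
{
    _Atomic uint64_t end_lsn;           /* stands in for xlblocks[idx] */
    char        data[PAGE_SZ];
} wal_page;

/*
 * Copy the page out iff the expected page stayed resident for the whole
 * copy.  Because an evicted page never returns to the buffers, matching
 * end_lsn both before and after the copy proves the right bytes were read.
 */
static bool
read_page_if_resident(wal_page *p, uint64_t expected_end, char *dst)
{
    if (atomic_load(&p->end_lsn) != expected_end)
        return false;                   /* wrong page present */
    atomic_thread_fence(memory_order_acquire); /* check before copy */

    memcpy(dst, p->data, PAGE_SZ);

    atomic_thread_fence(memory_order_acquire); /* copy before re-check */
    return atomic_load(&p->end_lsn) == expected_end;
}
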
/*
1864
 * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1865
 * is the position starting from the beginning of WAL, excluding all WAL
1866
 * page headers.
1867
 */
1868
static XLogRecPtr
1869
XLogBytePosToRecPtr(uint64 bytepos)
1870
0
{
1871
0
  uint64    fullsegs;
1872
0
  uint64    fullpages;
1873
0
  uint64    bytesleft;
1874
0
  uint32    seg_offset;
1875
0
  XLogRecPtr  result;
1876
1877
0
  fullsegs = bytepos / UsableBytesInSegment;
1878
0
  bytesleft = bytepos % UsableBytesInSegment;
1879
1880
0
  if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1881
0
  {
1882
    /* fits on first page of segment */
1883
0
    seg_offset = bytesleft + SizeOfXLogLongPHD;
1884
0
  }
1885
0
  else
1886
0
  {
1887
    /* account for the first page on segment with long header */
1888
0
    seg_offset = XLOG_BLCKSZ;
1889
0
    bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1890
1891
0
    fullpages = bytesleft / UsableBytesInPage;
1892
0
    bytesleft = bytesleft % UsableBytesInPage;
1893
1894
0
    seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1895
0
  }
1896
1897
0
  XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
1898
1899
0
  return result;
1900
0
}
1901
1902
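
As a concrete check of the arithmetic: with 8 KB pages, 16 MB segments, and typical header sizes of 40 (long) and 24 (short) bytes, usable byte position 0 maps to just past the first long header, and the position right after the first page's usable bytes maps to just past the second page's short header. A self-contained sketch (all constants are assumptions standing in for XLOG_BLCKSZ, wal_segment_size, SizeOfXLogLongPHD, and SizeOfXLogShortPHD):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BLCKSZ      8192ULL
#define SEG_SIZE    (16ULL * 1024 * 1024)
#define LONG_PHD    40ULL               /* assumed long page header size */
#define SHORT_PHD   24ULL               /* assumed short page header size */

#define USABLE_IN_PAGE  (BLCKSZ - SHORT_PHD)
#define USABLE_IN_SEG \
    ((SEG_SIZE / BLCKSZ) * USABLE_IN_PAGE - (LONG_PHD - SHORT_PHD))

static uint64_t
bytepos_to_ptr(uint64_t bytepos)        /* same shape as XLogBytePosToRecPtr */
{
    uint64_t    fullsegs = bytepos / USABLE_IN_SEG;
    uint64_t    bytesleft = bytepos % USABLE_IN_SEG;
    uint64_t    seg_offset;

    if (bytesleft < BLCKSZ - LONG_PHD)
        seg_offset = bytesleft + LONG_PHD;  /* first page of the segment */
    else
    {
        bytesleft -= BLCKSZ - LONG_PHD;
        seg_offset = BLCKSZ +
            (bytesleft / USABLE_IN_PAGE) * BLCKSZ +
            bytesleft % USABLE_IN_PAGE + SHORT_PHD;
    }
    return fullsegs * SEG_SIZE + seg_offset;
}

int
main(void)
{
    assert(bytepos_to_ptr(0) == LONG_PHD);
    assert(bytepos_to_ptr(BLCKSZ - LONG_PHD) == BLCKSZ + SHORT_PHD);
    printf("ok\n");
    return 0;
}
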
/*
1903
 * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1904
 * returns a pointer to the beginning of the page (i.e. before the page header),
1905
 * not to where the first xlog record on that page would go. This is used
1906
 * when converting a pointer to the end of a record.
1907
 */
1908
static XLogRecPtr
1909
XLogBytePosToEndRecPtr(uint64 bytepos)
1910
0
{
1911
0
  uint64    fullsegs;
1912
0
  uint64    fullpages;
1913
0
  uint64    bytesleft;
1914
0
  uint32    seg_offset;
1915
0
  XLogRecPtr  result;
1916
1917
0
  fullsegs = bytepos / UsableBytesInSegment;
1918
0
  bytesleft = bytepos % UsableBytesInSegment;
1919
1920
0
  if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1921
0
  {
1922
    /* fits on first page of segment */
1923
0
    if (bytesleft == 0)
1924
0
      seg_offset = 0;
1925
0
    else
1926
0
      seg_offset = bytesleft + SizeOfXLogLongPHD;
1927
0
  }
1928
0
  else
1929
0
  {
1930
    /* account for the first page on segment with long header */
1931
0
    seg_offset = XLOG_BLCKSZ;
1932
0
    bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1933
1934
0
    fullpages = bytesleft / UsableBytesInPage;
1935
0
    bytesleft = bytesleft % UsableBytesInPage;
1936
1937
0
    if (bytesleft == 0)
1938
0
      seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
1939
0
    else
1940
0
      seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1941
0
  }
1942
1943
0
  XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
1944
1945
0
  return result;
1946
0
}
1947
1948
/*
1949
 * Convert an XLogRecPtr to a "usable byte position".
1950
 */
1951
static uint64
1952
XLogRecPtrToBytePos(XLogRecPtr ptr)
1953
0
{
1954
0
  uint64    fullsegs;
1955
0
  uint32    fullpages;
1956
0
  uint32    offset;
1957
0
  uint64    result;
1958
1959
0
  XLByteToSeg(ptr, fullsegs, wal_segment_size);
1960
1961
0
  fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
1962
0
  offset = ptr % XLOG_BLCKSZ;
1963
1964
0
  if (fullpages == 0)
1965
0
  {
1966
0
    result = fullsegs * UsableBytesInSegment;
1967
0
    if (offset > 0)
1968
0
    {
1969
0
      Assert(offset >= SizeOfXLogLongPHD);
1970
0
      result += offset - SizeOfXLogLongPHD;
1971
0
    }
1972
0
  }
1973
0
  else
1974
0
  {
1975
0
    result = fullsegs * UsableBytesInSegment +
1976
0
      (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
1977
0
      (fullpages - 1) * UsableBytesInPage; /* full pages */
1978
0
    if (offset > 0)
1979
0
    {
1980
0
      Assert(offset >= SizeOfXLogShortPHD);
1981
0
      result += offset - SizeOfXLogShortPHD;
1982
0
    }
1983
0
  }
1984
1985
0
  return result;
1986
0
}
1987
1988
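
The inverse mapping can be checked against the sketch after XLogBytePosToRecPtr above: reusing its constants and bytepos_to_ptr(), any usable byte position survives a round trip through the record-pointer space (ptr_to_bytepos is a hypothetical name mirroring XLogRecPtrToBytePos):

static uint64_t
ptr_to_bytepos(uint64_t ptr)
{
    uint64_t    fullsegs = ptr / SEG_SIZE;
    uint64_t    fullpages = (ptr % SEG_SIZE) / BLCKSZ;
    uint64_t    offset = ptr % BLCKSZ;
    uint64_t    result = fullsegs * USABLE_IN_SEG;

    if (fullpages == 0)
        result += (offset > 0) ? offset - LONG_PHD : 0;
    else
        result += (BLCKSZ - LONG_PHD)                /* first page */
            + (fullpages - 1) * USABLE_IN_PAGE       /* full pages */
            + ((offset > 0) ? offset - SHORT_PHD : 0);
    return result;
}

/* e.g. ptr_to_bytepos(bytepos_to_ptr(10000)) == 10000 */
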
/*
1989
 * Initialize XLOG buffers, writing out old buffers if they still contain
1990
 * unwritten data, up to the page containing 'upto'. Or if 'opportunistic' is
1991
 * true, initialize as many pages as we can without having to write out
1992
 * unwritten data. Any new pages are initialized to zeros, with page headers
1993
 * initialized properly.
1994
 */
1995
static void
1996
AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
1997
0
{
1998
0
  XLogCtlInsert *Insert = &XLogCtl->Insert;
1999
0
  int     nextidx;
2000
0
  XLogRecPtr  OldPageRqstPtr;
2001
0
  XLogwrtRqst WriteRqst;
2002
0
  XLogRecPtr  NewPageEndPtr = InvalidXLogRecPtr;
2003
0
  XLogRecPtr  NewPageBeginPtr;
2004
0
  XLogPageHeader NewPage;
2005
0
  XLogRecPtr  ReservedPtr;
2006
0
  int     npages pg_attribute_unused() = 0;
2007
2008
  /*
2009
   * We must run the loop below inside the critical section as we expect
2010
   * XLogCtl->InitializedUpTo to eventually keep up.  Most callers already
2011
   * run inside a critical section, except for the WAL writer, which passes
2012
   * 'opportunistic == true' and therefore doesn't perform operations that
2013
   * could error out.
2014
   *
2015
   * Start an explicit critical section anyway though.
2016
   */
2017
0
  Assert(CritSectionCount > 0 || opportunistic);
2018
0
  START_CRIT_SECTION();
2019
2020
  /*--
2021
   * Loop until all the pages in the WAL buffer before 'upto' are reserved
2022
   * for initialization.  Multiple processes can initialize different
2023
   * buffers in parallel with this loop, as follows.
2024
   *
2025
   * 1. Reserve page for initialization using XLogCtl->InitializeReserved.
2026
   * 2. Initialize the reserved page.
2027
   * 3. Attempt to advance XLogCtl->InitializedUpTo.
2028
   */
2029
0
  ReservedPtr = pg_atomic_read_u64(&XLogCtl->InitializeReserved);
2030
0
  while (upto >= ReservedPtr || opportunistic)
2031
0
  {
2032
0
    Assert(ReservedPtr % XLOG_BLCKSZ == 0);
2033
2034
    /*
2035
     * Get ending-offset of the buffer page we need to replace.
2036
     *
2037
     * We don't look up xlblocks, but rather calculate the position we must
2038
     * wait to be written. If it has been written, xlblocks will hold this
2039
     * position (or be uninitialized).
2040
     */
2041
0
    if (ReservedPtr + XLOG_BLCKSZ > XLogCtl->InitializedFrom + XLOG_BLCKSZ * XLOGbuffers)
2042
0
      OldPageRqstPtr = ReservedPtr + XLOG_BLCKSZ - (XLogRecPtr) XLOG_BLCKSZ * XLOGbuffers;
2043
0
    else
2044
0
      OldPageRqstPtr = InvalidXLogRecPtr;
2045
2046
0
    if (LogwrtResult.Write < OldPageRqstPtr && opportunistic)
2047
0
    {
2048
      /*
2049
       * If we just want to pre-initialize as much as we can without
2050
       * flushing, give up now.
2051
       */
2052
0
      upto = ReservedPtr - 1;
2053
0
      break;
2054
0
    }
2055
2056
    /*
2057
     * Attempt to reserve the page for initialization.  Failure means that
2058
     * this page got reserved by another process.
2059
     */
2060
0
    if (!pg_atomic_compare_exchange_u64(&XLogCtl->InitializeReserved,
2061
0
                      &ReservedPtr,
2062
0
                      ReservedPtr + XLOG_BLCKSZ))
2063
0
      continue;
2064
2065
    /*
2066
     * Wait until the page is correctly initialized up to OldPageRqstPtr.
2067
     */
2068
0
    nextidx = XLogRecPtrToBufIdx(ReservedPtr);
2069
0
    while (pg_atomic_read_u64(&XLogCtl->InitializedUpTo) < OldPageRqstPtr)
2070
0
      ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT);
2071
0
    ConditionVariableCancelSleep();
2072
0
    Assert(pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) == OldPageRqstPtr);
2073
2074
    /* Fall through if it's already written out. */
2075
0
    if (LogwrtResult.Write < OldPageRqstPtr)
2076
0
    {
2077
      /* Nope, got work to do. */
2078
2079
      /* Advance shared memory write request position */
2080
0
      SpinLockAcquire(&XLogCtl->info_lck);
2081
0
      if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
2082
0
        XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
2083
0
      SpinLockRelease(&XLogCtl->info_lck);
2084
2085
      /*
2086
       * Acquire an up-to-date LogwrtResult value and see if we still
2087
       * need to write it or if someone else already did.
2088
       */
2089
0
      RefreshXLogWriteResult(LogwrtResult);
2090
0
      if (LogwrtResult.Write < OldPageRqstPtr)
2091
0
      {
2092
0
        WaitXLogInsertionsToFinish(OldPageRqstPtr);
2093
2094
0
        LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2095
2096
0
        RefreshXLogWriteResult(LogwrtResult);
2097
0
        if (LogwrtResult.Write >= OldPageRqstPtr)
2098
0
        {
2099
          /* OK, someone wrote it already */
2100
0
          LWLockRelease(WALWriteLock);
2101
0
        }
2102
0
        else
2103
0
        {
2104
          /* Have to write it ourselves */
2105
0
          TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2106
0
          WriteRqst.Write = OldPageRqstPtr;
2107
0
          WriteRqst.Flush = 0;
2108
0
          XLogWrite(WriteRqst, tli, false);
2109
0
          LWLockRelease(WALWriteLock);
2110
0
          pgWalUsage.wal_buffers_full++;
2111
0
          TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2112
0
        }
2113
0
      }
2114
0
    }
2115
2116
    /*
2117
     * Now the next buffer slot is free and we can set it up to be the
2118
     * next output page.
2119
     */
2120
0
    NewPageBeginPtr = ReservedPtr;
2121
0
    NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2122
2123
0
    NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2124
2125
    /*
2126
     * Mark the xlblock with InvalidXLogRecPtr and issue a write barrier
2127
     * before initializing. Otherwise, the old page may be partially
2128
     * zeroed but look valid.
2129
     */
2130
0
    pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], InvalidXLogRecPtr);
2131
0
    pg_write_barrier();
2132
2133
    /*
2134
     * Be sure to re-zero the buffer so that bytes beyond what we've
2135
     * written will look like zeroes and not valid XLOG records...
2136
     */
2137
0
    MemSet(NewPage, 0, XLOG_BLCKSZ);
2138
2139
    /*
2140
     * Fill the new page's header
2141
     */
2142
0
    NewPage->xlp_magic = XLOG_PAGE_MAGIC;
2143
2144
    /* NewPage->xlp_info = 0; */  /* done by memset */
2145
0
    NewPage->xlp_tli = tli;
2146
0
    NewPage->xlp_pageaddr = NewPageBeginPtr;
2147
2148
    /* NewPage->xlp_rem_len = 0; */ /* done by memset */
2149
2150
    /*
2151
     * If online backup is not in progress, mark the header to indicate
2152
     * that WAL records beginning in this page have removable backup
2153
     * blocks.  This allows the WAL archiver to know whether it is safe to
2154
     * compress archived WAL data by transforming full-block records into
2155
     * the non-full-block format.  It is sufficient to record this at the
2156
     * page level because we force a page switch (in fact a segment
2157
     * switch) when starting a backup, so the flag will be off before any
2158
     * records can be written during the backup.  At the end of a backup,
2159
     * the last page will be marked as all unsafe when perhaps only part
2160
     * is unsafe, but at worst the archiver would miss the opportunity to
2161
     * compress a few records.
2162
     */
2163
0
    if (Insert->runningBackups == 0)
2164
0
      NewPage->xlp_info |= XLP_BKP_REMOVABLE;
2165
2166
    /*
2167
     * If first page of an XLOG segment file, make it a long header.
2168
     */
2169
0
    if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
2170
0
    {
2171
0
      XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2172
2173
0
      NewLongPage->xlp_sysid = ControlFile->system_identifier;
2174
0
      NewLongPage->xlp_seg_size = wal_segment_size;
2175
0
      NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2176
0
      NewPage->xlp_info |= XLP_LONG_HEADER;
2177
0
    }
2178
2179
    /*
2180
     * Make sure the initialization of the page becomes visible to others
2181
     * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2182
     * holding a lock.
2183
     */
2184
0
    pg_write_barrier();
2185
2186
    /*-----
2187
     * Update the value of XLogCtl->xlblocks[nextidx] and try to advance
2188
     * XLogCtl->InitializedUpTo in a lock-less manner.
2189
     *
2190
     * First, let's provide a formal proof of the algorithm.  Let there be
2191
     * 'n' processes with the following variables in shared memory:
2192
     *  f - an array of 'n' boolean flags,
2193
     *  v - atomic integer variable.
2194
     *
2195
     * Also, let
2196
     *  i - a number of a process,
2197
     *  j - local integer variable,
2198
     * CAS(var, oldval, newval) - compare-and-swap atomic operation
2199
     *                returning true on success,
2200
     * write_barrier()/read_barrier() - memory barriers.
2201
     *
2202
     * The pseudocode for each process is the following.
2203
     *
2204
     *  j := i
2205
     *  f[i] := true
2206
     *  write_barrier()
2207
     *  while CAS(v, j, j + 1):
2208
     *    j := j + 1
2209
     *    read_barrier()
2210
     *    if not f[j]:
2211
     *      break
2212
     *
2213
     * Let's prove that v eventually reaches the value of n.
2214
     * 1. Prove by contradiction.  Assume v doesn't reach n and gets stuck
2215
     *    at k, where k < n.
2216
     * 2. Process k attempts CAS(v, k, k + 1).  If, as we assumed, v
2217
     *    gets stuck at k, then this CAS operation must fail.  Therefore,
2218
     *    v < k when process k attempts CAS(v, k, k + 1).
2219
     * 3. If, as we assumed, v gets stuck at k, then the value k of v
2220
     *    must be achieved by some process m, where m < k.  The process
2221
     *    m must observe f[k] == false.  Otherwise, it will later attempt
2222
     *    CAS(v, k, k + 1) with success.
2223
     * 4. Therefore, the corresponding read_barrier() (while j == k) on
2224
     *    process m was reached before the write_barrier() of process k.  But then
2225
     *    process k attempts CAS(v, k, k + 1) after process m successfully
2226
     *    incremented v to k, and that CAS operation must succeed.
2227
     *    That leads to a contradiction.  So, there is no such k (k < n)
2228
     *    where v gets stuck.  Q.E.D.
2229
     *
2230
     * To apply this proof to the code below, we assume
2231
     * XLogCtl->InitializedUpTo will play the role of v with XLOG_BLCKSZ
2232
     * granularity.  We also assume setting XLogCtl->xlblocks[nextidx] to
2233
     * NewPageEndPtr to play the role of setting f[i] to true.  Also, note
2234
     * that processes can't concurrently map different xlog locations to
2235
     * the same nextidx because we previously requested that
2236
     * XLogCtl->InitializedUpTo >= OldPageRqstPtr.  So, an xlog buffer can
2237
     * be taken for initialization only once the previous initialization
2238
     * takes effect on XLogCtl->InitializedUpTo.
2239
     */
2240
2241
0
    pg_atomic_write_u64(&XLogCtl->xlblocks[nextidx], NewPageEndPtr);
2242
2243
0
    pg_write_barrier();
2244
2245
0
    while (pg_atomic_compare_exchange_u64(&XLogCtl->InitializedUpTo, &NewPageBeginPtr, NewPageEndPtr))
2246
0
    {
2247
0
      NewPageBeginPtr = NewPageEndPtr;
2248
0
      NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2249
0
      nextidx = XLogRecPtrToBufIdx(NewPageBeginPtr);
2250
2251
0
      pg_read_barrier();
2252
2253
0
      if (pg_atomic_read_u64(&XLogCtl->xlblocks[nextidx]) != NewPageEndPtr)
2254
0
      {
2255
        /*
2256
         * Page at nextidx wasn't initialized yet, so we can't move
2257
         * InitializedUpTo further. It will be moved by the backend that
2258
         * initializes nextidx.
2259
         */
2260
0
        ConditionVariableBroadcast(&XLogCtl->InitializedUpToCondVar);
2261
0
        break;
2262
0
      }
2263
0
    }
2264
2265
0
    npages++;
2266
0
  }
2267
2268
0
  END_CRIT_SECTION();
2269
2270
  /*
2271
   * All the pages in WAL buffer before 'upto' were reserved for
2272
   * initialization.  However, some pages might be reserved by concurrent
2273
   * processes.  Wait till they finish initialization.
2274
   */
2275
0
  while (upto >= pg_atomic_read_u64(&XLogCtl->InitializedUpTo))
2276
0
    ConditionVariableSleep(&XLogCtl->InitializedUpToCondVar, WAIT_EVENT_WAL_BUFFER_INIT);
2277
0
  ConditionVariableCancelSleep();
2278
2279
0
  pg_read_barrier();
2280
2281
#ifdef WAL_DEBUG
2282
  if (XLOG_DEBUG && npages > 0)
2283
  {
2284
    elog(DEBUG1, "initialized %d pages, up to %X/%X",
2285
       npages, LSN_FORMAT_ARGS(NewPageEndPtr));
2286
  }
2287
#endif
2288
0
}
2289
2290
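
The pseudocode in the proof above translates almost mechanically into C11 atomics. A self-contained sketch (NPROC, f, v, and publish_slot are the proof's names, not xlog.c's; in the real code xlblocks plays the role of f and InitializedUpTo the role of v, with XLOG_BLCKSZ granularity):

#include <stdatomic.h>
#include <stdbool.h>

#define NPROC 8

static atomic_bool f[NPROC];            /* f[i]: slot i initialized */
static atomic_uint v;                   /* count of consecutive ready slots */

/* Run by process i once it has initialized slot i; per the proof, the
 * processes collectively drive v to NPROC. */
static void
publish_slot(unsigned i)
{
    unsigned    j = i;

    atomic_store(&f[i], true);                      /* f[i] := true */
    atomic_thread_fence(memory_order_release);      /* write_barrier() */

    while (atomic_compare_exchange_strong(&v, &j, j + 1))
    {
        j = j + 1;
        atomic_thread_fence(memory_order_acquire);  /* read_barrier() */
        if (j >= NPROC || !atomic_load(&f[j]))
            break;
    }
}
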
/*
2291
 * Calculate CheckPointSegments based on max_wal_size_mb and
2292
 * checkpoint_completion_target.
2293
 */
2294
static void
2295
CalculateCheckpointSegments(void)
2296
0
{
2297
0
  double    target;
2298
2299
  /*-------
2300
   * Calculate the distance at which to trigger a checkpoint, to avoid
2301
   * exceeding max_wal_size_mb. This is based on two assumptions:
2302
   *
2303
   * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
2304
   *    WAL for two checkpoint cycles to allow us to recover from the
2305
   *    secondary checkpoint if the first checkpoint failed, though we
2306
   *    only did this on the primary anyway, not on standby. Keeping just
2307
   *    one checkpoint simplifies processing and reduces disk space in
2308
   *    many smaller databases.)
2309
   * b) during checkpoint, we consume checkpoint_completion_target *
2310
   *    number of segments consumed between checkpoints.
2311
   *-------
2312
   */
2313
0
  target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
2314
0
    (1.0 + CheckPointCompletionTarget);
2315
2316
  /* round down */
2317
0
  CheckPointSegments = (int) target;
2318
2319
0
  if (CheckPointSegments < 1)
2320
0
    CheckPointSegments = 1;
2321
0
}
2322
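
With the default max_wal_size of 1 GB, 16 MB segments, and checkpoint_completion_target = 0.9, this works out to 64 / 1.9 = 33.68, rounded down to 33 segments. A quick sketch of the same formula (the constants are illustrative defaults):

#include <stdio.h>

int
main(void)
{
    int     max_wal_size_mb = 1024;     /* default max_wal_size: 1 GB */
    int     seg_mb = 16;                /* default wal_segment_size: 16 MB */
    double  completion_target = 0.9;    /* default checkpoint_completion_target */
    double  target;
    int     checkpoint_segments;

    target = (double) (max_wal_size_mb / seg_mb) / (1.0 + completion_target);
    checkpoint_segments = (int) target; /* round down, minimum 1 */
    if (checkpoint_segments < 1)
        checkpoint_segments = 1;

    printf("CheckPointSegments = %d\n", checkpoint_segments);  /* prints 33 */
    return 0;
}
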
2323
void
2324
assign_max_wal_size(int newval, void *extra)
2325
0
{
2326
0
  max_wal_size_mb = newval;
2327
0
  CalculateCheckpointSegments();
2328
0
}
2329
2330
void
2331
assign_checkpoint_completion_target(double newval, void *extra)
2332
0
{
2333
0
  CheckPointCompletionTarget = newval;
2334
0
  CalculateCheckpointSegments();
2335
0
}
2336
2337
bool
2338
check_wal_segment_size(int *newval, void **extra, GucSource source)
2339
0
{
2340
0
  if (!IsValidWalSegSize(*newval))
2341
0
  {
2342
0
    GUC_check_errdetail("The WAL segment size must be a power of two between 1 MB and 1 GB.");
2343
0
    return false;
2344
0
  }
2345
2346
0
  return true;
2347
0
}
2348
2349
/*
2350
 * GUC check_hook for max_slot_wal_keep_size
2351
 *
2352
 * We don't allow any value of max_slot_wal_keep_size other than -1 during
2353
 * binary upgrade. See start_postmaster() in pg_upgrade for more details.
2354
 */
2355
bool
2356
check_max_slot_wal_keep_size(int *newval, void **extra, GucSource source)
2357
0
{
2358
0
  if (IsBinaryUpgrade && *newval != -1)
2359
0
  {
2360
0
    GUC_check_errdetail("\"%s\" must be set to -1 during binary upgrade mode.",
2361
0
              "max_slot_wal_keep_size");
2362
0
    return false;
2363
0
  }
2364
2365
0
  return true;
2366
0
}
2367
2368
/*
2369
 * At a checkpoint, how many WAL segments to recycle as preallocated future
2370
 * XLOG segments? Returns the highest segment that should be preallocated.
2371
 */
2372
static XLogSegNo
2373
XLOGfileslop(XLogRecPtr lastredoptr)
2374
0
{
2375
0
  XLogSegNo minSegNo;
2376
0
  XLogSegNo maxSegNo;
2377
0
  double    distance;
2378
0
  XLogSegNo recycleSegNo;
2379
2380
  /*
2381
   * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
2382
   * correspond to. Always recycle enough segments to meet the minimum, and
2383
   * remove enough segments to stay below the maximum.
2384
   */
2385
0
  minSegNo = lastredoptr / wal_segment_size +
2386
0
    ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
2387
0
  maxSegNo = lastredoptr / wal_segment_size +
2388
0
    ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
2389
2390
  /*
2391
   * Between those limits, recycle enough segments to get us through to the
2392
   * estimated end of next checkpoint.
2393
   *
2394
   * To estimate where the next checkpoint will finish, assume that the
2395
   * system runs steadily consuming CheckPointDistanceEstimate bytes between
2396
   * every checkpoint.
2397
   */
2398
0
  distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2399
  /* add 10% for good measure. */
2400
0
  distance *= 1.10;
2401
2402
0
  recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
2403
0
                  wal_segment_size);
2404
2405
0
  if (recycleSegNo < minSegNo)
2406
0
    recycleSegNo = minSegNo;
2407
0
  if (recycleSegNo > maxSegNo)
2408
0
    recycleSegNo = maxSegNo;
2409
2410
0
  return recycleSegNo;
2411
0
}
2412
2413
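
Plugging in numbers: with 16 MB segments, a redo pointer at segment 100, a distance estimate of 64 segments (~1 GB) between checkpoints, and checkpoint_completion_target = 0.9, the horizon is 1.9 * 64 * 1.10 = 133.76 segments, so recycling runs through segment ceil(233.76) = 234, before clamping to the min/max bounds. A sketch of just that estimate (all inputs are illustrative):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    double      seg = 16.0 * 1024 * 1024;   /* 16 MB segments */
    double      lastredo = 100.0 * seg;     /* pretend redo pointer */
    double      estimate = 64.0 * seg;      /* CheckPointDistanceEstimate */
    double      target = 0.9;               /* checkpoint_completion_target */
    double      distance = (1.0 + target) * estimate * 1.10;
    uint64_t    recycle = (uint64_t) ceil((lastredo + distance) / seg);

    printf("preallocate through segment %llu\n",
           (unsigned long long) recycle);   /* prints 234 */
    return 0;
}
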
/*
2414
 * Check whether we've consumed enough xlog space that a checkpoint is needed.
2415
 *
2416
 * new_segno indicates a log file that has just been filled up (or read
2417
 * during recovery). We measure the distance from RedoRecPtr to new_segno
2418
 * and see if that exceeds CheckPointSegments.
2419
 *
2420
 * Note: it is the caller's responsibility that RedoRecPtr is up-to-date.
2421
 */
2422
bool
2423
XLogCheckpointNeeded(XLogSegNo new_segno)
2424
0
{
2425
0
  XLogSegNo old_segno;
2426
2427
0
  XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
2428
2429
0
  if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2430
0
    return true;
2431
0
  return false;
2432
0
}
2433
2434
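
Concretely, with CheckPointSegments = 3 and RedoRecPtr in segment 10, filling segment 11 is not yet enough, but filling segment 12 (>= 10 + 3 - 1) is. The test in isolation (checkpoint_needed is a hypothetical restatement):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool
checkpoint_needed(uint64_t new_segno, uint64_t redo_segno, int ckpt_segments)
{
    return new_segno >= redo_segno + (uint64_t) (ckpt_segments - 1);
}

int
main(void)
{
    assert(!checkpoint_needed(11, 10, 3));
    assert(checkpoint_needed(12, 10, 3));
    return 0;
}
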
/*
2435
 * Write and/or fsync the log at least as far as WriteRqst indicates.
2436
 *
2437
 * If flexible == true, we don't have to write as far as WriteRqst, but
2438
 * may stop at any convenient boundary (such as a cache or logfile boundary).
2439
 * This option allows us to avoid uselessly issuing multiple writes when a
2440
 * single one would do.
2441
 *
2442
 * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2443
 * must be called before grabbing the lock, to make sure the data is ready to
2444
 * write.
2445
 */
2446
static void
2447
XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible)
2448
0
{
2449
0
  bool    ispartialpage;
2450
0
  bool    last_iteration;
2451
0
  bool    finishing_seg;
2452
0
  int     curridx;
2453
0
  int     npages;
2454
0
  int     startidx;
2455
0
  uint32    startoffset;
2456
2457
  /* We should always be inside a critical section here */
2458
0
  Assert(CritSectionCount > 0);
2459
2460
  /*
2461
   * Update local LogwrtResult (caller probably did this already, but...)
2462
   */
2463
0
  RefreshXLogWriteResult(LogwrtResult);
2464
2465
  /*
2466
   * Since successive pages in the xlog cache are consecutively allocated,
2467
   * we can usually gather multiple pages together and issue just one
2468
   * write() call.  npages is the number of pages we have determined can be
2469
   * written together; startidx is the cache block index of the first one,
2470
   * and startoffset is the file offset at which it should go. The latter
2471
   * two variables are only valid when npages > 0, but we must initialize
2472
   * all of them to keep the compiler quiet.
2473
   */
2474
0
  npages = 0;
2475
0
  startidx = 0;
2476
0
  startoffset = 0;
2477
2478
  /*
2479
   * Within the loop, curridx is the cache block index of the page to
2480
   * consider writing.  Begin at the buffer containing the next unwritten
2481
   * page, or last partially written page.
2482
   */
2483
0
  curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2484
2485
0
  while (LogwrtResult.Write < WriteRqst.Write)
2486
0
  {
2487
    /*
2488
     * Make sure we're not ahead of the insert process.  This could happen
2489
     * if we're passed a bogus WriteRqst.Write that is past the end of the
2490
     * last page that's been initialized by AdvanceXLInsertBuffer.
2491
     */
2492
0
    XLogRecPtr  EndPtr = pg_atomic_read_u64(&XLogCtl->xlblocks[curridx]);
2493
2494
0
    if (LogwrtResult.Write >= EndPtr)
2495
0
      elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2496
0
         LSN_FORMAT_ARGS(LogwrtResult.Write),
2497
0
         LSN_FORMAT_ARGS(EndPtr));
2498
2499
    /* Advance LogwrtResult.Write to end of current buffer page */
2500
0
    LogwrtResult.Write = EndPtr;
2501
0
    ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2502
2503
0
    if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2504
0
               wal_segment_size))
2505
0
    {
2506
      /*
2507
       * Switch to new logfile segment.  We cannot have any pending
2508
       * pages here (since we dump what we have at segment end).
2509
       */
2510
0
      Assert(npages == 0);
2511
0
      if (openLogFile >= 0)
2512
0
        XLogFileClose();
2513
0
      XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2514
0
              wal_segment_size);
2515
0
      openLogTLI = tli;
2516
2517
      /* create/use new log file */
2518
0
      openLogFile = XLogFileInit(openLogSegNo, tli);
2519
0
      ReserveExternalFD();
2520
0
    }
2521
2522
    /* Make sure we have the current logfile open */
2523
0
    if (openLogFile < 0)
2524
0
    {
2525
0
      XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2526
0
              wal_segment_size);
2527
0
      openLogTLI = tli;
2528
0
      openLogFile = XLogFileOpen(openLogSegNo, tli);
2529
0
      ReserveExternalFD();
2530
0
    }
2531
2532
    /* Add current page to the set of pending pages-to-dump */
2533
0
    if (npages == 0)
2534
0
    {
2535
      /* first of group */
2536
0
      startidx = curridx;
2537
0
      startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
2538
0
                      wal_segment_size);
2539
0
    }
2540
0
    npages++;
2541
2542
    /*
2543
     * Dump the set if this will be the last loop iteration, or if we are
2544
     * at the last page of the cache area (since the next page won't be
2545
     * contiguous in memory), or if we are at the end of the logfile
2546
     * segment.
2547
     */
2548
0
    last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2549
2550
0
    finishing_seg = !ispartialpage &&
2551
0
      (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
2552
2553
0
    if (last_iteration ||
2554
0
      curridx == XLogCtl->XLogCacheBlck ||
2555
0
      finishing_seg)
2556
0
    {
2557
0
      char     *from;
2558
0
      Size    nbytes;
2559
0
      Size    nleft;
2560
0
      ssize_t   written;
2561
0
      instr_time  start;
2562
2563
      /* OK to write the page(s) */
2564
0
      from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2565
0
      nbytes = npages * (Size) XLOG_BLCKSZ;
2566
0
      nleft = nbytes;
2567
0
      do
2568
0
      {
2569
0
        errno = 0;
2570
2571
        /*
2572
         * Measure I/O timing to write WAL data, for pg_stat_io.
2573
         */
2574
0
        start = pgstat_prepare_io_time(track_wal_io_timing);
2575
2576
0
        pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
2577
0
        written = pg_pwrite(openLogFile, from, nleft, startoffset);
2578
0
        pgstat_report_wait_end();
2579
2580
0
        pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL,
2581
0
                    IOOP_WRITE, start, 1, written);
2582
2583
0
        if (written <= 0)
2584
0
        {
2585
0
          char    xlogfname[MAXFNAMELEN];
2586
0
          int     save_errno;
2587
2588
0
          if (errno == EINTR)
2589
0
            continue;
2590
2591
0
          save_errno = errno;
2592
0
          XLogFileName(xlogfname, tli, openLogSegNo,
2593
0
                 wal_segment_size);
2594
0
          errno = save_errno;
2595
0
          ereport(PANIC,
2596
0
              (errcode_for_file_access(),
2597
0
               errmsg("could not write to log file \"%s\" at offset %u, length %zu: %m",
2598
0
                  xlogfname, startoffset, nleft)));
2599
0
        }
2600
0
        nleft -= written;
2601
0
        from += written;
2602
0
        startoffset += written;
2603
0
      } while (nleft > 0);
2604
2605
0
      npages = 0;
2606
2607
      /*
2608
       * If we just wrote the whole last page of a logfile segment,
2609
       * fsync the segment immediately.  This avoids having to go back
2610
       * and re-open prior segments when an fsync request comes along
2611
       * later. Doing it here ensures that one and only one backend will
2612
       * perform this fsync.
2613
       *
2614
       * This is also the right place to notify the Archiver that the
2615
       * segment is ready to copy to archival storage, and to update the
2616
       * timer for archive_timeout, and to signal for a checkpoint if
2617
       * too many logfile segments have been used since the last
2618
       * checkpoint.
2619
       */
2620
0
      if (finishing_seg)
2621
0
      {
2622
0
        issue_xlog_fsync(openLogFile, openLogSegNo, tli);
2623
2624
        /* signal that we need to wakeup walsenders later */
2625
0
        WalSndWakeupRequest();
2626
2627
0
        LogwrtResult.Flush = LogwrtResult.Write;  /* end of page */
2628
2629
0
        if (XLogArchivingActive())
2630
0
          XLogArchiveNotifySeg(openLogSegNo, tli);
2631
2632
0
        XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2633
0
        XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
2634
2635
        /*
2636
         * Request a checkpoint if we've consumed too much xlog since
2637
         * the last one.  For speed, we first check using the local
2638
         * copy of RedoRecPtr, which might be out of date; if it looks
2639
         * like a checkpoint is needed, forcibly update RedoRecPtr and
2640
         * recheck.
2641
         */
2642
0
        if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2643
0
        {
2644
0
          (void) GetRedoRecPtr();
2645
0
          if (XLogCheckpointNeeded(openLogSegNo))
2646
0
            RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2647
0
        }
2648
0
      }
2649
0
    }
2650
2651
0
    if (ispartialpage)
2652
0
    {
2653
      /* Only asked to write a partial page */
2654
0
      LogwrtResult.Write = WriteRqst.Write;
2655
0
      break;
2656
0
    }
2657
0
    curridx = NextBufIdx(curridx);
2658
2659
    /* If flexible, break out of loop as soon as we wrote something */
2660
0
    if (flexible && npages == 0)
2661
0
      break;
2662
0
  }
2663
2664
0
  Assert(npages == 0);
2665
2666
  /*
2667
   * If asked to flush, do so
2668
   */
2669
0
  if (LogwrtResult.Flush < WriteRqst.Flush &&
2670
0
    LogwrtResult.Flush < LogwrtResult.Write)
2671
0
  {
2672
    /*
2673
     * Could get here without iterating the above loop, in which case we might
2674
     * have no open file or the wrong one.  However, we do not need to
2675
     * fsync more than one file.
2676
     */
2677
0
    if (wal_sync_method != WAL_SYNC_METHOD_OPEN &&
2678
0
      wal_sync_method != WAL_SYNC_METHOD_OPEN_DSYNC)
2679
0
    {
2680
0
      if (openLogFile >= 0 &&
2681
0
        !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2682
0
                 wal_segment_size))
2683
0
        XLogFileClose();
2684
0
      if (openLogFile < 0)
2685
0
      {
2686
0
        XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2687
0
                wal_segment_size);
2688
0
        openLogTLI = tli;
2689
0
        openLogFile = XLogFileOpen(openLogSegNo, tli);
2690
0
        ReserveExternalFD();
2691
0
      }
2692
2693
0
      issue_xlog_fsync(openLogFile, openLogSegNo, tli);
2694
0
    }
2695
2696
    /* signal that we need to wakeup walsenders later */
2697
0
    WalSndWakeupRequest();
2698
2699
0
    LogwrtResult.Flush = LogwrtResult.Write;
2700
0
  }
2701
2702
  /*
2703
   * Update shared-memory status
2704
   *
2705
   * We make sure that the shared 'request' values do not fall behind the
2706
   * 'result' values.  This is not absolutely essential, but it saves some
2707
   * code in a couple of places.
2708
   */
2709
0
  SpinLockAcquire(&XLogCtl->info_lck);
2710
0
  if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
2711
0
    XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
2712
0
  if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
2713
0
    XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
2714
0
  SpinLockRelease(&XLogCtl->info_lck);
2715
2716
  /*
2717
   * We write Write first, a barrier, then Flush.  When reading, the opposite must
2718
   * be done (with a matching barrier in between), so that we always see a
2719
   * Flush value that trails behind the Write value seen.
2720
   */
2721
0
  pg_atomic_write_u64(&XLogCtl->logWriteResult, LogwrtResult.Write);
2722
0
  pg_write_barrier();
2723
0
  pg_atomic_write_u64(&XLogCtl->logFlushResult, LogwrtResult.Flush);
2724
2725
#ifdef USE_ASSERT_CHECKING
2726
  {
2727
    XLogRecPtr  Flush;
2728
    XLogRecPtr  Write;
2729
    XLogRecPtr  Insert;
2730
2731
    Flush = pg_atomic_read_u64(&XLogCtl->logFlushResult);
2732
    pg_read_barrier();
2733
    Write = pg_atomic_read_u64(&XLogCtl->logWriteResult);
2734
    pg_read_barrier();
2735
    Insert = pg_atomic_read_u64(&XLogCtl->logInsertResult);
2736
2737
    /* WAL written to disk is always ahead of WAL flushed */
2738
    Assert(Write >= Flush);
2739
2740
    /* WAL inserted to buffers is always ahead of WAL written */
2741
    Assert(Insert >= Write);
2742
  }
2743
#endif
2744
0
}
2745
2746
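
The write-side ordering at the end of XLogWrite pairs with readers that load in the opposite order, so a reader can never observe a Flush position ahead of the matching Write position. A self-contained sketch of the pattern with C11 atomics (publish and refresh are hypothetical names; the fences stand in for pg_write_barrier()/pg_read_barrier()):

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t write_result;   /* stands in for logWriteResult */
static _Atomic uint64_t flush_result;   /* stands in for logFlushResult */

static void
publish(uint64_t written, uint64_t flushed)     /* writer side */
{
    atomic_store_explicit(&write_result, written, memory_order_relaxed);
    atomic_thread_fence(memory_order_release);  /* pg_write_barrier() */
    atomic_store_explicit(&flush_result, flushed, memory_order_relaxed);
}

static void
refresh(uint64_t *written, uint64_t *flushed)   /* reader side */
{
    /* Load in the opposite order, with a matching barrier in between. */
    *flushed = atomic_load_explicit(&flush_result, memory_order_relaxed);
    atomic_thread_fence(memory_order_acquire);  /* pg_read_barrier() */
    *written = atomic_load_explicit(&write_result, memory_order_relaxed);
    assert(*written >= *flushed);       /* Flush never leads Write */
}
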
/*
2747
 * Record the LSN for an asynchronous transaction commit/abort
2748
 * and nudge the WALWriter if there is work for it to do.
2749
 * (This should not be called for synchronous commits.)
2750
 */
2751
void
2752
XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2753
0
{
2754
0
  XLogRecPtr  WriteRqstPtr = asyncXactLSN;
2755
0
  bool    sleeping;
2756
0
  bool    wakeup = false;
2757
0
  XLogRecPtr  prevAsyncXactLSN;
2758
2759
0
  SpinLockAcquire(&XLogCtl->info_lck);
2760
0
  sleeping = XLogCtl->WalWriterSleeping;
2761
0
  prevAsyncXactLSN = XLogCtl->asyncXactLSN;
2762
0
  if (XLogCtl->asyncXactLSN < asyncXactLSN)
2763
0
    XLogCtl->asyncXactLSN = asyncXactLSN;
2764
0
  SpinLockRelease(&XLogCtl->info_lck);
2765
2766
  /*
2767
   * If somebody else already called this function with a more aggressive
2768
   * LSN, they will have done what we needed (and perhaps more).
2769
   */
2770
0
  if (asyncXactLSN <= prevAsyncXactLSN)
2771
0
    return;
2772
2773
  /*
2774
   * If the WALWriter is sleeping, kick it to make it come out of low-power
2775
   * mode, so that this async commit will reach disk within the expected
2776
   * amount of time.  Otherwise, determine whether it has enough WAL
2777
   * available to flush, the same way that XLogBackgroundFlush() does.
2778
   */
2779
0
  if (sleeping)
2780
0
    wakeup = true;
2781
0
  else
2782
0
  {
2783
0
    int     flushblocks;
2784
2785
0
    RefreshXLogWriteResult(LogwrtResult);
2786
2787
0
    flushblocks =
2788
0
      WriteRqstPtr / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
2789
2790
0
    if (WalWriterFlushAfter == 0 || flushblocks >= WalWriterFlushAfter)
2791
0
      wakeup = true;
2792
0
  }
2793
2794
0
  if (wakeup)
2795
0
  {
2796
0
    volatile PROC_HDR *procglobal = ProcGlobal;
2797
0
    ProcNumber  walwriterProc = procglobal->walwriterProc;
2798
2799
0
    if (walwriterProc != INVALID_PROC_NUMBER)
2800
0
      SetLatch(&GetPGProcByNumber(walwriterProc)->procLatch);
2801
0
  }
2802
0
}
2803
2804
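
In isolation, the wakeup decision above is: wake if the writer sleeps, or if batching is disabled (WalWriterFlushAfter == 0), or if at least that many pages separate the request from what is already flushed. A sketch with illustrative values (the default wal_writer_flush_after is 1 MB, i.e. 128 pages of 8 KB):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define BLCKSZ 8192

static bool
should_wake(bool sleeping, uint64_t rqst, uint64_t flushed, int flush_after)
{
    int     flushblocks = (int) (rqst / BLCKSZ - flushed / BLCKSZ);

    return sleeping || flush_after == 0 || flushblocks >= flush_after;
}

int
main(void)
{
    assert(should_wake(true, 0, 0, 128));               /* sleeping: kick */
    assert(!should_wake(false, 10 * BLCKSZ, 0, 128));   /* too little WAL */
    assert(should_wake(false, 200 * BLCKSZ, 0, 128));   /* enough pages */
    return 0;
}
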
/*
2805
 * Record the LSN up to which we can remove WAL because it's not required by
2806
 * any replication slot.
2807
 */
2808
void
2809
XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2810
0
{
2811
0
  SpinLockAcquire(&XLogCtl->info_lck);
2812
0
  XLogCtl->replicationSlotMinLSN = lsn;
2813
0
  SpinLockRelease(&XLogCtl->info_lck);
2814
0
}
2815
2816
2817
/*
2818
 * Return the oldest LSN we must retain to satisfy the needs of some
2819
 * replication slot.
2820
 */
2821
static XLogRecPtr
2822
XLogGetReplicationSlotMinimumLSN(void)
2823
0
{
2824
0
  XLogRecPtr  retval;
2825
2826
0
  SpinLockAcquire(&XLogCtl->info_lck);
2827
0
  retval = XLogCtl->replicationSlotMinLSN;
2828
0
  SpinLockRelease(&XLogCtl->info_lck);
2829
2830
0
  return retval;
2831
0
}
2832
2833
/*
2834
 * Advance minRecoveryPoint in control file.
2835
 *
2836
 * If we crash during recovery, we must reach this point again before the
2837
 * database is consistent.
2838
 *
2839
 * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2840
 * is only updated if it's not already greater than or equal to 'lsn'.
2841
 */
2842
static void
2843
UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2844
0
{
2845
  /* Quick check using our local copy of the variable */
2846
0
  if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint))
2847
0
    return;
2848
2849
  /*
2850
   * An invalid minRecoveryPoint means that we need to recover all the WAL,
2851
   * i.e., we're doing crash recovery.  We never modify the control file's
2852
   * value in that case, so we can short-circuit future checks here too. The
2853
   * local values of minRecoveryPoint and minRecoveryPointTLI should not be
2854
   * updated until crash recovery finishes.  We only do this for the startup
2855
   * process as it should not update its own reference of minRecoveryPoint
2856
   * until it has finished crash recovery to make sure that all WAL
2857
   * available is replayed in this case.  This also saves the startup
2859
   * process from taking extra locks on the control file.
2859
   */
2860
0
  if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
2861
0
  {
2862
0
    updateMinRecoveryPoint = false;
2863
0
    return;
2864
0
  }
2865
2866
0
  LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2867
2868
  /* update local copy */
2869
0
  LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
2870
0
  LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2871
2872
0
  if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
2873
0
    updateMinRecoveryPoint = false;
2874
0
  else if (force || LocalMinRecoveryPoint < lsn)
2875
0
  {
2876
0
    XLogRecPtr  newMinRecoveryPoint;
2877
0
    TimeLineID  newMinRecoveryPointTLI;
2878
2879
    /*
2880
     * To avoid having to update the control file too often, we update it
2881
     * all the way to the last record being replayed, even though 'lsn'
2882
     * would suffice for correctness.  This also allows the 'force' case
2883
     * to not need a valid 'lsn' value.
2884
     *
2885
     * Another important reason for doing it this way is that the passed
2886
     * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2887
     * the caller got it from a corrupted heap page.  Accepting such a
2888
     * value as the min recovery point would prevent us from coming up at
2889
     * all.  Instead, we just log a warning and continue with recovery.
2890
     * (See also the comments about corrupt LSNs in XLogFlush.)
2891
     */
2892
0
    newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
2893
0
    if (!force && newMinRecoveryPoint < lsn)
2894
0
      elog(WARNING,
2895
0
         "xlog min recovery request %X/%X is past current point %X/%X",
2896
0
         LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));
2897
2898
    /* update control file */
2899
0
    if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2900
0
    {
2901
0
      ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2902
0
      ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2903
0
      UpdateControlFile();
2904
0
      LocalMinRecoveryPoint = newMinRecoveryPoint;
2905
0
      LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
2906
2907
0
      ereport(DEBUG2,
2908
0
          (errmsg_internal("updated min recovery point to %X/%X on timeline %u",
2909
0
                   LSN_FORMAT_ARGS(newMinRecoveryPoint),
2910
0
                   newMinRecoveryPointTLI)));
2911
0
    }
2912
0
  }
2913
0
  LWLockRelease(ControlFileLock);
2914
0
}
2915
2916
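
UpdateMinRecoveryPoint above reduces to a monotonic, cached update: an unlocked check against a local copy, then a locked re-read and a forward-only write. Here is a single-threaded sketch of that control flow with invented names; the real locking and control-file write are marked in comments.

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t lsn_t;
#define INVALID_LSN 0

static lsn_t shared_min_lsn;    /* models ControlFile->minRecoveryPoint */
static lsn_t local_min_lsn;     /* models LocalMinRecoveryPoint */
static bool  may_update = true; /* models updateMinRecoveryPoint */

static void
advance_min_recovery_point(lsn_t lsn, bool force)
{
    /* cheap check against the local cache; no lock taken */
    if (!may_update || (!force && lsn <= local_min_lsn))
        return;

    /* real code: LWLockAcquire(ControlFileLock, LW_EXCLUSIVE) */
    local_min_lsn = shared_min_lsn;     /* refresh the cache */
    if (local_min_lsn == INVALID_LSN)
        may_update = false;             /* crash recovery: never advance */
    else if (force || local_min_lsn < lsn)
    {
        shared_min_lsn = lsn;           /* forward-only update */
        local_min_lsn = lsn;
        /* real code: UpdateControlFile() */
    }
    /* real code: LWLockRelease(ControlFileLock) */
}
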
/*
2917
 * Ensure that all XLOG data through the given position is flushed to disk.
2918
 *
2919
 * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2920
 * already held, and we try to avoid acquiring it if possible.
2921
 */
2922
void
2923
XLogFlush(XLogRecPtr record)
2924
0
{
2925
0
  XLogRecPtr  WriteRqstPtr;
2926
0
  XLogwrtRqst WriteRqst;
2927
0
  TimeLineID  insertTLI = XLogCtl->InsertTimeLineID;
2928
2929
  /*
2930
   * During REDO, we are reading, not writing, WAL.  Therefore, instead of
2931
   * trying to flush the WAL, we should update minRecoveryPoint instead. We
2932
   * test XLogInsertAllowed(), not InRecovery, because we need the checkpointer
2933
   * to act this way too, and because when it tries to write the
2934
   * end-of-recovery checkpoint, it should indeed flush.
2935
   */
2936
0
  if (!XLogInsertAllowed())
2937
0
  {
2938
0
    UpdateMinRecoveryPoint(record, false);
2939
0
    return;
2940
0
  }
2941
2942
  /* Quick exit if already known flushed */
2943
0
  if (record <= LogwrtResult.Flush)
2944
0
    return;
2945
2946
#ifdef WAL_DEBUG
2947
  if (XLOG_DEBUG)
2948
    elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2949
       LSN_FORMAT_ARGS(record),
2950
       LSN_FORMAT_ARGS(LogwrtResult.Write),
2951
       LSN_FORMAT_ARGS(LogwrtResult.Flush));
2952
#endif
2953
2954
0
  START_CRIT_SECTION();
2955
2956
  /*
2957
   * Since fsync is usually a horribly expensive operation, we try to
2958
   * piggyback as much data as we can on each fsync: if we see any more data
2959
   * entered into the xlog buffer, we'll write and fsync that too, so that
2960
   * the final value of LogwrtResult.Flush is as large as possible. This
2961
   * gives us some chance of avoiding another fsync immediately after.
2962
   */
2963
2964
  /* initialize to given target; may increase below */
2965
0
  WriteRqstPtr = record;
2966
2967
  /*
2968
   * Now wait until we get the write lock, or someone else does the flush
2969
   * for us.
2970
   */
2971
0
  for (;;)
2972
0
  {
2973
0
    XLogRecPtr  insertpos;
2974
2975
    /* done already? */
2976
0
    RefreshXLogWriteResult(LogwrtResult);
2977
0
    if (record <= LogwrtResult.Flush)
2978
0
      break;
2979
2980
    /*
2981
     * Before actually performing the write, wait for all in-flight
2982
     * insertions to the pages we're about to write to finish.
2983
     */
2984
0
    SpinLockAcquire(&XLogCtl->info_lck);
2985
0
    if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
2986
0
      WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2987
0
    SpinLockRelease(&XLogCtl->info_lck);
2988
0
    insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2989
2990
    /*
2991
     * Try to get the write lock. If we can't get it immediately, wait
2992
     * until it's released, and recheck if we still need to do the flush
2993
     * or if the backend that held the lock did it for us already. This
2994
     * helps to maintain a good rate of group committing when the system
2995
     * is bottlenecked by the speed of fsyncing.
2996
     */
2997
0
    if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2998
0
    {
2999
      /*
3000
       * The lock is now free, but we didn't acquire it yet. Before we
3001
       * do, loop back to check if someone else flushed the record for
3002
       * us already.
3003
       */
3004
0
      continue;
3005
0
    }
3006
3007
    /* Got the lock; recheck whether request is satisfied */
3008
0
    RefreshXLogWriteResult(LogwrtResult);
3009
0
    if (record <= LogwrtResult.Flush)
3010
0
    {
3011
0
      LWLockRelease(WALWriteLock);
3012
0
      break;
3013
0
    }
3014
3015
    /*
3016
     * Sleep before flush! By adding a delay here, we may give further
3017
     * backends the opportunity to join the backlog of group commit
3018
     * followers; this can significantly improve transaction throughput,
3019
     * at the risk of increasing transaction latency.
3020
     *
3021
     * We do not sleep if enableFsync is not turned on, nor if there are
3022
     * fewer than CommitSiblings other backends with active transactions.
3023
     */
3024
0
    if (CommitDelay > 0 && enableFsync &&
3025
0
      MinimumActiveBackends(CommitSiblings))
3026
0
    {
3027
0
      pg_usleep(CommitDelay);
3028
3029
      /*
3030
       * Re-check how far we can now flush the WAL. It's generally not
3031
       * safe to call WaitXLogInsertionsToFinish while holding
3032
       * WALWriteLock, because an in-progress insertion might need to
3033
       * also grab WALWriteLock to make progress. But we know that all
3034
       * the insertions up to insertpos have already finished, because
3035
       * that's what the earlier WaitXLogInsertionsToFinish() returned.
3036
       * We're only calling it again to allow insertpos to be moved
3037
       * further forward, not to actually wait for anyone.
3038
       */
3039
0
      insertpos = WaitXLogInsertionsToFinish(insertpos);
3040
0
    }
3041
3042
    /* try to write/flush later additions to XLOG as well */
3043
0
    WriteRqst.Write = insertpos;
3044
0
    WriteRqst.Flush = insertpos;
3045
3046
0
    XLogWrite(WriteRqst, insertTLI, false);
3047
3048
0
    LWLockRelease(WALWriteLock);
3049
    /* done */
3050
0
    break;
3051
0
  }
3052
3053
0
  END_CRIT_SECTION();
3054
3055
  /* wake up walsenders now that we've released heavily contended locks */
3056
0
  WalSndWakeupProcessRequests(true, !RecoveryInProgress());
3057
3058
  /*
3059
   * If we still haven't flushed to the request point then we have a
3060
   * problem; most likely, the requested flush point is past end of XLOG.
3061
   * This has been seen to occur when a disk page has a corrupted LSN.
3062
   *
3063
   * Formerly we treated this as a PANIC condition, but that hurts the
3064
   * system's robustness rather than helping it: we do not want to take down
3065
   * the whole system due to corruption on one data page.  In particular, if
3066
   * the bad page is encountered again during recovery then we would be
3067
   * unable to restart the database at all!  (This scenario actually
3068
   * happened in the field several times with 7.1 releases.)  As of 8.4, bad
3069
   * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
3070
   * the only time we can reach here during recovery is while flushing the
3071
   * end-of-recovery checkpoint record, and we don't expect that to have a
3072
   * bad LSN.
3073
   *
3074
   * Note that for calls from xact.c, the ERROR will be promoted to PANIC
3075
   * since xact.c calls this routine inside a critical section.  However,
3076
   * calls from bufmgr.c are not within critical sections and so we will not
3077
   * force a restart for a bad LSN on a data page.
3078
   */
3079
0
  if (LogwrtResult.Flush < record)
3080
0
    elog(ERROR,
3081
0
       "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
3082
0
       LSN_FORMAT_ARGS(record),
3083
0
       LSN_FORMAT_ARGS(LogwrtResult.Flush));
3084
0
}
3085
3086
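
The control flow of XLogFlush above is easier to see in isolation: a committer either observes that another backend has already flushed past its record, or takes the write lock and flushes for the whole group. Below is a simplified model with a pthread mutex standing in for WALWriteLock; the unsynchronized fast-path read and every name here are sketch-level simplifications, not server code.

#include <pthread.h>
#include <stdint.h>

typedef uint64_t lsn_t;

static pthread_mutex_t wal_write_lock = PTHREAD_MUTEX_INITIALIZER;
static lsn_t flushed_upto;      /* models LogwrtResult.Flush */
static lsn_t inserted_upto;     /* models the current end of inserted WAL */

static void
flush_to(lsn_t upto)
{
    /* the write() + fsync() of the WAL file would go here */
    flushed_upto = upto;
}

static void
group_flush(lsn_t record)
{
    if (record <= flushed_upto)
        return;                         /* fast path: already durable */

    /*
     * The real code uses LWLockAcquireOrWait() and loops, so a backend
     * woken without the lock can recheck whether the previous holder
     * already flushed far enough; a plain mutex collapses that to one step.
     */
    pthread_mutex_lock(&wal_write_lock);
    if (record > flushed_upto)
    {
        /* piggyback: flush all WAL inserted so far, not just our record */
        flush_to(inserted_upto);
    }
    pthread_mutex_unlock(&wal_write_lock);
}
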
/*
3087
 * Write & flush xlog, but without specifying exactly where to.
3088
 *
3089
 * We normally write only completed blocks; but if there is nothing to do on
3090
 * that basis, we check for unwritten async commits in the current incomplete
3091
 * block, and write through the latest one of those.  Thus, if async commits
3092
 * are not being used, we will write complete blocks only.
3093
 *
3094
 * If, based on the above, there's anything to write we do so immediately. But
3095
 * to avoid calling fsync, fdatasync, et al. at a rate that'd impact
3096
 * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
3097
 * more than wal_writer_flush_after unflushed blocks.
3098
 *
3099
 * We can guarantee that async commits reach disk after at most three
3100
 * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
3101
 * to write "flexibly", meaning it can stop at the end of the buffer ring;
3102
 * this makes a difference only with very high load or long wal_writer_delay,
3103
 * but imposes one extra cycle for the worst case for async commits.)
3104
 *
3105
 * This routine is invoked periodically by the background walwriter process.
3106
 *
3107
 * Returns true if there was any work to do, even if we skipped flushing due
3108
 * to wal_writer_delay/wal_writer_flush_after.
3109
 */
3110
bool
3111
XLogBackgroundFlush(void)
3112
0
{
3113
0
  XLogwrtRqst WriteRqst;
3114
0
  bool    flexible = true;
3115
0
  static TimestampTz lastflush;
3116
0
  TimestampTz now;
3117
0
  int     flushblocks;
3118
0
  TimeLineID  insertTLI;
3119
3120
  /* XLOG doesn't need flushing during recovery */
3121
0
  if (RecoveryInProgress())
3122
0
    return false;
3123
3124
  /*
3125
   * Since we're not in recovery, InsertTimeLineID is set and can't change,
3126
   * so we can read it without a lock.
3127
   */
3128
0
  insertTLI = XLogCtl->InsertTimeLineID;
3129
3130
  /* read updated LogwrtRqst */
3131
0
  SpinLockAcquire(&XLogCtl->info_lck);
3132
0
  WriteRqst = XLogCtl->LogwrtRqst;
3133
0
  SpinLockRelease(&XLogCtl->info_lck);
3134
3135
  /* back off to last completed page boundary */
3136
0
  WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
3137
3138
  /* if we have already flushed that far, consider async commit records */
3139
0
  RefreshXLogWriteResult(LogwrtResult);
3140
0
  if (WriteRqst.Write <= LogwrtResult.Flush)
3141
0
  {
3142
0
    SpinLockAcquire(&XLogCtl->info_lck);
3143
0
    WriteRqst.Write = XLogCtl->asyncXactLSN;
3144
0
    SpinLockRelease(&XLogCtl->info_lck);
3145
0
    flexible = false;   /* ensure it all gets written */
3146
0
  }
3147
3148
  /*
3149
   * If already known flushed, we're done. Just need to check if we are
3150
   * holding an open file handle to a logfile that's no longer in use,
3151
   * preventing the file from being deleted.
3152
   */
3153
0
  if (WriteRqst.Write <= LogwrtResult.Flush)
3154
0
  {
3155
0
    if (openLogFile >= 0)
3156
0
    {
3157
0
      if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
3158
0
                 wal_segment_size))
3159
0
      {
3160
0
        XLogFileClose();
3161
0
      }
3162
0
    }
3163
0
    return false;
3164
0
  }
3165
3166
  /*
3167
   * Determine how far to flush WAL, based on the wal_writer_delay and
3168
   * wal_writer_flush_after GUCs.
3169
   *
3170
   * Note that XLogSetAsyncXactLSN() performs similar calculation based on
3171
   * wal_writer_flush_after, to decide when to wake us up.  Make sure the
3172
   * logic is the same in both places if you change this.
3173
   */
3174
0
  now = GetCurrentTimestamp();
3175
0
  flushblocks =
3176
0
    WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
3177
3178
0
  if (WalWriterFlushAfter == 0 || lastflush == 0)
3179
0
  {
3180
    /* first call, or block-based limits disabled */
3181
0
    WriteRqst.Flush = WriteRqst.Write;
3182
0
    lastflush = now;
3183
0
  }
3184
0
  else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
3185
0
  {
3186
    /*
3187
     * Flush the writes at least every WalWriterDelay ms. This is
3188
     * important to bound the amount of time it takes for an asynchronous
3189
     * commit to hit disk.
3190
     */
3191
0
    WriteRqst.Flush = WriteRqst.Write;
3192
0
    lastflush = now;
3193
0
  }
3194
0
  else if (flushblocks >= WalWriterFlushAfter)
3195
0
  {
3196
    /* exceeded wal_writer_flush_after blocks, flush */
3197
0
    WriteRqst.Flush = WriteRqst.Write;
3198
0
    lastflush = now;
3199
0
  }
3200
0
  else
3201
0
  {
3202
    /* no flushing, this time round */
3203
0
    WriteRqst.Flush = 0;
3204
0
  }
3205
3206
#ifdef WAL_DEBUG
3207
  if (XLOG_DEBUG)
3208
    elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
3209
       LSN_FORMAT_ARGS(WriteRqst.Write),
3210
       LSN_FORMAT_ARGS(WriteRqst.Flush),
3211
       LSN_FORMAT_ARGS(LogwrtResult.Write),
3212
       LSN_FORMAT_ARGS(LogwrtResult.Flush));
3213
#endif
3214
3215
0
  START_CRIT_SECTION();
3216
3217
  /* now wait for any in-progress insertions to finish and get write lock */
3218
0
  WaitXLogInsertionsToFinish(WriteRqst.Write);
3219
0
  LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3220
0
  RefreshXLogWriteResult(LogwrtResult);
3221
0
  if (WriteRqst.Write > LogwrtResult.Write ||
3222
0
    WriteRqst.Flush > LogwrtResult.Flush)
3223
0
  {
3224
0
    XLogWrite(WriteRqst, insertTLI, flexible);
3225
0
  }
3226
0
  LWLockRelease(WALWriteLock);
3227
3228
0
  END_CRIT_SECTION();
3229
3230
  /* wake up walsenders now that we've released heavily contended locks */
3231
0
  WalSndWakeupProcessRequests(true, !RecoveryInProgress());
3232
3233
  /*
3234
   * Great, done. To take some work off the critical path, try to initialize
3235
   * as many of the no-longer-needed WAL buffers for future use as we can.
3236
   */
3237
0
  AdvanceXLInsertBuffer(InvalidXLogRecPtr, insertTLI, true);
3238
3239
  /*
3240
   * If we determined that we need to write data, but somebody else
3241
   * wrote/flushed already, the system should still be considered active, to
3242
   * avoid hibernating too early.
3243
   */
3244
0
  return true;
3245
0
}
3246
3247
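
The three-way flush decision in XLogBackgroundFlush above reduces to a pure function of elapsed time and unflushed volume. A sketch assuming the stock defaults (wal_writer_delay = 200ms, wal_writer_flush_after = 1MB, i.e. 128 eight-kB pages); the real code also forces a flush on the very first call.

#include <stdbool.h>
#include <stdint.h>

static int wal_writer_delay_ms = 200;       /* default wal_writer_delay */
static int wal_writer_flush_after = 128;    /* default: 1MB of 8kB pages */

static bool
should_flush_now(int64_t ms_since_last_flush, int unflushed_blocks)
{
    if (wal_writer_flush_after == 0)
        return true;            /* block-based limit disabled: always flush */
    if (ms_since_last_flush >= wal_writer_delay_ms)
        return true;            /* bound async-commit latency */
    if (unflushed_blocks >= wal_writer_flush_after)
        return true;            /* bound the volume of unflushed WAL */
    return false;               /* keep batching this round */
}
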
/*
3248
 * Test whether XLOG data has been flushed up to (at least) the given position.
3249
 *
3250
 * Returns true if a flush is still needed.  (It may be that someone else
3251
 * is already in process of flushing that far, however.)
3252
 */
3253
bool
3254
XLogNeedsFlush(XLogRecPtr record)
3255
0
{
3256
  /*
3257
   * During recovery, we don't flush WAL but update minRecoveryPoint
3258
   * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3259
   * would need to be updated.
3260
   */
3261
0
  if (RecoveryInProgress())
3262
0
  {
3263
    /*
3264
     * An invalid minRecoveryPoint means that we need to recover all the
3265
     * WAL, i.e., we're doing crash recovery.  We never modify the control
3266
     * file's value in that case, so we can short-circuit future checks
3267
     * here too.  This triggers a quick exit path for the startup process,
3268
     * which cannot update its local copy of minRecoveryPoint as long as
3269
     * it has not replayed all WAL available when doing crash recovery.
3270
     */
3271
0
    if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
3272
0
      updateMinRecoveryPoint = false;
3273
3274
    /* Quick exit if already known to be updated or cannot be updated */
3275
0
    if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
3276
0
      return false;
3277
3278
    /*
3279
     * Update local copy of minRecoveryPoint. But if the lock is busy,
3280
     * just return a conservative guess.
3281
     */
3282
0
    if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3283
0
      return true;
3284
0
    LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
3285
0
    LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3286
0
    LWLockRelease(ControlFileLock);
3287
3288
    /*
3289
     * Check minRecoveryPoint for any other process than the startup
3290
     * process doing crash recovery, which should not update the control
3291
     * file value if crash recovery is still running.
3292
     */
3293
0
    if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
3294
0
      updateMinRecoveryPoint = false;
3295
3296
    /* check again */
3297
0
    if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
3298
0
      return false;
3299
0
    else
3300
0
      return true;
3301
0
  }
3302
3303
  /* Quick exit if already known flushed */
3304
0
  if (record <= LogwrtResult.Flush)
3305
0
    return false;
3306
3307
  /* read LogwrtResult and update local state */
3308
0
  RefreshXLogWriteResult(LogwrtResult);
3309
3310
  /* check again */
3311
0
  if (record <= LogwrtResult.Flush)
3312
0
    return false;
3313
3314
0
  return true;
3315
0
}
3316
3317
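
A sketch of the intended caller pattern for XLogNeedsFlush: probe cheaply before deciding whether writing a dirty page is affordable right now. Only the two extern declarations name real functions from this file; the page-writing context is hypothetical.

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t XLogRecPtr;

extern bool XLogNeedsFlush(XLogRecPtr record);
extern void XLogFlush(XLogRecPtr record);

/* Hypothetical: write a dirty page only if its WAL is already durable. */
static bool
try_write_page_cheaply(XLogRecPtr page_lsn)
{
    if (XLogNeedsFlush(page_lsn))
        return false;           /* would force an fsync; let caller defer */

    /* ... issue the page write here; no XLogFlush() needed first ... */
    return true;
}
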
/*
3318
 * Try to make a given XLOG file segment exist.
3319
 *
3320
 * logsegno: identify segment.
3321
 *
3322
 * *added: on return, true if this call raised the number of extant segments.
3323
 *
3324
 * path: on return, this char[MAXPGPATH] has the path to the logsegno file.
3325
 *
3326
 * Returns -1 or FD of opened file.  A -1 here is not an error; a caller
3327
 * wanting an open segment should attempt to open "path", which usually will
3328
 * succeed.  (This is weird, but it's efficient for the callers.)
3329
 */
3330
static int
3331
XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
3332
           bool *added, char *path)
3333
0
{
3334
0
  char    tmppath[MAXPGPATH];
3335
0
  XLogSegNo installed_segno;
3336
0
  XLogSegNo max_segno;
3337
0
  int     fd;
3338
0
  int     save_errno;
3339
0
  int     open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
3340
0
  instr_time  io_start;
3341
3342
0
  Assert(logtli != 0);
3343
3344
0
  XLogFilePath(path, logtli, logsegno, wal_segment_size);
3345
3346
  /*
3347
   * Try to use existent file (checkpoint maker may have created it already)
3348
   */
3349
0
  *added = false;
3350
0
  fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
3351
0
             get_sync_bit(wal_sync_method));
3352
0
  if (fd < 0)
3353
0
  {
3354
0
    if (errno != ENOENT)
3355
0
      ereport(ERROR,
3356
0
          (errcode_for_file_access(),
3357
0
           errmsg("could not open file \"%s\": %m", path)));
3358
0
  }
3359
0
  else
3360
0
    return fd;
3361
3362
  /*
3363
   * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
3364
   * another process is doing the same thing.  If so, we will end up
3365
   * pre-creating an extra log segment.  That seems OK, and better than
3366
   * holding the lock throughout this lengthy process.
3367
   */
3368
0
  elog(DEBUG2, "creating and filling new WAL file");
3369
3370
0
  snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3371
3372
0
  unlink(tmppath);
3373
3374
0
  if (io_direct_flags & IO_DIRECT_WAL_INIT)
3375
0
    open_flags |= PG_O_DIRECT;
3376
3377
  /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3378
0
  fd = BasicOpenFile(tmppath, open_flags);
3379
0
  if (fd < 0)
3380
0
    ereport(ERROR,
3381
0
        (errcode_for_file_access(),
3382
0
         errmsg("could not create file \"%s\": %m", tmppath)));
3383
3384
  /* Measure I/O timing when initializing segment */
3385
0
  io_start = pgstat_prepare_io_time(track_wal_io_timing);
3386
3387
0
  pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
3388
0
  save_errno = 0;
3389
0
  if (wal_init_zero)
3390
0
  {
3391
0
    ssize_t   rc;
3392
3393
    /*
3394
     * Zero-fill the file.  With this setting, we do this the hard way to
3395
     * ensure that all the file space has really been allocated.  On
3396
     * platforms that allow "holes" in files, just seeking to the end
3397
     * doesn't allocate intermediate space.  This way, we know that we
3398
     * have all the space and (after the fsync below) that all the
3399
     * indirect blocks are down on disk.  Therefore, fdatasync(2) or
3400
     * O_DSYNC will be sufficient to sync future writes to the log file.
3401
     */
3402
0
    rc = pg_pwrite_zeros(fd, wal_segment_size, 0);
3403
3404
0
    if (rc < 0)
3405
0
      save_errno = errno;
3406
0
  }
3407
0
  else
3408
0
  {
3409
    /*
3410
     * Otherwise, seeking to the end and writing a solitary byte is
3411
     * enough.
3412
     */
3413
0
    errno = 0;
3414
0
    if (pg_pwrite(fd, "\0", 1, wal_segment_size - 1) != 1)
3415
0
    {
3416
      /* if write didn't set errno, assume no disk space */
3417
0
      save_errno = errno ? errno : ENOSPC;
3418
0
    }
3419
0
  }
3420
0
  pgstat_report_wait_end();
3421
3422
  /*
3423
   * A full segment's worth of data is written when using wal_init_zero. One
3424
   * byte is written when not using it.
3425
   */
3426
0
  pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT, IOOP_WRITE,
3427
0
              io_start, 1,
3428
0
              wal_init_zero ? wal_segment_size : 1);
3429
3430
0
  if (save_errno)
3431
0
  {
3432
    /*
3433
     * If we fail to make the file, delete it to release disk space
3434
     */
3435
0
    unlink(tmppath);
3436
3437
0
    close(fd);
3438
3439
0
    errno = save_errno;
3440
3441
0
    ereport(ERROR,
3442
0
        (errcode_for_file_access(),
3443
0
         errmsg("could not write to file \"%s\": %m", tmppath)));
3444
0
  }
3445
3446
  /* Measure I/O timing when flushing segment */
3447
0
  io_start = pgstat_prepare_io_time(track_wal_io_timing);
3448
3449
0
  pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
3450
0
  if (pg_fsync(fd) != 0)
3451
0
  {
3452
0
    save_errno = errno;
3453
0
    close(fd);
3454
0
    errno = save_errno;
3455
0
    ereport(ERROR,
3456
0
        (errcode_for_file_access(),
3457
0
         errmsg("could not fsync file \"%s\": %m", tmppath)));
3458
0
  }
3459
0
  pgstat_report_wait_end();
3460
3461
0
  pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_INIT,
3462
0
              IOOP_FSYNC, io_start, 1, 0);
3463
3464
0
  if (close(fd) != 0)
3465
0
    ereport(ERROR,
3466
0
        (errcode_for_file_access(),
3467
0
         errmsg("could not close file \"%s\": %m", tmppath)));
3468
3469
  /*
3470
   * Now move the segment into place with its final name.  Cope with
3471
   * possibility that someone else has created the file while we were
3472
   * filling ours: if so, use ours to pre-create a future log segment.
3473
   */
3474
0
  installed_segno = logsegno;
3475
3476
  /*
3477
   * XXX: What should we use as max_segno? We used to use XLOGfileslop when
3478
   * that was a constant, but that was always a bit dubious: normally, at a
3479
   * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
3480
   * here, it was the offset from the insert location. We can't do the
3481
   * normal XLOGfileslop calculation here because we don't have access to
3482
   * the prior checkpoint's redo location. So somewhat arbitrarily, just use
3483
   * CheckPointSegments.
3484
   */
3485
0
  max_segno = logsegno + CheckPointSegments;
3486
0
  if (InstallXLogFileSegment(&installed_segno, tmppath, true, max_segno,
3487
0
                 logtli))
3488
0
  {
3489
0
    *added = true;
3490
0
    elog(DEBUG2, "done creating and filling new WAL file");
3491
0
  }
3492
0
  else
3493
0
  {
3494
    /*
3495
     * No need for any more future segments, or InstallXLogFileSegment()
3496
     * failed to rename the file into place. If the rename failed, a
3497
     * caller opening the file may fail.
3498
     */
3499
0
    unlink(tmppath);
3500
0
    elog(DEBUG2, "abandoned new WAL file");
3501
0
  }
3502
3503
0
  return -1;
3504
0
}
3505
3506
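
The two allocation strategies above, condensed into one standalone helper built on libc only. This is a sketch: PostgreSQL's pg_pwrite_zeros() additionally retries short writes and uses an aligned buffer, and the helper assumes seg_size is a multiple of the write buffer size.

#include <errno.h>
#include <stdbool.h>
#include <sys/types.h>
#include <unistd.h>

static int
fill_segment(int fd, size_t seg_size, bool init_zero)
{
    if (init_zero)
    {
        /*
         * Zero-fill so every block is really allocated; seeking past the
         * end would leave holes on filesystems that support them.
         */
        static const char zeros[8192];
        size_t      off;

        for (off = 0; off < seg_size; off += sizeof(zeros))
        {
            ssize_t     rc = pwrite(fd, zeros, sizeof(zeros), (off_t) off);

            if (rc != (ssize_t) sizeof(zeros))
            {
                if (rc >= 0)
                    errno = ENOSPC;     /* short write: assume disk full */
                return -1;
            }
        }
    }
    else
    {
        /* one byte at the end reserves the size but may leave holes */
        errno = 0;
        if (pwrite(fd, "\0", 1, (off_t) (seg_size - 1)) != 1)
        {
            if (errno == 0)
                errno = ENOSPC;
            return -1;
        }
    }
    return fsync(fd);   /* after this, fdatasync/O_DSYNC suffice for writes */
}
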
/*
3507
 * Create a new XLOG file segment, or open a pre-existing one.
3508
 *
3509
 * logsegno: identify segment to be created/opened.
3510
 *
3511
 * Returns FD of opened file.
3512
 *
3513
 * Note: errors here are ERROR not PANIC because we might or might not be
3514
 * inside a critical section (eg, during checkpoint there is no reason to
3515
 * take down the system on failure).  They will promote to PANIC if we are
3516
 * in a critical section.
3517
 */
3518
int
3519
XLogFileInit(XLogSegNo logsegno, TimeLineID logtli)
3520
0
{
3521
0
  bool    ignore_added;
3522
0
  char    path[MAXPGPATH];
3523
0
  int     fd;
3524
3525
0
  Assert(logtli != 0);
3526
3527
0
  fd = XLogFileInitInternal(logsegno, logtli, &ignore_added, path);
3528
0
  if (fd >= 0)
3529
0
    return fd;
3530
3531
  /* Now open original target segment (might not be file I just made) */
3532
0
  fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
3533
0
             get_sync_bit(wal_sync_method));
3534
0
  if (fd < 0)
3535
0
    ereport(ERROR,
3536
0
        (errcode_for_file_access(),
3537
0
         errmsg("could not open file \"%s\": %m", path)));
3538
0
  return fd;
3539
0
}
3540
3541
/*
3542
 * Create a new XLOG file segment by copying a pre-existing one.
3543
 *
3544
 * destsegno: identify segment to be created.
3545
 *
3546
 * srcTLI, srcsegno: identify segment to be copied (could be from
3547
 *    a different timeline)
3548
 *
3549
 * upto: how much of the source file to copy (the rest is filled with
3550
 *    zeros)
3551
 *
3552
 * Currently this is only used during recovery, and so there are no locking
3553
 * considerations.  But we should be just as tense as XLogFileInit to avoid
3554
 * emplacing a bogus file.
3555
 */
3556
static void
3557
XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno,
3558
       TimeLineID srcTLI, XLogSegNo srcsegno,
3559
       int upto)
3560
0
{
3561
0
  char    path[MAXPGPATH];
3562
0
  char    tmppath[MAXPGPATH];
3563
0
  PGAlignedXLogBlock buffer;
3564
0
  int     srcfd;
3565
0
  int     fd;
3566
0
  int     nbytes;
3567
3568
  /*
3569
   * Open the source file
3570
   */
3571
0
  XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
3572
0
  srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
3573
0
  if (srcfd < 0)
3574
0
    ereport(ERROR,
3575
0
        (errcode_for_file_access(),
3576
0
         errmsg("could not open file \"%s\": %m", path)));
3577
3578
  /*
3579
   * Copy into a temp file name.
3580
   */
3581
0
  snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3582
3583
0
  unlink(tmppath);
3584
3585
  /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3586
0
  fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3587
0
  if (fd < 0)
3588
0
    ereport(ERROR,
3589
0
        (errcode_for_file_access(),
3590
0
         errmsg("could not create file \"%s\": %m", tmppath)));
3591
3592
  /*
3593
   * Do the data copying.
3594
   */
3595
0
  for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
3596
0
  {
3597
0
    int     nread;
3598
3599
0
    nread = upto - nbytes;
3600
3601
    /*
3602
     * The part that is not read from the source file is filled with
3603
     * zeros.
3604
     */
3605
0
    if (nread < sizeof(buffer))
3606
0
      memset(buffer.data, 0, sizeof(buffer));
3607
3608
0
    if (nread > 0)
3609
0
    {
3610
0
      int     r;
3611
3612
0
      if (nread > sizeof(buffer))
3613
0
        nread = sizeof(buffer);
3614
0
      pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
3615
0
      r = read(srcfd, buffer.data, nread);
3616
0
      if (r != nread)
3617
0
      {
3618
0
        if (r < 0)
3619
0
          ereport(ERROR,
3620
0
              (errcode_for_file_access(),
3621
0
               errmsg("could not read file \"%s\": %m",
3622
0
                  path)));
3623
0
        else
3624
0
          ereport(ERROR,
3625
0
              (errcode(ERRCODE_DATA_CORRUPTED),
3626
0
               errmsg("could not read file \"%s\": read %d of %zu",
3627
0
                  path, r, (Size) nread)));
3628
0
      }
3629
0
      pgstat_report_wait_end();
3630
0
    }
3631
0
    errno = 0;
3632
0
    pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
3633
0
    if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
3634
0
    {
3635
0
      int     save_errno = errno;
3636
3637
      /*
3638
       * If we fail to make the file, delete it to release disk space
3639
       */
3640
0
      unlink(tmppath);
3641
      /* if write didn't set errno, assume problem is no disk space */
3642
0
      errno = save_errno ? save_errno : ENOSPC;
3643
3644
0
      ereport(ERROR,
3645
0
          (errcode_for_file_access(),
3646
0
           errmsg("could not write to file \"%s\": %m", tmppath)));
3647
0
    }
3648
0
    pgstat_report_wait_end();
3649
0
  }
3650
3651
0
  pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
3652
0
  if (pg_fsync(fd) != 0)
3653
0
    ereport(data_sync_elevel(ERROR),
3654
0
        (errcode_for_file_access(),
3655
0
         errmsg("could not fsync file \"%s\": %m", tmppath)));
3656
0
  pgstat_report_wait_end();
3657
3658
0
  if (CloseTransientFile(fd) != 0)
3659
0
    ereport(ERROR,
3660
0
        (errcode_for_file_access(),
3661
0
         errmsg("could not close file \"%s\": %m", tmppath)));
3662
3663
0
  if (CloseTransientFile(srcfd) != 0)
3664
0
    ereport(ERROR,
3665
0
        (errcode_for_file_access(),
3666
0
         errmsg("could not close file \"%s\": %m", path)));
3667
3668
  /*
3669
   * Now move the segment into place with its final name.
3670
   */
3671
0
  if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, destTLI))
3672
0
    elog(ERROR, "InstallXLogFileSegment should not have failed");
3673
0
}
3674
3675
/*
3676
 * Install a new XLOG segment file as a current or future log segment.
3677
 *
3678
 * This is used both to install a newly-created segment (which has a temp
3679
 * filename while it's being created) and to recycle an old segment.
3680
 *
3681
 * *segno: identify segment to install as (or first possible target).
3682
 * When find_free is true, this is modified on return to indicate the
3683
 * actual installation location or last segment searched.
3684
 *
3685
 * tmppath: initial name of file to install.  It will be renamed into place.
3686
 *
3687
 * find_free: if true, install the new segment at the first empty segno
3688
 * number at or after the passed number.  If false, install the new segment
3689
 * exactly where specified, deleting any existing segment file there.
3690
 *
3691
 * max_segno: maximum segment number to install the new file as.  Fail if no
3692
 * free slot is found between *segno and max_segno. (Ignored when find_free
3693
 * is false.)
3694
 *
3695
 * tli: The timeline on which the new segment should be installed.
3696
 *
3697
 * Returns true if the file was installed successfully.  false indicates that
3698
 * max_segno limit was exceeded, the startup process has disabled this
3699
 * function for now, or an error occurred while renaming the file into place.
3700
 */
3701
static bool
3702
InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3703
             bool find_free, XLogSegNo max_segno, TimeLineID tli)
3704
0
{
3705
0
  char    path[MAXPGPATH];
3706
0
  struct stat stat_buf;
3707
3708
0
  Assert(tli != 0);
3709
3710
0
  XLogFilePath(path, tli, *segno, wal_segment_size);
3711
3712
0
  LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3713
0
  if (!XLogCtl->InstallXLogFileSegmentActive)
3714
0
  {
3715
0
    LWLockRelease(ControlFileLock);
3716
0
    return false;
3717
0
  }
3718
3719
0
  if (!find_free)
3720
0
  {
3721
    /* Force installation: get rid of any pre-existing segment file */
3722
0
    durable_unlink(path, DEBUG1);
3723
0
  }
3724
0
  else
3725
0
  {
3726
    /* Find a free slot to put it in */
3727
0
    while (stat(path, &stat_buf) == 0)
3728
0
    {
3729
0
      if ((*segno) >= max_segno)
3730
0
      {
3731
        /* Failed to find a free slot within specified range */
3732
0
        LWLockRelease(ControlFileLock);
3733
0
        return false;
3734
0
      }
3735
0
      (*segno)++;
3736
0
      XLogFilePath(path, tli, *segno, wal_segment_size);
3737
0
    }
3738
0
  }
3739
3740
0
  Assert(access(path, F_OK) != 0 && errno == ENOENT);
3741
0
  if (durable_rename(tmppath, path, LOG) != 0)
3742
0
  {
3743
0
    LWLockRelease(ControlFileLock);
3744
    /* durable_rename already emitted log message */
3745
0
    return false;
3746
0
  }
3747
3748
0
  LWLockRelease(ControlFileLock);
3749
3750
0
  return true;
3751
0
}
3752
3753
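
The find_free search above is a plain stat() probe loop. Here is a standalone model of just that loop; the name layout produced by seg_path() is a hypothetical stand-in, not the exact WAL naming scheme.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/stat.h>

typedef uint64_t XLogSegNo;

/* hypothetical name layout; the real one comes from XLogFilePath() */
static void
seg_path(char *buf, size_t len, XLogSegNo segno)
{
    snprintf(buf, len, "pg_wal/%08X%016llX", 1 /* TLI */,
             (unsigned long long) segno);
}

/*
 * Model of the find_free search: advance *segno until the path is unused,
 * failing once max_segno would be exceeded.
 */
static bool
find_free_slot(XLogSegNo *segno, XLogSegNo max_segno, char *path, size_t len)
{
    struct stat st;

    seg_path(path, len, *segno);
    while (stat(path, &st) == 0)
    {
        if (*segno >= max_segno)
            return false;               /* no free slot within range */
        (*segno)++;
        seg_path(path, len, *segno);
    }
    return true;                        /* 'path' now names a free segment */
}
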
/*
3754
 * Open a pre-existing logfile segment for writing.
3755
 */
3756
int
3757
XLogFileOpen(XLogSegNo segno, TimeLineID tli)
3758
0
{
3759
0
  char    path[MAXPGPATH];
3760
0
  int     fd;
3761
3762
0
  XLogFilePath(path, tli, segno, wal_segment_size);
3763
3764
0
  fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
3765
0
             get_sync_bit(wal_sync_method));
3766
0
  if (fd < 0)
3767
0
    ereport(PANIC,
3768
0
        (errcode_for_file_access(),
3769
0
         errmsg("could not open file \"%s\": %m", path)));
3770
3771
0
  return fd;
3772
0
}
3773
3774
/*
3775
 * Close the current logfile segment for writing.
3776
 */
3777
static void
3778
XLogFileClose(void)
3779
0
{
3780
0
  Assert(openLogFile >= 0);
3781
3782
  /*
3783
   * WAL segment files will not be re-read in normal operation, so we advise
3784
   * the OS to release any cached pages.  But do not do so if WAL archiving
3785
   * or streaming is active, because archiver and walsender process could
3786
   * use the cache to read the WAL segment.
3787
   */
3788
0
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3789
0
  if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0)
3790
0
    (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3791
0
#endif
3792
3793
0
  if (close(openLogFile) != 0)
3794
0
  {
3795
0
    char    xlogfname[MAXFNAMELEN];
3796
0
    int     save_errno = errno;
3797
3798
0
    XLogFileName(xlogfname, openLogTLI, openLogSegNo, wal_segment_size);
3799
0
    errno = save_errno;
3800
0
    ereport(PANIC,
3801
0
        (errcode_for_file_access(),
3802
0
         errmsg("could not close file \"%s\": %m", xlogfname)));
3803
0
  }
3804
3805
0
  openLogFile = -1;
3806
0
  ReleaseExternalFD();
3807
0
}
3808
3809
/*
3810
 * Preallocate log files beyond the specified log endpoint.
3811
 *
3812
 * XXX this is currently extremely conservative, since it forces only one
3813
 * future log segment to exist, and even that only if we are 75% done with
3814
 * the current one.  This is only appropriate for very low-WAL-volume systems.
3815
 * High-volume systems will be OK once they've built up a sufficient set of
3816
 * recycled log segments, but the startup transient is likely to include
3817
 * a lot of segment creations by foreground processes, which is not so good.
3818
 *
3819
 * XLogFileInitInternal() can ereport(ERROR).  All known causes indicate big
3820
 * trouble; a full filesystem, for example.  The checkpoint WAL
3821
 * and/or ControlFile updates already completed.  If a RequestCheckpoint()
3822
 * initiated the present checkpoint and an ERROR ends this function, the
3823
 * command that called RequestCheckpoint() fails.  That's not ideal, but it's
3824
 * not worth contorting more functions to use caller-specified elevel values.
3825
 * (With or without RequestCheckpoint(), an ERROR forestalls some inessential
3826
 * reporting and resource reclamation.)
3827
 */
3828
static void
3829
PreallocXlogFiles(XLogRecPtr endptr, TimeLineID tli)
3830
0
{
3831
0
  XLogSegNo _logSegNo;
3832
0
  int     lf;
3833
0
  bool    added;
3834
0
  char    path[MAXPGPATH];
3835
0
  uint64    offset;
3836
3837
0
  if (!XLogCtl->InstallXLogFileSegmentActive)
3838
0
    return;         /* unlocked check says no */
3839
3840
0
  XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
3841
0
  offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
3842
0
  if (offset >= (uint32) (0.75 * wal_segment_size))
3843
0
  {
3844
0
    _logSegNo++;
3845
0
    lf = XLogFileInitInternal(_logSegNo, tli, &added, path);
3846
0
    if (lf >= 0)
3847
0
      close(lf);
3848
0
    if (added)
3849
0
      CheckpointStats.ckpt_segs_added++;
3850
0
  }
3851
0
}
3852
3853
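
The 75% trigger worked through for the default 16MB segment: 0.75 * 16777216 = 12582912, so the next segment is pre-created only once the current one is written past byte offset 12582911. A runnable check of that arithmetic; the endptr value is invented.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t seg_size = 16 * 1024 * 1024;       /* default wal_segment_size */
    uint64_t endptr = 2 * seg_size + 13000000;  /* invented end-of-WAL LSN */
    uint64_t offset = (endptr - 1) % seg_size;  /* offset within segment */

    if (offset >= (uint64_t) (0.75 * seg_size))
        printf("offset %llu past the 75%% mark (%llu): pre-create next file\n",
               (unsigned long long) offset,
               (unsigned long long) (uint64_t) (0.75 * seg_size));
    return 0;
}
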
/*
3854
 * Throws an error if the given log segment has already been removed or
3855
 * recycled. The caller should only pass a segment that it knows to have
3856
 * existed while the server has been running, as this function always
3857
 * succeeds if no WAL segments have been removed since startup.
3858
 * 'tli' is only used in the error message.
3859
 *
3860
 * Note: this function guarantees to keep errno unchanged on return.
3861
 * This supports callers that use this to possibly deliver a better
3862
 * error message about a missing file, while still being able to throw
3863
 * a normal file-access error afterwards, if this does return.
3864
 */
3865
void
3866
CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3867
0
{
3868
0
  int     save_errno = errno;
3869
0
  XLogSegNo lastRemovedSegNo;
3870
3871
0
  SpinLockAcquire(&XLogCtl->info_lck);
3872
0
  lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3873
0
  SpinLockRelease(&XLogCtl->info_lck);
3874
3875
0
  if (segno <= lastRemovedSegNo)
3876
0
  {
3877
0
    char    filename[MAXFNAMELEN];
3878
3879
0
    XLogFileName(filename, tli, segno, wal_segment_size);
3880
0
    errno = save_errno;
3881
0
    ereport(ERROR,
3882
0
        (errcode_for_file_access(),
3883
0
         errmsg("requested WAL segment %s has already been removed",
3884
0
            filename)));
3885
0
  }
3886
0
  errno = save_errno;
3887
0
}
3888
3889
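
The errno guarantee above enables a two-step error report in callers (the walsender follows this shape). A sketch: CheckXLogRemoved() is the real function declared extern, everything else is hypothetical scaffolding, and linking this outside the server is not actually possible.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t XLogSegNo;
typedef uint32_t TimeLineID;

extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli);

/*
 * Hypothetical caller: on open() failure, first let CheckXLogRemoved()
 * raise the clearer "already been removed" error; if it returns, errno
 * is guaranteed unchanged, so the plain file-access report still works.
 */
static int
open_wal_segment(const char *path, XLogSegNo segno, TimeLineID tli)
{
    int         fd = open(path, O_RDONLY);

    if (fd < 0)
    {
        CheckXLogRemoved(segno, tli);   /* errors out if recycled... */
        perror(path);                   /* ...else report with errno intact */
    }
    return fd;
}
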
/*
3890
 * Return the last WAL segment removed, or 0 if no segment has been removed
3891
 * since startup.
3892
 *
3893
 * NB: the result can be out of date arbitrarily fast; the caller has to deal
3894
 * with that.
3895
 */
3896
XLogSegNo
3897
XLogGetLastRemovedSegno(void)
3898
0
{
3899
0
  XLogSegNo lastRemovedSegNo;
3900
3901
0
  SpinLockAcquire(&XLogCtl->info_lck);
3902
0
  lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3903
0
  SpinLockRelease(&XLogCtl->info_lck);
3904
3905
0
  return lastRemovedSegNo;
3906
0
}
3907
3908
/*
3909
 * Return the oldest WAL segment on the given TLI that still exists in
3910
 * XLOGDIR, or 0 if none.
3911
 */
3912
XLogSegNo
3913
XLogGetOldestSegno(TimeLineID tli)
3914
0
{
3915
0
  DIR      *xldir;
3916
0
  struct dirent *xlde;
3917
0
  XLogSegNo oldest_segno = 0;
3918
3919
0
  xldir = AllocateDir(XLOGDIR);
3920
0
  while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3921
0
  {
3922
0
    TimeLineID  file_tli;
3923
0
    XLogSegNo file_segno;
3924
3925
    /* Ignore files that are not XLOG segments. */
3926
0
    if (!IsXLogFileName(xlde->d_name))
3927
0
      continue;
3928
3929
    /* Parse filename to get TLI and segno. */
3930
0
    XLogFromFileName(xlde->d_name, &file_tli, &file_segno,
3931
0
             wal_segment_size);
3932
3933
    /* Ignore anything that's not from the TLI of interest. */
3934
0
    if (tli != file_tli)
3935
0
      continue;
3936
3937
    /* If it's the oldest so far, update oldest_segno. */
3938
0
    if (oldest_segno == 0 || file_segno < oldest_segno)
3939
0
      oldest_segno = file_segno;
3940
0
  }
3941
3942
0
  FreeDir(xldir);
3943
0
  return oldest_segno;
3944
0
}
3945
3946
/*
3947
 * Update the last removed segno pointer in shared memory, to reflect that the
3948
 * given XLOG file has been removed.
3949
 */
3950
static void
3951
UpdateLastRemovedPtr(char *filename)
3952
0
{
3953
0
  uint32    tli;
3954
0
  XLogSegNo segno;
3955
3956
0
  XLogFromFileName(filename, &tli, &segno, wal_segment_size);
3957
3958
0
  SpinLockAcquire(&XLogCtl->info_lck);
3959
0
  if (segno > XLogCtl->lastRemovedSegNo)
3960
0
    XLogCtl->lastRemovedSegNo = segno;
3961
0
  SpinLockRelease(&XLogCtl->info_lck);
3962
0
}
3963
3964
/*
3965
 * Remove all temporary log files in pg_wal
3966
 *
3967
 * This is called at the beginning of recovery after a previous crash,
3968
 * at a point where no other processes write fresh WAL data.
3969
 */
3970
static void
3971
RemoveTempXlogFiles(void)
3972
0
{
3973
0
  DIR      *xldir;
3974
0
  struct dirent *xlde;
3975
3976
0
  elog(DEBUG2, "removing all temporary WAL segments");
3977
3978
0
  xldir = AllocateDir(XLOGDIR);
3979
0
  while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3980
0
  {
3981
0
    char    path[MAXPGPATH];
3982
3983
0
    if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
3984
0
      continue;
3985
3986
0
    snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3987
0
    unlink(path);
3988
0
    elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
3989
0
  }
3990
0
  FreeDir(xldir);
3991
0
}
3992
3993
/*
3994
 * Recycle or remove all log files older than or equal to the passed segno.
3995
 *
3996
 * endptr is current (or recent) end of xlog, and lastredoptr is the
3997
 * redo pointer of the last checkpoint. These are used to determine
3998
 * whether we want to recycle rather than delete no-longer-wanted log files.
3999
 *
4000
 * insertTLI is the current timeline for XLOG insertion. Any recycled
4001
 * segments should be reused for this timeline.
4002
 */
4003
static void
4004
RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr,
4005
           TimeLineID insertTLI)
4006
0
{
4007
0
  DIR      *xldir;
4008
0
  struct dirent *xlde;
4009
0
  char    lastoff[MAXFNAMELEN];
4010
0
  XLogSegNo endlogSegNo;
4011
0
  XLogSegNo recycleSegNo;
4012
4013
  /* Initialize info about where to try to recycle to */
4014
0
  XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
4015
0
  recycleSegNo = XLOGfileslop(lastredoptr);
4016
4017
  /*
4018
   * Construct a filename of the last segment to be kept. The timeline ID
4019
   * doesn't matter; we ignore it in the comparison. (During recovery,
4020
   * InsertTimeLineID isn't set, so we can't use that.)
4021
   */
4022
0
  XLogFileName(lastoff, 0, segno, wal_segment_size);
4023
4024
0
  elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
4025
0
     lastoff);
4026
4027
0
  xldir = AllocateDir(XLOGDIR);
4028
4029
0
  while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4030
0
  {
4031
    /* Ignore files that are not XLOG segments */
4032
0
    if (!IsXLogFileName(xlde->d_name) &&
4033
0
      !IsPartialXLogFileName(xlde->d_name))
4034
0
      continue;
4035
4036
    /*
4037
     * We ignore the timeline part of the XLOG segment identifiers in
4038
     * deciding whether a segment is still needed.  This ensures that we
4039
     * won't prematurely remove a segment from a parent timeline. We could
4040
     * probably be a little more proactive about removing segments of
4041
     * non-parent timelines, but that would be a whole lot more
4042
     * complicated.
4043
     *
4044
     * We use the alphanumeric sorting property of the filenames to decide
4045
     * which ones are earlier than the lastoff segment.
4046
     */
4047
0
    if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
4048
0
    {
4049
0
      if (XLogArchiveCheckDone(xlde->d_name))
4050
0
      {
4051
        /* Update the last removed location in shared memory first */
4052
0
        UpdateLastRemovedPtr(xlde->d_name);
4053
4054
0
        RemoveXlogFile(xlde, recycleSegNo, &endlogSegNo, insertTLI);
4055
0
      }
4056
0
    }
4057
0
  }
4058
4059
0
  FreeDir(xldir);
4060
0
}
4061
4062
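
Why comparing at d_name + 8 works: a WAL segment file name is 24 hex digits, the first 8 encoding the timeline and the remaining 16 the segment position, and strcmp() on fixed-width zero-padded hex agrees with numeric order. A small runnable demonstration with invented file names:

#include <stdio.h>
#include <string.h>

int
main(void)
{
    /* timeline 1 vs timeline 2, same and different segment positions */
    const char *a = "000000010000000A00000003";
    const char *b = "000000020000000A00000003";
    const char *c = "000000020000000A00000004";

    printf("%d\n", strcmp(a + 8, b + 8));   /* 0: same segment, TLI ignored */
    printf("%d\n", strcmp(b + 8, c + 8) < 0);   /* 1: b is an earlier segment */
    return 0;
}
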
/*
4063
 * Recycle or remove WAL files that are not part of the given timeline's
4064
 * history.
4065
 *
4066
 * This is called during recovery, whenever we switch to follow a new
4067
 * timeline, and at the end of recovery when we create a new timeline. We
4068
 * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
4069
 * might be leftover pre-allocated or recycled WAL segments on the old timeline
4070
 * that we haven't used yet, and contain garbage. If we just leave them in
4071
 * pg_wal, they will eventually be archived, and we can't let that happen.
4072
 * Files that belong to our timeline history are valid, because we have
4073
 * successfully replayed them, but from others we can't be sure.
4074
 *
4075
 * 'switchpoint' is the current point in WAL where we switch to new timeline,
4076
 * and 'newTLI' is the new timeline we switch to.
4077
 */
4078
void
4079
RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
4080
0
{
4081
0
  DIR      *xldir;
4082
0
  struct dirent *xlde;
4083
0
  char    switchseg[MAXFNAMELEN];
4084
0
  XLogSegNo endLogSegNo;
4085
0
  XLogSegNo switchLogSegNo;
4086
0
  XLogSegNo recycleSegNo;
4087
4088
  /*
4089
   * Initialize info about where to begin the work.  This will recycle,
4090
   * somewhat arbitrarily, 10 future segments.
4091
   */
4092
0
  XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size);
4093
0
  XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size);
4094
0
  recycleSegNo = endLogSegNo + 10;
4095
4096
  /*
4097
   * Construct a filename of the last segment to be kept.
4098
   */
4099
0
  XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size);
4100
4101
0
  elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
4102
0
     switchseg);
4103
4104
0
  xldir = AllocateDir(XLOGDIR);
4105
4106
0
  while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4107
0
  {
4108
    /* Ignore files that are not XLOG segments */
4109
0
    if (!IsXLogFileName(xlde->d_name))
4110
0
      continue;
4111
4112
    /*
4113
     * Remove files that are on a timeline older than the new one we're
4114
     * switching to, but with a segment number >= the first segment on the
4115
     * new timeline.
4116
     */
4117
0
    if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
4118
0
      strcmp(xlde->d_name + 8, switchseg + 8) > 0)
4119
0
    {
4120
      /*
4121
       * If the file has already been marked as .ready, however, don't
4122
       * remove it yet. It should be OK to remove it - files that are
4123
       * not part of our timeline history are not required for recovery
4124
       * - but it seems safer to let them be archived and removed later.
4125
       */
4126
0
      if (!XLogArchiveIsReady(xlde->d_name))
4127
0
        RemoveXlogFile(xlde, recycleSegNo, &endLogSegNo, newTLI);
4128
0
    }
4129
0
  }
4130
4131
0
  FreeDir(xldir);
4132
0
}
4133
4134
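
The two-part test above, checked against invented names: a file is removed only when its first 8 digits (timeline) sort before the new timeline's and its remaining digits (segment) sort after the switch segment's.

#include <stdio.h>
#include <string.h>

int
main(void)
{
    const char *switchseg = "000000020000000A00000004";  /* new TLI 2 */
    const char *old_later = "000000010000000A00000007";  /* TLI 1, later seg */
    const char *old_early = "000000010000000A00000002";  /* TLI 1, earlier */

    printf("%d\n", strncmp(old_later, switchseg, 8) < 0 &&
                   strcmp(old_later + 8, switchseg + 8) > 0);   /* 1: remove */
    printf("%d\n", strncmp(old_early, switchseg, 8) < 0 &&
                   strcmp(old_early + 8, switchseg + 8) > 0);   /* 0: keep */
    return 0;
}
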
/*
4135
 * Recycle or remove a log file that's no longer needed.
4136
 *
4137
 * segment_de is the dirent structure of the segment to recycle or remove.
4138
 * recycleSegNo is the segment number to recycle up to.  endlogSegNo is
4139
 * the segment number of the current (or recent) end of WAL.
4140
 *
4141
 * endlogSegNo gets incremented if the segment is recycled so that it is not
4142
 * checked again by future callers of this function.
4143
 *
4144
 * insertTLI is the current timeline for XLOG insertion. Any recycled segments
4145
 * should be used for this timeline.
4146
 */
4147
static void
4148
RemoveXlogFile(const struct dirent *segment_de,
4149
         XLogSegNo recycleSegNo, XLogSegNo *endlogSegNo,
4150
         TimeLineID insertTLI)
4151
0
{
4152
0
  char    path[MAXPGPATH];
4153
#ifdef WIN32
4154
  char    newpath[MAXPGPATH];
4155
#endif
4156
0
  const char *segname = segment_de->d_name;
4157
4158
0
  snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
4159
4160
  /*
4161
   * Before deleting the file, see if it can be recycled as a future log
4162
   * segment. Only recycle normal files, because we don't want to recycle
4163
   * symbolic links pointing to a separate archive directory.
4164
   */
4165
0
  if (wal_recycle &&
4166
0
    *endlogSegNo <= recycleSegNo &&
4167
0
    XLogCtl->InstallXLogFileSegmentActive && /* callee rechecks this */
4168
0
    get_dirent_type(path, segment_de, false, DEBUG2) == PGFILETYPE_REG &&
4169
0
    InstallXLogFileSegment(endlogSegNo, path,
4170
0
                 true, recycleSegNo, insertTLI))
4171
0
  {
4172
0
    ereport(DEBUG2,
4173
0
        (errmsg_internal("recycled write-ahead log file \"%s\"",
4174
0
                 segname)));
4175
0
    CheckpointStats.ckpt_segs_recycled++;
4176
    /* Needn't recheck that slot on future iterations */
4177
0
    (*endlogSegNo)++;
4178
0
  }
4179
0
  else
4180
0
  {
4181
    /* No need for any more future segments, or recycling failed ... */
4182
0
    int     rc;
4183
4184
0
    ereport(DEBUG2,
4185
0
        (errmsg_internal("removing write-ahead log file \"%s\"",
4186
0
                 segname)));
4187
4188
#ifdef WIN32
4189
4190
    /*
4191
     * On Windows, if another process (e.g another backend) holds the file
4192
     * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
4193
     * will still show up in directory listing until the last handle is
4194
     * closed. To avoid confusing the lingering deleted file for a live
4195
     * WAL file that needs to be archived, rename it before deleting it.
4196
     *
4197
     * If another process holds the file open without FILE_SHARE_DELETE
4198
     * flag, rename will fail. We'll try again at the next checkpoint.
4199
     */
4200
    snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4201
    if (rename(path, newpath) != 0)
4202
    {
4203
      ereport(LOG,
4204
          (errcode_for_file_access(),
4205
           errmsg("could not rename file \"%s\": %m",
4206
              path)));
4207
      return;
4208
    }
4209
    rc = durable_unlink(newpath, LOG);
4210
#else
4211
0
    rc = durable_unlink(path, LOG);
4212
0
#endif
4213
0
    if (rc != 0)
4214
0
    {
4215
      /* Message already logged by durable_unlink() */
4216
0
      return;
4217
0
    }
4218
0
    CheckpointStats.ckpt_segs_removed++;
4219
0
  }
4220
4221
0
  XLogArchiveCleanup(segname);
4222
0
}
4223
4224
/*
4225
 * Verify whether pg_wal, pg_wal/archive_status, and pg_wal/summaries exist.
4226
 * If the latter do not exist, recreate them.
4227
 *
4228
 * It is not the goal of this function to verify the contents of these
4229
 * directories, but to help in cases where someone has performed a cluster
4230
 * copy for PITR purposes but omitted pg_wal from the copy.
4231
 *
4232
 * We could also recreate pg_wal if it doesn't exist, but a deliberate
4233
 * policy decision was made not to.  It is fairly common for pg_wal to be
4234
 * a symlink, and if that was the DBA's intent then automatically making a
4235
 * plain directory would result in degraded performance with no notice.
4236
 */
4237
static void
4238
ValidateXLOGDirectoryStructure(void)
4239
0
{
4240
0
  char    path[MAXPGPATH];
4241
0
  struct stat stat_buf;
4242
4243
  /* Check for pg_wal; if it doesn't exist, error out */
4244
0
  if (stat(XLOGDIR, &stat_buf) != 0 ||
4245
0
    !S_ISDIR(stat_buf.st_mode))
4246
0
    ereport(FATAL,
4247
0
        (errcode_for_file_access(),
4248
0
         errmsg("required WAL directory \"%s\" does not exist",
4249
0
            XLOGDIR)));
4250
4251
  /* Check for archive_status */
4252
0
  snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4253
0
  if (stat(path, &stat_buf) == 0)
4254
0
  {
4255
    /* Check for weird cases where it exists but isn't a directory */
4256
0
    if (!S_ISDIR(stat_buf.st_mode))
4257
0
      ereport(FATAL,
4258
0
          (errcode_for_file_access(),
4259
0
           errmsg("required WAL directory \"%s\" does not exist",
4260
0
              path)));
4261
0
  }
4262
0
  else
4263
0
  {
4264
0
    ereport(LOG,
4265
0
        (errmsg("creating missing WAL directory \"%s\"", path)));
4266
0
    if (MakePGDirectory(path) < 0)
4267
0
      ereport(FATAL,
4268
0
          (errcode_for_file_access(),
4269
0
           errmsg("could not create missing directory \"%s\": %m",
4270
0
              path)));
4271
0
  }
4272
4273
  /* Check for summaries */
4274
0
  snprintf(path, MAXPGPATH, XLOGDIR "/summaries");
4275
0
  if (stat(path, &stat_buf) == 0)
4276
0
  {
4277
    /* Check for weird cases where it exists but isn't a directory */
4278
0
    if (!S_ISDIR(stat_buf.st_mode))
4279
0
      ereport(FATAL,
4280
0
          (errmsg("required WAL directory \"%s\" does not exist",
4281
0
              path)));
4282
0
  }
4283
0
  else
4284
0
  {
4285
0
    ereport(LOG,
4286
0
        (errmsg("creating missing WAL directory \"%s\"", path)));
4287
0
    if (MakePGDirectory(path) < 0)
4288
0
      ereport(FATAL,
4289
0
          (errmsg("could not create missing directory \"%s\": %m",
4290
0
              path)));
4291
0
  }
4292
0
}
4293
4294
/*
4295
 * Remove previous backup history files.  This also retries creation of
4296
 * .ready files for any backup history files for which XLogArchiveNotify
4297
 * failed earlier.
4298
 */
4299
static void
4300
CleanupBackupHistory(void)
4301
0
{
4302
0
  DIR      *xldir;
4303
0
  struct dirent *xlde;
4304
0
  char    path[MAXPGPATH + sizeof(XLOGDIR)];
4305
4306
0
  xldir = AllocateDir(XLOGDIR);
4307
4308
0
  while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4309
0
  {
4310
0
    if (IsBackupHistoryFileName(xlde->d_name))
4311
0
    {
4312
0
      if (XLogArchiveCheckDone(xlde->d_name))
4313
0
      {
4314
0
        elog(DEBUG2, "removing WAL backup history file \"%s\"",
4315
0
           xlde->d_name);
4316
0
        snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
4317
0
        unlink(path);
4318
0
        XLogArchiveCleanup(xlde->d_name);
4319
0
      }
4320
0
    }
4321
0
  }
4322
4323
0
  FreeDir(xldir);
4324
0
}
4325
4326
/*
4327
 * I/O routines for pg_control
4328
 *
4329
 * *ControlFile is a buffer in shared memory that holds an image of the
4330
 * contents of pg_control.  WriteControlFile() initializes pg_control
4331
 * given a preloaded buffer, ReadControlFile() loads the buffer from
4332
 * the pg_control file (during postmaster or standalone-backend startup),
4333
 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4334
 * InitControlFile() fills the buffer with initial values.
4335
 *
4336
 * For simplicity, WriteControlFile() initializes the fields of pg_control
4337
 * that are related to checking backend/database compatibility, and
4338
 * ReadControlFile() verifies they are correct.  We could split out the
4339
 * I/O and compatibility-check functions, but there seems no need currently.
4340
 */
4341
4342
static void
4343
InitControlFile(uint64 sysidentifier, uint32 data_checksum_version)
4344
0
{
4345
0
  char    mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
4346
4347
  /*
4348
   * Generate a random nonce. This is used for authentication requests that
4349
   * will fail because the user does not exist. The nonce is used to create
4350
   * a genuine-looking password challenge for the non-existent user, in lieu
4351
   * of an actual stored password.
4352
   */
4353
0
  if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
4354
0
    ereport(PANIC,
4355
0
        (errcode(ERRCODE_INTERNAL_ERROR),
4356
0
         errmsg("could not generate secret authorization token")));
4357
4358
0
  memset(ControlFile, 0, sizeof(ControlFileData));
4359
  /* Initialize pg_control status fields */
4360
0
  ControlFile->system_identifier = sysidentifier;
4361
0
  memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
4362
0
  ControlFile->state = DB_SHUTDOWNED;
4363
0
  ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
4364
4365
  /* Set important parameter values for use when replaying WAL */
4366
0
  ControlFile->MaxConnections = MaxConnections;
4367
0
  ControlFile->max_worker_processes = max_worker_processes;
4368
0
  ControlFile->max_wal_senders = max_wal_senders;
4369
0
  ControlFile->max_prepared_xacts = max_prepared_xacts;
4370
0
  ControlFile->max_locks_per_xact = max_locks_per_xact;
4371
0
  ControlFile->wal_level = wal_level;
4372
0
  ControlFile->wal_log_hints = wal_log_hints;
4373
0
  ControlFile->track_commit_timestamp = track_commit_timestamp;
4374
0
  ControlFile->data_checksum_version = data_checksum_version;
4375
0
}
4376
4377
static void
4378
WriteControlFile(void)
4379
0
{
4380
0
  int     fd;
4381
0
  char    buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */
4382
4383
  /*
4384
   * Initialize version and compatibility-check fields
4385
   */
4386
0
  ControlFile->pg_control_version = PG_CONTROL_VERSION;
4387
0
  ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4388
4389
0
  ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4390
0
  ControlFile->floatFormat = FLOATFORMAT_VALUE;
4391
4392
0
  ControlFile->blcksz = BLCKSZ;
4393
0
  ControlFile->relseg_size = RELSEG_SIZE;
4394
0
  ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4395
0
  ControlFile->xlog_seg_size = wal_segment_size;
4396
4397
0
  ControlFile->nameDataLen = NAMEDATALEN;
4398
0
  ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4399
4400
0
  ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4401
0
  ControlFile->loblksize = LOBLKSIZE;
4402
4403
0
  ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4404
4405
  /*
4406
   * Initialize the default 'char' signedness.
4407
   *
4408
   * The signedness of the char type is implementation-defined. For instance
4409
   * on x86 architecture CPUs, the char data type is typically treated as
4410
   * signed by default, whereas on AArch64 (ARM) CPUs, it is typically
4411
   * treated as unsigned by default. In v17 or earlier, we accidentally let
4412
   * C implementation signedness affect persistent data. This led to
4413
   * inconsistent results when comparing char data across different
4414
   * platforms.
4415
   *
4416
   * This flag can be used as a hint to ensure consistent behavior for
4417
   * pre-v18 data files that store data sorted by the 'char' type on disk,
4418
   * especially in cross-platform replication scenarios.
4419
   *
4420
   * Newly created database clusters unconditionally set the default char
4421
   * signedness to true. pg_upgrade changes this flag for clusters that were
4422
   * initialized on signedness=false platforms. As a result,
4423
 * the signedness=false setting will become rare over time. If we had known
4424
   * about this problem during the last development cycle that forced initdb
4425
   * (v8.3), we would have made all clusters signed or all clusters
4426
   * unsigned. Making pg_upgrade the only source of signedness=false will
4427
   * cause the population of database clusters to converge toward that
4428
   * retrospective ideal.
4429
   */
4430
0
  ControlFile->default_char_signedness = true;
4431
4432
  /* Contents are protected with a CRC */
4433
0
  INIT_CRC32C(ControlFile->crc);
4434
0
  COMP_CRC32C(ControlFile->crc,
4435
0
        ControlFile,
4436
0
        offsetof(ControlFileData, crc));
4437
0
  FIN_CRC32C(ControlFile->crc);
4438
4439
  /*
4440
   * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
4441
   * the excess over sizeof(ControlFileData).  This reduces the odds of
4442
   * premature-EOF errors when reading pg_control.  We'll still fail when we
4443
   * check the contents of the file, but hopefully with a more specific
4444
   * error than "couldn't read pg_control".
4445
   */
4446
0
  memset(buffer, 0, PG_CONTROL_FILE_SIZE);
4447
0
  memcpy(buffer, ControlFile, sizeof(ControlFileData));
4448
4449
0
  fd = BasicOpenFile(XLOG_CONTROL_FILE,
4450
0
             O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
4451
0
  if (fd < 0)
4452
0
    ereport(PANIC,
4453
0
        (errcode_for_file_access(),
4454
0
         errmsg("could not create file \"%s\": %m",
4455
0
            XLOG_CONTROL_FILE)));
4456
4457
0
  errno = 0;
4458
0
  pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
4459
0
  if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
4460
0
  {
4461
    /* if write didn't set errno, assume problem is no disk space */
4462
0
    if (errno == 0)
4463
0
      errno = ENOSPC;
4464
0
    ereport(PANIC,
4465
0
        (errcode_for_file_access(),
4466
0
         errmsg("could not write to file \"%s\": %m",
4467
0
            XLOG_CONTROL_FILE)));
4468
0
  }
4469
0
  pgstat_report_wait_end();
4470
4471
0
  pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
4472
0
  if (pg_fsync(fd) != 0)
4473
0
    ereport(PANIC,
4474
0
        (errcode_for_file_access(),
4475
0
         errmsg("could not fsync file \"%s\": %m",
4476
0
            XLOG_CONTROL_FILE)));
4477
0
  pgstat_report_wait_end();
4478
4479
0
  if (close(fd) != 0)
4480
0
    ereport(PANIC,
4481
0
        (errcode_for_file_access(),
4482
0
         errmsg("could not close file \"%s\": %m",
4483
0
            XLOG_CONTROL_FILE)));
4484
0
}
4485
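The pg_control checksum above is a plain CRC-32C over the struct up to its crc member, so the file can be sanity-checked outside the server. Below is a minimal standalone sketch, assuming a bitwise reflected CRC-32C (polynomial 0x82F63B78, which is what the INIT/COMP/FIN_CRC32C macros compute); the file name and byte count are passed in rather than taken from ControlFileData, and the byte count would be offsetof(ControlFileData, crc) for a real pg_control file.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Bitwise reflected CRC-32C (Castagnoli, polynomial 0x82F63B78); the same
 * function the INIT/COMP/FIN_CRC32C macros compute, just slowly. */
static uint32_t
crc32c(const unsigned char *buf, size_t len)
{
  uint32_t crc = 0xFFFFFFFF;

  for (size_t i = 0; i < len; i++)
  {
    crc ^= buf[i];
    for (int k = 0; k < 8; k++)
      crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78 : 0);
  }
  return crc ^ 0xFFFFFFFF;
}

/* Usage: crccheck <file> <nbytes> */
int
main(int argc, char **argv)
{
  FILE *f;
  size_t n;
  unsigned char *buf;

  if (argc != 3)
    return 1;
  f = fopen(argv[1], "rb");
  n = strtoul(argv[2], NULL, 0);
  buf = malloc(n);
  if (!f || !buf || fread(buf, 1, n, f) != n)
    return 1;
  printf("CRC-32C over first %zu bytes: %08x\n", n, crc32c(buf, n));
  return 0;
}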
4486
static void
4487
ReadControlFile(void)
4488
0
{
4489
0
  pg_crc32c crc;
4490
0
  int     fd;
4491
0
  char    wal_segsz_str[20];
4492
0
  int     r;
4493
4494
  /*
4495
   * Read data...
4496
   */
4497
0
  fd = BasicOpenFile(XLOG_CONTROL_FILE,
4498
0
             O_RDWR | PG_BINARY);
4499
0
  if (fd < 0)
4500
0
    ereport(PANIC,
4501
0
        (errcode_for_file_access(),
4502
0
         errmsg("could not open file \"%s\": %m",
4503
0
            XLOG_CONTROL_FILE)));
4504
4505
0
  pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
4506
0
  r = read(fd, ControlFile, sizeof(ControlFileData));
4507
0
  if (r != sizeof(ControlFileData))
4508
0
  {
4509
0
    if (r < 0)
4510
0
      ereport(PANIC,
4511
0
          (errcode_for_file_access(),
4512
0
           errmsg("could not read file \"%s\": %m",
4513
0
              XLOG_CONTROL_FILE)));
4514
0
    else
4515
0
      ereport(PANIC,
4516
0
          (errcode(ERRCODE_DATA_CORRUPTED),
4517
0
           errmsg("could not read file \"%s\": read %d of %zu",
4518
0
              XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
4519
0
  }
4520
0
  pgstat_report_wait_end();
4521
4522
0
  close(fd);
4523
4524
  /*
4525
   * Check for expected pg_control format version.  If this is wrong, the
4526
   * CRC check will likely fail because we'll be checking the wrong number
4527
   * of bytes.  Complaining about wrong version will probably be more
4528
   * enlightening than complaining about wrong CRC.
4529
   */
4530
4531
0
  if (ControlFile->pg_control_version != PG_CONTROL_VERSION &&
    ControlFile->pg_control_version % 65536 == 0 &&
    ControlFile->pg_control_version / 65536 != 0)
4532
0
    ereport(FATAL,
4533
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4534
0
         errmsg("database files are incompatible with server"),
4535
0
         errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4536
0
               " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4537
0
               ControlFile->pg_control_version, ControlFile->pg_control_version,
4538
0
               PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4539
0
         errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4540
4541
0
  if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4542
0
    ereport(FATAL,
4543
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4544
0
         errmsg("database files are incompatible with server"),
4545
0
         errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4546
0
               " but the server was compiled with PG_CONTROL_VERSION %d.",
4547
0
               ControlFile->pg_control_version, PG_CONTROL_VERSION),
4548
0
         errhint("It looks like you need to initdb.")));
4549
4550
  /* Now check the CRC. */
4551
0
  INIT_CRC32C(crc);
4552
0
  COMP_CRC32C(crc,
4553
0
        ControlFile,
4554
0
        offsetof(ControlFileData, crc));
4555
0
  FIN_CRC32C(crc);
4556
4557
0
  if (!EQ_CRC32C(crc, ControlFile->crc))
4558
0
    ereport(FATAL,
4559
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4560
0
         errmsg("incorrect checksum in control file")));
4561
4562
  /*
4563
   * Do compatibility checking immediately.  If the database isn't
4564
   * compatible with the backend executable, we want to abort before we can
4565
   * possibly do any damage.
4566
   */
4567
0
  if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4568
0
    ereport(FATAL,
4569
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4570
0
         errmsg("database files are incompatible with server"),
4571
    /* translator: %s is a variable name and %d is its value */
4572
0
         errdetail("The database cluster was initialized with %s %d,"
4573
0
               " but the server was compiled with %s %d.",
4574
0
               "CATALOG_VERSION_NO", ControlFile->catalog_version_no,
4575
0
               "CATALOG_VERSION_NO", CATALOG_VERSION_NO),
4576
0
         errhint("It looks like you need to initdb.")));
4577
0
  if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4578
0
    ereport(FATAL,
4579
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4580
0
         errmsg("database files are incompatible with server"),
4581
    /* translator: %s is a variable name and %d is its value */
4582
0
         errdetail("The database cluster was initialized with %s %d,"
4583
0
               " but the server was compiled with %s %d.",
4584
0
               "MAXALIGN", ControlFile->maxAlign,
4585
0
               "MAXALIGN", MAXIMUM_ALIGNOF),
4586
0
         errhint("It looks like you need to initdb.")));
4587
0
  if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4588
0
    ereport(FATAL,
4589
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4590
0
         errmsg("database files are incompatible with server"),
4591
0
         errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4592
0
         errhint("It looks like you need to initdb.")));
4593
0
  if (ControlFile->blcksz != BLCKSZ)
4594
0
    ereport(FATAL,
4595
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4596
0
         errmsg("database files are incompatible with server"),
4597
    /* translator: %s is a variable name and %d is its value */
4598
0
         errdetail("The database cluster was initialized with %s %d,"
4599
0
               " but the server was compiled with %s %d.",
4600
0
               "BLCKSZ", ControlFile->blcksz,
4601
0
               "BLCKSZ", BLCKSZ),
4602
0
         errhint("It looks like you need to recompile or initdb.")));
4603
0
  if (ControlFile->relseg_size != RELSEG_SIZE)
4604
0
    ereport(FATAL,
4605
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4606
0
         errmsg("database files are incompatible with server"),
4607
    /* translator: %s is a variable name and %d is its value */
4608
0
         errdetail("The database cluster was initialized with %s %d,"
4609
0
               " but the server was compiled with %s %d.",
4610
0
               "RELSEG_SIZE", ControlFile->relseg_size,
4611
0
               "RELSEG_SIZE", RELSEG_SIZE),
4612
0
         errhint("It looks like you need to recompile or initdb.")));
4613
0
  if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4614
0
    ereport(FATAL,
4615
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4616
0
         errmsg("database files are incompatible with server"),
4617
    /* translator: %s is a variable name and %d is its value */
4618
0
         errdetail("The database cluster was initialized with %s %d,"
4619
0
               " but the server was compiled with %s %d.",
4620
0
               "XLOG_BLCKSZ", ControlFile->xlog_blcksz,
4621
0
               "XLOG_BLCKSZ", XLOG_BLCKSZ),
4622
0
         errhint("It looks like you need to recompile or initdb.")));
4623
0
  if (ControlFile->nameDataLen != NAMEDATALEN)
4624
0
    ereport(FATAL,
4625
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4626
0
         errmsg("database files are incompatible with server"),
4627
    /* translator: %s is a variable name and %d is its value */
4628
0
         errdetail("The database cluster was initialized with %s %d,"
4629
0
               " but the server was compiled with %s %d.",
4630
0
               "NAMEDATALEN", ControlFile->nameDataLen,
4631
0
               "NAMEDATALEN", NAMEDATALEN),
4632
0
         errhint("It looks like you need to recompile or initdb.")));
4633
0
  if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4634
0
    ereport(FATAL,
4635
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4636
0
         errmsg("database files are incompatible with server"),
4637
    /* translator: %s is a variable name and %d is its value */
4638
0
         errdetail("The database cluster was initialized with %s %d,"
4639
0
               " but the server was compiled with %s %d.",
4640
0
               "INDEX_MAX_KEYS", ControlFile->indexMaxKeys,
4641
0
               "INDEX_MAX_KEYS", INDEX_MAX_KEYS),
4642
0
         errhint("It looks like you need to recompile or initdb.")));
4643
0
  if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4644
0
    ereport(FATAL,
4645
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4646
0
         errmsg("database files are incompatible with server"),
4647
    /* translator: %s is a variable name and %d is its value */
4648
0
         errdetail("The database cluster was initialized with %s %d,"
4649
0
               " but the server was compiled with %s %d.",
4650
0
               "TOAST_MAX_CHUNK_SIZE", ControlFile->toast_max_chunk_size,
4651
0
               "TOAST_MAX_CHUNK_SIZE", (int) TOAST_MAX_CHUNK_SIZE),
4652
0
         errhint("It looks like you need to recompile or initdb.")));
4653
0
  if (ControlFile->loblksize != LOBLKSIZE)
4654
0
    ereport(FATAL,
4655
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4656
0
         errmsg("database files are incompatible with server"),
4657
    /* translator: %s is a variable name and %d is its value */
4658
0
         errdetail("The database cluster was initialized with %s %d,"
4659
0
               " but the server was compiled with %s %d.",
4660
0
               "LOBLKSIZE", ControlFile->loblksize,
4661
0
               "LOBLKSIZE", (int) LOBLKSIZE),
4662
0
         errhint("It looks like you need to recompile or initdb.")));
4663
4664
0
#ifdef USE_FLOAT8_BYVAL
4665
0
  if (ControlFile->float8ByVal != true)
4666
0
    ereport(FATAL,
4667
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4668
0
         errmsg("database files are incompatible with server"),
4669
0
         errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4670
0
               " but the server was compiled with USE_FLOAT8_BYVAL."),
4671
0
         errhint("It looks like you need to recompile or initdb.")));
4672
#else
4673
  if (ControlFile->float8ByVal != false)
4674
    ereport(FATAL,
4675
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
4676
         errmsg("database files are incompatible with server"),
4677
         errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4678
               " but the server was compiled without USE_FLOAT8_BYVAL."),
4679
         errhint("It looks like you need to recompile or initdb.")));
4680
#endif
4681
4682
0
  wal_segment_size = ControlFile->xlog_seg_size;
4683
4684
0
  if (!IsValidWalSegSize(wal_segment_size))
4685
0
    ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4686
0
            errmsg_plural("invalid WAL segment size in control file (%d byte)",
4687
0
                    "invalid WAL segment size in control file (%d bytes)",
4688
0
                    wal_segment_size,
4689
0
                    wal_segment_size),
4690
0
            errdetail("The WAL segment size must be a power of two between 1 MB and 1 GB.")));
4691
4692
0
  snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
4693
0
  SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
4694
0
          PGC_S_DYNAMIC_DEFAULT);
4695
4696
  /* check and update variables dependent on wal_segment_size */
4697
0
  if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
4698
0
    ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4699
    /* translator: both %s are GUC names */
4700
0
            errmsg("\"%s\" must be at least twice \"%s\"",
4701
0
                 "min_wal_size", "wal_segment_size")));
4702
4703
0
  if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
4704
0
    ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4705
    /* translator: both %s are GUC names */
4706
0
            errmsg("\"%s\" must be at least twice \"%s\"",
4707
0
                 "max_wal_size", "wal_segment_size")));
4708
4709
0
  UsableBytesInSegment =
4710
0
    (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
4711
0
    (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
4712
4713
0
  CalculateCheckpointSegments();
4714
4715
  /* Make the initdb settings visible as GUC variables, too */
4716
0
  SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4717
0
          PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
4718
0
}
4719
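For a sense of the arithmetic at the end of ReadControlFile(): with the default 16 MB segments and 8 kB WAL blocks, and assuming the usual 24-byte short and 40-byte long page headers (the real values are SizeOfXLogShortPHD and SizeOfXLogLongPHD), the usable payload per segment works out to 16,728,048 bytes. A sketch under those assumptions, including the power-of-two test that IsValidWalSegSize() essentially performs:

#include <stdio.h>

/* Essentially IsValidWalSegSize(): a power of two between 1 MB and 1 GB. */
static int
is_valid_wal_seg_size(long sz)
{
  return sz >= 1024 * 1024 && sz <= 1024 * 1024 * 1024 &&
    (sz & (sz - 1)) == 0;
}

int
main(void)
{
  const long wal_segment_size = 16 * 1024 * 1024; /* default 16 MB */
  const long xlog_blcksz = 8192;  /* default 8 kB */
  const long short_phd = 24;      /* assumed SizeOfXLogShortPHD */
  const long long_phd = 40;       /* assumed SizeOfXLogLongPHD */

  /* Every page loses a short header; the first page of each segment
   * carries the long header instead, costing the extra difference. */
  long usable = (wal_segment_size / xlog_blcksz) * (xlog_blcksz - short_phd) -
    (long_phd - short_phd);

  printf("valid size: %d\n", is_valid_wal_seg_size(wal_segment_size)); /* 1 */
  printf("usable bytes per segment: %ld\n", usable); /* 16728048 */
  return 0;
}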
4720
/*
4721
 * Utility wrapper to update the control file.  Note that the control
4722
 * file gets flushed.
4723
 */
4724
static void
4725
UpdateControlFile(void)
4726
0
{
4727
0
  update_controlfile(DataDir, ControlFile, true);
4728
0
}
4729
4730
/*
4731
 * Returns the unique system identifier from control file.
4732
 */
4733
uint64
4734
GetSystemIdentifier(void)
4735
0
{
4736
0
  Assert(ControlFile != NULL);
4737
0
  return ControlFile->system_identifier;
4738
0
}
4739
4740
/*
4741
 * Returns the random nonce from control file.
4742
 */
4743
char *
4744
GetMockAuthenticationNonce(void)
4745
0
{
4746
0
  Assert(ControlFile != NULL);
4747
0
  return ControlFile->mock_authentication_nonce;
4748
0
}
4749
4750
/*
4751
 * Are checksums enabled for data pages?
4752
 */
4753
bool
4754
DataChecksumsEnabled(void)
4755
0
{
4756
0
  Assert(ControlFile != NULL);
4757
0
  return (ControlFile->data_checksum_version > 0);
4758
0
}
4759
4760
/*
4761
 * Return true if the cluster was initialized on a platform where the
4762
 * default signedness of char is "signed". This function exists for code
4763
 * that deals with pre-v18 data files that store data sorted by the 'char'
4764
 * type on disk (e.g., GIN and GiST indexes). See the comments in
4765
 * WriteControlFile() for details.
4766
 */
4767
bool
4768
GetDefaultCharSignedness(void)
4769
0
{
4770
0
  return ControlFile->default_char_signedness;
4771
0
}
4772
4773
/*
4774
 * Returns a fake LSN for unlogged relations.
4775
 *
4776
 * Each call generates an LSN that is greater than any previous value
4777
 * returned. The current counter value is saved and restored across clean
4778
 * shutdowns, but like unlogged relations, does not survive a crash. This can
4779
 * be used in lieu of real LSN values returned by XLogInsert, if you need an
4780
 * LSN-like increasing sequence of numbers without writing any WAL.
4781
 */
4782
XLogRecPtr
4783
GetFakeLSNForUnloggedRel(void)
4784
0
{
4785
0
  return pg_atomic_fetch_add_u64(&XLogCtl->unloggedLSN, 1);
4786
0
}
4787
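The fake-LSN generator above is just a lock-free fetch-and-add on a shared 64-bit counter, so every caller sees a distinct, strictly increasing value. A minimal sketch using C11 atomics in place of PostgreSQL's pg_atomic wrappers; the starting value of 1000 merely stands in for FirstNormalUnloggedLSN:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for XLogCtl->unloggedLSN. */
static _Atomic uint64_t fake_lsn = 1000;

static uint64_t
get_fake_lsn(void)
{
  /* Returns the pre-increment value, like pg_atomic_fetch_add_u64(). */
  return atomic_fetch_add(&fake_lsn, 1);
}

int
main(void)
{
  uint64_t a = get_fake_lsn();
  uint64_t b = get_fake_lsn();

  printf("%llu %llu\n", (unsigned long long) a, (unsigned long long) b);
  return 0;  /* prints "1000 1001" */
}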
4788
/*
4789
 * Auto-tune the number of XLOG buffers.
4790
 *
4791
 * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4792
 * a maximum of one XLOG segment (there is little reason to think that more
4793
 * is helpful, at least so long as we force an fsync when switching log files)
4794
 * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4795
 * 9.1, when auto-tuning was added).
4796
 *
4797
 * This should not be called until NBuffers has received its final value.
4798
 */
4799
static int
4800
XLOGChooseNumBuffers(void)
4801
0
{
4802
0
  int     xbuffers;
4803
4804
0
  xbuffers = NBuffers / 32;
4805
0
  if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
4806
0
    xbuffers = (wal_segment_size / XLOG_BLCKSZ);
4807
0
  if (xbuffers < 8)
4808
0
    xbuffers = 8;
4809
0
  return xbuffers;
4810
0
}
4811
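A worked example of the auto-tuning rule above: with 128 MB of shared buffers (NBuffers = 16384 at 8 kB) and 16 MB segments, the 1/32 share is 512 blocks (4 MB), which is under the one-segment cap of 2048 blocks and above the floor of 8. A standalone sketch of the same logic:

#include <stdio.h>

/* Mirror of XLOGChooseNumBuffers(): ~3% of shared_buffers (NBuffers / 32),
 * capped at one WAL segment's worth of blocks, floored at 8. */
static int
choose_num_buffers(int nbuffers, int wal_segment_size, int xlog_blcksz)
{
  int xbuffers = nbuffers / 32;

  if (xbuffers > wal_segment_size / xlog_blcksz)
    xbuffers = wal_segment_size / xlog_blcksz;
  if (xbuffers < 8)
    xbuffers = 8;
  return xbuffers;
}

int
main(void)
{
  /* 128 MB of 8 kB shared buffers, 16 MB segments -> 512 buffers (4 MB) */
  printf("%d\n", choose_num_buffers(16384, 16 * 1024 * 1024, 8192));
  return 0;
}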
4812
/*
4813
 * GUC check_hook for wal_buffers
4814
 */
4815
bool
4816
check_wal_buffers(int *newval, void **extra, GucSource source)
4817
0
{
4818
  /*
4819
   * -1 indicates a request for auto-tune.
4820
   */
4821
0
  if (*newval == -1)
4822
0
  {
4823
    /*
4824
     * If we haven't yet changed the boot_val default of -1, just let it
4825
     * be.  We'll fix it when XLOGShmemSize is called.
4826
     */
4827
0
    if (XLOGbuffers == -1)
4828
0
      return true;
4829
4830
    /* Otherwise, substitute the auto-tune value */
4831
0
    *newval = XLOGChooseNumBuffers();
4832
0
  }
4833
4834
  /*
4835
   * We clamp manually-set values to at least 4 blocks.  Prior to PostgreSQL
4836
   * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4837
   * the case, we just silently treat such values as a request for the
4838
   * minimum.  (We could throw an error instead, but that doesn't seem very
4839
   * helpful.)
4840
   */
4841
0
  if (*newval < 4)
4842
0
    *newval = 4;
4843
4844
0
  return true;
4845
0
}
4846
4847
/*
4848
 * GUC check_hook for wal_consistency_checking
4849
 */
4850
bool
4851
check_wal_consistency_checking(char **newval, void **extra, GucSource source)
4852
0
{
4853
0
  char     *rawstring;
4854
0
  List     *elemlist;
4855
0
  ListCell   *l;
4856
0
  bool    newwalconsistency[RM_MAX_ID + 1];
4857
4858
  /* Initialize the array */
4859
0
  MemSet(newwalconsistency, 0, (RM_MAX_ID + 1) * sizeof(bool));
4860
4861
  /* Need a modifiable copy of string */
4862
0
  rawstring = pstrdup(*newval);
4863
4864
  /* Parse string into list of identifiers */
4865
0
  if (!SplitIdentifierString(rawstring, ',', &elemlist))
4866
0
  {
4867
    /* syntax error in list */
4868
0
    GUC_check_errdetail("List syntax is invalid.");
4869
0
    pfree(rawstring);
4870
0
    list_free(elemlist);
4871
0
    return false;
4872
0
  }
4873
4874
0
  foreach(l, elemlist)
4875
0
  {
4876
0
    char     *tok = (char *) lfirst(l);
4877
0
    int     rmid;
4878
4879
    /* Check for 'all'. */
4880
0
    if (pg_strcasecmp(tok, "all") == 0)
4881
0
    {
4882
0
      for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
4883
0
        if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL)
4884
0
          newwalconsistency[rmid] = true;
4885
0
    }
4886
0
    else
4887
0
    {
4888
      /* Check if the token matches any known resource manager. */
4889
0
      bool    found = false;
4890
4891
0
      for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
4892
0
      {
4893
0
        if (RmgrIdExists(rmid) && GetRmgr(rmid).rm_mask != NULL &&
4894
0
          pg_strcasecmp(tok, GetRmgr(rmid).rm_name) == 0)
4895
0
        {
4896
0
          newwalconsistency[rmid] = true;
4897
0
          found = true;
4898
0
          break;
4899
0
        }
4900
0
      }
4901
0
      if (!found)
4902
0
      {
4903
        /*
4904
         * During startup, it might be a not-yet-loaded custom
4905
         * resource manager.  Defer checking until
4906
         * InitializeWalConsistencyChecking().
4907
         */
4908
0
        if (!process_shared_preload_libraries_done)
4909
0
        {
4910
0
          check_wal_consistency_checking_deferred = true;
4911
0
        }
4912
0
        else
4913
0
        {
4914
0
          GUC_check_errdetail("Unrecognized key word: \"%s\".", tok);
4915
0
          pfree(rawstring);
4916
0
          list_free(elemlist);
4917
0
          return false;
4918
0
        }
4919
0
      }
4920
0
    }
4921
0
  }
4922
4923
0
  pfree(rawstring);
4924
0
  list_free(elemlist);
4925
4926
  /* assign new value */
4927
0
  *extra = guc_malloc(LOG, (RM_MAX_ID + 1) * sizeof(bool));
4928
0
  if (!*extra)
4929
0
    return false;
4930
0
  memcpy(*extra, newwalconsistency, (RM_MAX_ID + 1) * sizeof(bool));
4931
0
  return true;
4932
0
}
4933
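The check hook above boils down to parsing a comma-separated identifier list with one magic keyword, "all". A stripped-down sketch of the same loop using strtok() and a hypothetical three-entry rmgr table; the real code uses SplitIdentifierString() and the RmgrIdExists()/GetRmgr() table instead:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

#define N_RMGRS 3

static const char *rm_names[N_RMGRS] = {"XLOG", "Heap", "Btree"};

int
main(void)
{
  char guc[] = "heap,btree";  /* example setting */
  bool enabled[N_RMGRS] = {false, false, false};

  for (char *tok = strtok(guc, ","); tok != NULL; tok = strtok(NULL, ","))
  {
    if (strcasecmp(tok, "all") == 0)
    {
      for (int i = 0; i < N_RMGRS; i++)
        enabled[i] = true;
    }
    else
    {
      for (int i = 0; i < N_RMGRS; i++)
        if (strcasecmp(tok, rm_names[i]) == 0)
          enabled[i] = true;
    }
  }
  for (int i = 0; i < N_RMGRS; i++)
    printf("%s: %s\n", rm_names[i], enabled[i] ? "on" : "off");
  return 0;
}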
4934
/*
4935
 * GUC assign_hook for wal_consistency_checking
4936
 */
4937
void
4938
assign_wal_consistency_checking(const char *newval, void *extra)
4939
0
{
4940
  /*
4941
   * If some checks were deferred, it's possible that the checks will fail
4942
   * later during InitializeWalConsistencyChecking(). But in that case, the
4943
   * postmaster will exit anyway, so it's safe to proceed with the
4944
   * assignment.
4945
   *
4946
   * Any built-in resource managers specified are assigned immediately,
4947
   * which affects WAL created before shared_preload_libraries are
4948
   * processed. Any custom resource managers specified won't be assigned
4949
   * until after shared_preload_libraries are processed, but that's OK
4950
   * because WAL for a custom resource manager can't be written before the
4951
   * module is loaded anyway.
4952
   */
4953
0
  wal_consistency_checking = extra;
4954
0
}
4955
4956
/*
4957
 * InitializeWalConsistencyChecking: run after loading custom resource managers
4958
 *
4959
 * If any unknown resource managers were specified in the
4960
 * wal_consistency_checking GUC, processing was deferred.  Now that
4961
 * shared_preload_libraries have been loaded, process wal_consistency_checking
4962
 * again.
4963
 */
4964
void
4965
InitializeWalConsistencyChecking(void)
4966
0
{
4967
0
  Assert(process_shared_preload_libraries_done);
4968
4969
0
  if (check_wal_consistency_checking_deferred)
4970
0
  {
4971
0
    struct config_generic *guc;
4972
4973
0
    guc = find_option("wal_consistency_checking", false, false, ERROR);
4974
4975
0
    check_wal_consistency_checking_deferred = false;
4976
4977
0
    set_config_option_ext("wal_consistency_checking",
4978
0
                wal_consistency_checking_string,
4979
0
                guc->scontext, guc->source, guc->srole,
4980
0
                GUC_ACTION_SET, true, ERROR, false);
4981
4982
    /* checking should not be deferred again */
4983
0
    Assert(!check_wal_consistency_checking_deferred);
4984
0
  }
4985
0
}
4986
4987
/*
4988
 * GUC show_hook for archive_command
4989
 */
4990
const char *
4991
show_archive_command(void)
4992
0
{
4993
0
  if (XLogArchivingActive())
4994
0
    return XLogArchiveCommand;
4995
0
  else
4996
0
    return "(disabled)";
4997
0
}
4998
4999
/*
5000
 * GUC show_hook for in_hot_standby
5001
 */
5002
const char *
5003
show_in_hot_standby(void)
5004
0
{
5005
  /*
5006
   * We display the actual state based on shared memory, so that this GUC
5007
   * reports up-to-date state if examined intra-query.  The underlying
5008
   * variable (in_hot_standby_guc) changes only when we transmit a new value
5009
   * to the client.
5010
   */
5011
0
  return RecoveryInProgress() ? "on" : "off";
5012
0
}
5013
5014
/*
5015
 * Read the control file, set respective GUCs.
5016
 *
5017
 * This is to be called during startup, including a crash recovery cycle,
5018
 * unless in bootstrap mode, where no control file yet exists.  As there's no
5019
 * usable shared memory yet (its sizing can depend on the contents of the
5020
 * control file!), first store the contents in local memory. XLOGShmemInit()
5021
 * will then copy it to shared memory later.
5022
 *
5023
 * reset just controls whether previous contents are to be expected (in the
5024
 * reset case, there's a dangling pointer into old shared memory), or not.
5025
 */
5026
void
5027
LocalProcessControlFile(bool reset)
5028
0
{
5029
0
  Assert(reset || ControlFile == NULL);
5030
0
  ControlFile = palloc(sizeof(ControlFileData));
5031
0
  ReadControlFile();
5032
0
}
5033
5034
/*
5035
 * Get the wal_level from the control file. For a standby, this value should be
5036
 * considered its active wal_level, because it may be different from what
5037
 * was originally configured on the standby.
5038
 */
5039
WalLevel
5040
GetActiveWalLevelOnStandby(void)
5041
0
{
5042
0
  return ControlFile->wal_level;
5043
0
}
5044
5045
/*
5046
 * Initialization of shared memory for XLOG
5047
 */
5048
Size
5049
XLOGShmemSize(void)
5050
0
{
5051
0
  Size    size;
5052
5053
  /*
5054
   * If the value of wal_buffers is -1, use the preferred auto-tune value.
5055
   * This isn't an amazingly clean place to do this, but we must wait till
5056
   * NBuffers has received its final value, and must do it before using the
5057
   * value of XLOGbuffers to do anything important.
5058
   *
5059
   * We prefer to report this value's source as PGC_S_DYNAMIC_DEFAULT.
5060
   * However, if the DBA explicitly set wal_buffers = -1 in the config file,
5061
   * then PGC_S_DYNAMIC_DEFAULT will fail to override that and we must force
5062
   * the matter with PGC_S_OVERRIDE.
5063
   */
5064
0
  if (XLOGbuffers == -1)
5065
0
  {
5066
0
    char    buf[32];
5067
5068
0
    snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
5069
0
    SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
5070
0
            PGC_S_DYNAMIC_DEFAULT);
5071
0
    if (XLOGbuffers == -1) /* failed to apply it? */
5072
0
      SetConfigOption("wal_buffers", buf, PGC_POSTMASTER,
5073
0
              PGC_S_OVERRIDE);
5074
0
  }
5075
0
  Assert(XLOGbuffers > 0);
5076
5077
  /* XLogCtl */
5078
0
  size = sizeof(XLogCtlData);
5079
5080
  /* WAL insertion locks, plus alignment */
5081
0
  size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
5082
  /* xlblocks array */
5083
0
  size = add_size(size, mul_size(sizeof(pg_atomic_uint64), XLOGbuffers));
5084
  /* extra alignment padding for XLOG I/O buffers */
5085
0
  size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE));
5086
  /* and the buffers themselves */
5087
0
  size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
5088
5089
  /*
5090
   * Note: we don't count ControlFileData, it comes out of the "slop factor"
5091
   * added by CreateSharedMemoryAndSemaphores.  This lets us use this
5092
   * routine again below to compute the actual allocation size.
5093
   */
5094
5095
0
  return size;
5096
0
}
5097
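add_size() and mul_size() used above are overflow-checked Size arithmetic; the real functions ereport(ERROR) on overflow. A sketch of the same checks (abort() stands in for the error report), sized for the xlblocks array plus page buffers of a hypothetical 512-buffer configuration:

#include <stdio.h>
#include <stdlib.h>

static size_t
add_size_checked(size_t a, size_t b)
{
  size_t r = a + b;

  if (r < a)
    abort();  /* overflow */
  return r;
}

static size_t
mul_size_checked(size_t a, size_t b)
{
  if (a != 0 && b > (size_t) -1 / a)
    abort();  /* overflow */
  return a * b;
}

int
main(void)
{
  /* xlblocks (8 bytes each) plus page buffers for 512 XLOG buffers */
  size_t sz = mul_size_checked(sizeof(unsigned long long), 512);

  sz = add_size_checked(sz, mul_size_checked(8192, 512));
  printf("%zu\n", sz);  /* 4198400 */
  return 0;
}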
5098
void
5099
XLOGShmemInit(void)
5100
0
{
5101
0
  bool    foundCFile,
5102
0
        foundXLog;
5103
0
  char     *allocptr;
5104
0
  int     i;
5105
0
  ControlFileData *localControlFile;
5106
5107
#ifdef WAL_DEBUG
5108
5109
  /*
5110
   * Create a memory context for WAL debugging that's exempt from the normal
5111
   * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
5112
   * an allocation fails, but wal_debug is not for production use anyway.
5113
   */
5114
  if (walDebugCxt == NULL)
5115
  {
5116
    walDebugCxt = AllocSetContextCreate(TopMemoryContext,
5117
                      "WAL Debug",
5118
                      ALLOCSET_DEFAULT_SIZES);
5119
    MemoryContextAllowInCriticalSection(walDebugCxt, true);
5120
  }
5121
#endif
5122
5123
5124
0
  XLogCtl = (XLogCtlData *)
5125
0
    ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
5126
5127
0
  localControlFile = ControlFile;
5128
0
  ControlFile = (ControlFileData *)
5129
0
    ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
5130
5131
0
  if (foundCFile || foundXLog)
5132
0
  {
5133
    /* both should be present or neither */
5134
0
    Assert(foundCFile && foundXLog);
5135
5136
    /* Initialize local copy of WALInsertLocks */
5137
0
    WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
5138
5139
0
    if (localControlFile)
5140
0
      pfree(localControlFile);
5141
0
    return;
5142
0
  }
5143
0
  memset(XLogCtl, 0, sizeof(XLogCtlData));
5144
5145
  /*
5146
   * Already have read control file locally, unless in bootstrap mode. Move
5147
   * contents into shared memory.
5148
   */
5149
0
  if (localControlFile)
5150
0
  {
5151
0
    memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
5152
0
    pfree(localControlFile);
5153
0
  }
5154
5155
  /*
5156
   * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
5157
   * multiple of the alignment for same, so no extra alignment padding is
5158
   * needed here.
5159
   */
5160
0
  allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
5161
0
  XLogCtl->xlblocks = (pg_atomic_uint64 *) allocptr;
5162
0
  allocptr += sizeof(pg_atomic_uint64) * XLOGbuffers;
5163
5164
0
  for (i = 0; i < XLOGbuffers; i++)
5165
0
  {
5166
0
    pg_atomic_init_u64(&XLogCtl->xlblocks[i], InvalidXLogRecPtr);
5167
0
  }
5168
5169
  /* WAL insertion locks. Ensure they're aligned to the full padded size */
5170
0
  allocptr += sizeof(WALInsertLockPadded) -
5171
0
    ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
5172
0
  WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
5173
0
    (WALInsertLockPadded *) allocptr;
5174
0
  allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
5175
5176
0
  for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
5177
0
  {
5178
0
    LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
5179
0
    pg_atomic_init_u64(&WALInsertLocks[i].l.insertingAt, InvalidXLogRecPtr);
5180
0
    WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
5181
0
  }
5182
5183
  /*
5184
   * Align the start of the page buffers to a full xlog block size boundary.
5185
   * This simplifies some calculations in XLOG insertion. It is also
5186
   * required for O_DIRECT.
5187
   */
5188
0
  allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
5189
0
  XLogCtl->pages = allocptr;
5190
0
  memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
5191
5192
  /*
5193
   * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5194
   * in additional info.)
5195
   */
5196
0
  XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
5197
0
  XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
5198
0
  XLogCtl->InstallXLogFileSegmentActive = false;
5199
0
  XLogCtl->WalWriterSleeping = false;
5200
5201
0
  SpinLockInit(&XLogCtl->Insert.insertpos_lck);
5202
0
  SpinLockInit(&XLogCtl->info_lck);
5203
0
  pg_atomic_init_u64(&XLogCtl->logInsertResult, InvalidXLogRecPtr);
5204
0
  pg_atomic_init_u64(&XLogCtl->logWriteResult, InvalidXLogRecPtr);
5205
0
  pg_atomic_init_u64(&XLogCtl->logFlushResult, InvalidXLogRecPtr);
5206
0
  pg_atomic_init_u64(&XLogCtl->unloggedLSN, InvalidXLogRecPtr);
5207
5208
0
  pg_atomic_init_u64(&XLogCtl->InitializeReserved, InvalidXLogRecPtr);
5209
0
  pg_atomic_init_u64(&XLogCtl->InitializedUpTo, InvalidXLogRecPtr);
5210
0
  ConditionVariableInit(&XLogCtl->InitializedUpToCondVar);
5211
0
}
5212
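XLOGShmemInit() uses two round-up-to-boundary idioms: the TYPEALIGN form for the page buffers (valid for power-of-two boundaries) and a modulo form for the insert-lock array. Note that the modulo form advances by a full boundary even when the pointer is already aligned, which is why XLOGShmemSize() reserves space for NUM_XLOGINSERT_LOCKS + 1 padded locks. A sketch of both:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
  char arena[256];
  char *p = arena + 3;  /* deliberately odd offset */
  uintptr_t b = 64;     /* example boundary */

  /* TYPEALIGN-style round-up; correct for power-of-two boundaries. */
  char *aligned = (char *) (((uintptr_t) p + (b - 1)) & ~(b - 1));

  /* The modulo form used for the insert locks; it always moves forward,
   * even from an already-aligned pointer. */
  char *bumped = p + (b - (uintptr_t) p % b);

  printf("%d %d\n",
       (int) ((uintptr_t) aligned % b),
       (int) ((uintptr_t) bumped % b));  /* 0 0 */
  return 0;
}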
5213
/*
5214
 * This function must be called ONCE on system install.  It creates pg_control
5215
 * and the initial XLOG segment.
5216
 */
5217
void
5218
BootStrapXLOG(uint32 data_checksum_version)
5219
0
{
5220
0
  CheckPoint  checkPoint;
5221
0
  char     *buffer;
5222
0
  XLogPageHeader page;
5223
0
  XLogLongPageHeader longpage;
5224
0
  XLogRecord *record;
5225
0
  char     *recptr;
5226
0
  uint64    sysidentifier;
5227
0
  struct timeval tv;
5228
0
  pg_crc32c crc;
5229
5230
  /* allow ordinary WAL segment creation, like StartupXLOG() would */
5231
0
  SetInstallXLogFileSegmentActive();
5232
5233
  /*
5234
   * Select a hopefully-unique system identifier code for this installation.
5235
   * We use the result of gettimeofday(), including the fractional seconds
5236
   * field, as being about as unique as we can easily get.  (Think not to
5237
   * use random(), since it hasn't been seeded and there's no portable way
5238
   * to seed it other than the system clock value...)  The upper half of the
5239
   * uint64 value is just the tv_sec part, while the lower half contains the
5240
   * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
5241
   * PID for a little extra uniqueness.  A person knowing this encoding can
5242
   * determine the initialization time of the installation, which could
5243
   * perhaps be useful sometimes.
5244
   */
5245
0
  gettimeofday(&tv, NULL);
5246
0
  sysidentifier = ((uint64) tv.tv_sec) << 32;
5247
0
  sysidentifier |= ((uint64) tv.tv_usec) << 12;
5248
0
  sysidentifier |= getpid() & 0xFFF;
5249
5250
  /* page buffer must be aligned suitably for O_DIRECT */
5251
0
  buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5252
0
  page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5253
0
  memset(page, 0, XLOG_BLCKSZ);
5254
5255
  /*
5256
   * Set up information for the initial checkpoint record
5257
   *
5258
   * The initial checkpoint record is written to the beginning of the WAL
5259
   * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5260
   * used, so that we can use 0/0 to mean "before any valid WAL segment".
5261
   */
5262
0
  checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
5263
0
  checkPoint.ThisTimeLineID = BootstrapTimeLineID;
5264
0
  checkPoint.PrevTimeLineID = BootstrapTimeLineID;
5265
0
  checkPoint.fullPageWrites = fullPageWrites;
5266
0
  checkPoint.wal_level = wal_level;
5267
0
  checkPoint.nextXid =
5268
0
    FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
5269
0
  checkPoint.nextOid = FirstGenbkiObjectId;
5270
0
  checkPoint.nextMulti = FirstMultiXactId;
5271
0
  checkPoint.nextMultiOffset = 0;
5272
0
  checkPoint.oldestXid = FirstNormalTransactionId;
5273
0
  checkPoint.oldestXidDB = Template1DbOid;
5274
0
  checkPoint.oldestMulti = FirstMultiXactId;
5275
0
  checkPoint.oldestMultiDB = Template1DbOid;
5276
0
  checkPoint.oldestCommitTsXid = InvalidTransactionId;
5277
0
  checkPoint.newestCommitTsXid = InvalidTransactionId;
5278
0
  checkPoint.time = (pg_time_t) time(NULL);
5279
0
  checkPoint.oldestActiveXid = InvalidTransactionId;
5280
5281
0
  TransamVariables->nextXid = checkPoint.nextXid;
5282
0
  TransamVariables->nextOid = checkPoint.nextOid;
5283
0
  TransamVariables->oidCount = 0;
5284
0
  MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5285
0
  AdvanceOldestClogXid(checkPoint.oldestXid);
5286
0
  SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5287
0
  SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
5288
0
  SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
5289
5290
  /* Set up the XLOG page header */
5291
0
  page->xlp_magic = XLOG_PAGE_MAGIC;
5292
0
  page->xlp_info = XLP_LONG_HEADER;
5293
0
  page->xlp_tli = BootstrapTimeLineID;
5294
0
  page->xlp_pageaddr = wal_segment_size;
5295
0
  longpage = (XLogLongPageHeader) page;
5296
0
  longpage->xlp_sysid = sysidentifier;
5297
0
  longpage->xlp_seg_size = wal_segment_size;
5298
0
  longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5299
5300
  /* Insert the initial checkpoint record */
5301
0
  recptr = ((char *) page + SizeOfXLogLongPHD);
5302
0
  record = (XLogRecord *) recptr;
5303
0
  record->xl_prev = 0;
5304
0
  record->xl_xid = InvalidTransactionId;
5305
0
  record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
5306
0
  record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5307
0
  record->xl_rmid = RM_XLOG_ID;
5308
0
  recptr += SizeOfXLogRecord;
5309
  /* fill the XLogRecordDataHeaderShort struct */
5310
0
  *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
5311
0
  *(recptr++) = sizeof(checkPoint);
5312
0
  memcpy(recptr, &checkPoint, sizeof(checkPoint));
5313
0
  recptr += sizeof(checkPoint);
5314
0
  Assert(recptr - (char *) record == record->xl_tot_len);
5315
5316
0
  INIT_CRC32C(crc);
5317
0
  COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
5318
0
  COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5319
0
  FIN_CRC32C(crc);
5320
0
  record->xl_crc = crc;
5321
5322
  /* Create first XLOG segment file */
5323
0
  openLogTLI = BootstrapTimeLineID;
5324
0
  openLogFile = XLogFileInit(1, BootstrapTimeLineID);
5325
5326
  /*
5327
   * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
5328
   * close the file again in a moment.
5329
   */
5330
5331
  /* Write the first page with the initial record */
5332
0
  errno = 0;
5333
0
  pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
5334
0
  if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5335
0
  {
5336
    /* if write didn't set errno, assume problem is no disk space */
5337
0
    if (errno == 0)
5338
0
      errno = ENOSPC;
5339
0
    ereport(PANIC,
5340
0
        (errcode_for_file_access(),
5341
0
         errmsg("could not write bootstrap write-ahead log file: %m")));
5342
0
  }
5343
0
  pgstat_report_wait_end();
5344
5345
0
  pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
5346
0
  if (pg_fsync(openLogFile) != 0)
5347
0
    ereport(PANIC,
5348
0
        (errcode_for_file_access(),
5349
0
         errmsg("could not fsync bootstrap write-ahead log file: %m")));
5350
0
  pgstat_report_wait_end();
5351
5352
0
  if (close(openLogFile) != 0)
5353
0
    ereport(PANIC,
5354
0
        (errcode_for_file_access(),
5355
0
         errmsg("could not close bootstrap write-ahead log file: %m")));
5356
5357
0
  openLogFile = -1;
5358
5359
  /* Now create pg_control */
5360
0
  InitControlFile(sysidentifier, data_checksum_version);
5361
0
  ControlFile->time = checkPoint.time;
5362
0
  ControlFile->checkPoint = checkPoint.redo;
5363
0
  ControlFile->checkPointCopy = checkPoint;
5364
5365
  /* some additional ControlFile fields are set in WriteControlFile() */
5366
0
  WriteControlFile();
5367
5368
  /* Bootstrap the commit log, too */
5369
0
  BootStrapCLOG();
5370
0
  BootStrapCommitTs();
5371
0
  BootStrapSUBTRANS();
5372
0
  BootStrapMultiXact();
5373
5374
0
  pfree(buffer);
5375
5376
  /*
5377
   * Force control file to be read - in contrast to normal processing we'd
5378
   * otherwise never run the checks and GUC-related initializations therein.
5379
   */
5380
0
  ReadControlFile();
5381
0
}
5382
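Given the encoding chosen above (seconds in the top 32 bits, microseconds shifted left by 12, the low 12 bits of the PID at the bottom), the initialization time can be recovered from a system identifier such as the one pg_controldata prints. A sketch with a made-up example value:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

int
main(void)
{
  uint64_t sysid = 7451234567890123456ULL;  /* made-up example value */
  time_t secs = (time_t) (sysid >> 32);
  long usecs = (long) ((sysid >> 12) & 0xFFFFF);  /* 20-bit tv_usec */
  int pid_low = (int) (sysid & 0xFFF); /* low 12 bits of initdb's PID */

  printf("initialized around %s", ctime(&secs));
  printf("usec=%ld, pid low bits=%d\n", usecs, pid_low);
  return 0;
}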
5383
static char *
5384
str_time(pg_time_t tnow)
5385
0
{
5386
0
  char     *buf = palloc(128);
5387
5388
0
  pg_strftime(buf, 128,
5389
0
        "%Y-%m-%d %H:%M:%S %Z",
5390
0
        pg_localtime(&tnow, log_timezone));
5391
5392
0
  return buf;
5393
0
}
5394
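str_time() is a thin wrapper; the same formatting with plain libc looks like the sketch below. The real code uses pg_strftime()/pg_localtime() so the output honors log_timezone rather than the C runtime's zone:

#include <stdio.h>
#include <time.h>

int
main(void)
{
  char buf[128];
  time_t now = time(NULL);

  strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S %Z", localtime(&now));
  printf("%s\n", buf);
  return 0;
}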
5395
/*
5396
 * Initialize the first WAL segment on new timeline.
5397
 */
5398
static void
5399
XLogInitNewTimeline(TimeLineID endTLI, XLogRecPtr endOfLog, TimeLineID newTLI)
5400
0
{
5401
0
  char    xlogfname[MAXFNAMELEN];
5402
0
  XLogSegNo endLogSegNo;
5403
0
  XLogSegNo startLogSegNo;
5404
5405
  /* we always switch to a new timeline after archive recovery */
5406
0
  Assert(endTLI != newTLI);
5407
5408
  /*
5409
   * Update min recovery point one last time.
5410
   */
5411
0
  UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5412
5413
  /*
5414
   * Calculate the last segment on the old timeline, and the first segment
5415
   * on the new timeline. If the switch happens in the middle of a segment,
5416
   * they are the same, but if the switch happens exactly at a segment
5417
   * boundary, startLogSegNo will be endLogSegNo + 1.
5418
   */
5419
0
  XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
5420
0
  XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
5421
5422
  /*
5423
   * Initialize the starting WAL segment for the new timeline. If the switch
5424
   * happens in the middle of a segment, copy data from the last WAL segment
5425
   * of the old timeline up to the switch point, to the starting WAL segment
5426
   * on the new timeline.
5427
   */
5428
0
  if (endLogSegNo == startLogSegNo)
5429
0
  {
5430
    /*
5431
     * Make a copy of the file on the new timeline.
5432
     *
5433
     * Writing WAL isn't allowed yet, so there are no locking
5434
     * considerations. But we should be just as careful as XLogFileInit to
5435
     * avoid emplacing a bogus file.
5436
     */
5437
0
    XLogFileCopy(newTLI, endLogSegNo, endTLI, endLogSegNo,
5438
0
           XLogSegmentOffset(endOfLog, wal_segment_size));
5439
0
  }
5440
0
  else
5441
0
  {
5442
    /*
5443
     * The switch happened at a segment boundary, so just create the next
5444
     * segment on the new timeline.
5445
     */
5446
0
    int     fd;
5447
5448
0
    fd = XLogFileInit(startLogSegNo, newTLI);
5449
5450
0
    if (close(fd) != 0)
5451
0
    {
5452
0
      int     save_errno = errno;
5453
5454
0
      XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
5455
0
      errno = save_errno;
5456
0
      ereport(ERROR,
5457
0
          (errcode_for_file_access(),
5458
0
           errmsg("could not close file \"%s\": %m", xlogfname)));
5459
0
    }
5460
0
  }
5461
5462
  /*
5463
   * Let's just make real sure there are not .ready or .done flags posted
5464
   * for the new segment.
5465
   */
5466
0
  XLogFileName(xlogfname, newTLI, startLogSegNo, wal_segment_size);
5467
0
  XLogArchiveCleanup(xlogfname);
5468
0
}
5469
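The segment-boundary case in the comment above falls straight out of the macro arithmetic: XLByteToSeg() is a plain division by the segment size, while XLByteToPrevSeg() divides (position - 1). A sketch with 16 MB segments showing both cases:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
  uint64_t seg = 16 * 1024 * 1024;  /* wal_segment_size */
  uint64_t mid = 3 * seg + 1234;    /* switch in the middle of a segment */
  uint64_t edge = 4 * seg;          /* switch exactly at a boundary */

  /* XLByteToPrevSeg divides (pos - 1); XLByteToSeg divides pos. */
  printf("mid:  prev=%llu start=%llu\n",
       (unsigned long long) ((mid - 1) / seg),
       (unsigned long long) (mid / seg));   /* 3 3: same segment */
  printf("edge: prev=%llu start=%llu\n",
       (unsigned long long) ((edge - 1) / seg),
       (unsigned long long) (edge / seg));  /* 3 4: start = prev + 1 */
  return 0;
}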
5470
/*
5471
 * Perform cleanup actions at the conclusion of archive recovery.
5472
 */
5473
static void
5474
CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI, XLogRecPtr EndOfLog,
5475
              TimeLineID newTLI)
5476
0
{
5477
  /*
5478
   * Execute the recovery_end_command, if any.
5479
   */
5480
0
  if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
5481
0
    ExecuteRecoveryCommand(recoveryEndCommand,
5482
0
                 "recovery_end_command",
5483
0
                 true,
5484
0
                 WAIT_EVENT_RECOVERY_END_COMMAND);
5485
5486
  /*
5487
   * We switched to a new timeline. Clean up segments on the old timeline.
5488
   *
5489
   * If there are any higher-numbered segments on the old timeline, remove
5490
   * them. They might contain valid WAL, but they might also be
5491
   * pre-allocated files containing garbage. In any case, they are not part
5492
   * of the new timeline's history so we don't need them.
5493
   */
5494
0
  RemoveNonParentXlogFiles(EndOfLog, newTLI);
5495
5496
  /*
5497
   * If the switch happened in the middle of a segment, what to do with the
5498
   * last, partial segment on the old timeline? If we don't archive it, and
5499
   * the server that created the WAL never archives it either (e.g. because
5500
   * it was hit by a meteor), it will never make it to the archive. That's
5501
   * OK from our point of view, because the new segment that we created with
5502
   * the new TLI contains all the WAL from the old timeline up to the switch
5503
   * point. But if you later try to do PITR to the "missing" WAL on the old
5504
   * timeline, recovery won't find it in the archive. It's physically
5505
   * present in the new file with new TLI, but recovery won't look there
5506
   * when it's recovering to the older timeline. On the other hand, if we
5507
   * archive the partial segment, and the original server on that timeline
5508
   * is still running and archives the completed version of the same segment
5509
   * later, it will fail. (We used to do that in 9.4 and below, and it
5510
   * caused such problems).
5511
   *
5512
   * As a compromise, we rename the last segment with the .partial suffix,
5513
   * and archive it. Archive recovery will never try to read .partial
5514
   * segments, so they will normally go unused. But in the odd PITR case,
5515
   * the administrator can copy them manually to the pg_wal directory
5516
   * (removing the suffix). They can be useful in debugging, too.
5517
   *
5518
   * If a .done or .ready file already exists for the old timeline, however,
5519
   * we had already determined that the segment is complete, so we can let
5520
   * it be archived normally. (In particular, if it was restored from the
5521
   * archive to begin with, it's expected to have a .done file).
5522
   */
5523
0
  if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
5524
0
    XLogArchivingActive())
5525
0
  {
5526
0
    char    origfname[MAXFNAMELEN];
5527
0
    XLogSegNo endLogSegNo;
5528
5529
0
    XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
5530
0
    XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
5531
5532
0
    if (!XLogArchiveIsReadyOrDone(origfname))
5533
0
    {
5534
0
      char    origpath[MAXPGPATH];
5535
0
      char    partialfname[MAXFNAMELEN];
5536
0
      char    partialpath[MAXPGPATH];
5537
5538
      /*
5539
       * If we're summarizing WAL, we can't rename the partial file
5540
       * until the summarizer finishes with it, else it will fail.
5541
       */
5542
0
      if (summarize_wal)
5543
0
        WaitForWalSummarization(EndOfLog);
5544
5545
0
      XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
5546
0
      snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
5547
0
      snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
5548
5549
      /*
5550
       * Make sure there's no .done or .ready file for the .partial
5551
       * file.
5552
       */
5553
0
      XLogArchiveCleanup(partialfname);
5554
5555
0
      durable_rename(origpath, partialpath, ERROR);
5556
0
      XLogArchiveNotify(partialfname);
5557
0
    }
5558
0
  }
5559
0
}
5560
5561
/*
5562
 * Check to see if required parameters are set high enough on this server
5563
 * for various aspects of recovery operation.
5564
 *
5565
 * Note that all the parameters which this function tests need to be
5566
 * listed in Administrator's Overview section in high-availability.sgml.
5567
 * If you change them, don't forget to update the list.
5568
 */
5569
static void
5570
CheckRequiredParameterValues(void)
5571
0
{
5572
  /*
5573
   * For archive recovery, the WAL must be generated with at least 'replica'
5574
   * wal_level.
5575
   */
5576
0
  if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
5577
0
  {
5578
0
    ereport(FATAL,
5579
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
5580
0
         errmsg("WAL was generated with \"wal_level=minimal\", cannot continue recovering"),
5581
0
         errdetail("This happens if you temporarily set \"wal_level=minimal\" on the server."),
5582
0
         errhint("Use a backup taken after setting \"wal_level\" to higher than \"minimal\".")));
5583
0
  }
5584
5585
  /*
5586
   * For Hot Standby, the WAL must be generated with 'replica' mode, and we
5587
   * must have at least as many backend slots as the primary.
5588
   */
5589
0
  if (ArchiveRecoveryRequested && EnableHotStandby)
5590
0
  {
5591
    /* We ignore autovacuum_worker_slots when we make this test. */
5592
0
    RecoveryRequiresIntParameter("max_connections",
5593
0
                   MaxConnections,
5594
0
                   ControlFile->MaxConnections);
5595
0
    RecoveryRequiresIntParameter("max_worker_processes",
5596
0
                   max_worker_processes,
5597
0
                   ControlFile->max_worker_processes);
5598
0
    RecoveryRequiresIntParameter("max_wal_senders",
5599
0
                   max_wal_senders,
5600
0
                   ControlFile->max_wal_senders);
5601
0
    RecoveryRequiresIntParameter("max_prepared_transactions",
5602
0
                   max_prepared_xacts,
5603
0
                   ControlFile->max_prepared_xacts);
5604
0
    RecoveryRequiresIntParameter("max_locks_per_transaction",
5605
0
                   max_locks_per_xact,
5606
0
                   ControlFile->max_locks_per_xact);
5607
0
  }
5608
0
}
5609
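RecoveryRequiresIntParameter() (defined elsewhere) enforces the contract described above: hot standby refuses to start when a local setting is lower than the value the primary ran with. A sketch of that contract with an approximated message; the real function reports FATAL with a hint to raise the setting or disable hot standby:

#include <stdio.h>
#include <stdlib.h>

static void
recovery_requires_int_parameter(const char *name, int curr, int min_required)
{
  if (curr < min_required)
  {
    fprintf(stderr, "%s = %d is lower than on the primary server (%d)\n",
        name, curr, min_required);
    exit(1);
  }
}

int
main(void)
{
  recovery_requires_int_parameter("max_connections", 100, 200);  /* fails */
  return 0;
}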
5610
/*
5611
 * This must be called ONCE during postmaster or standalone-backend startup
5612
 */
5613
void
5614
StartupXLOG(void)
5615
0
{
5616
0
  XLogCtlInsert *Insert;
5617
0
  CheckPoint  checkPoint;
5618
0
  bool    wasShutdown;
5619
0
  bool    didCrash;
5620
0
  bool    haveTblspcMap;
5621
0
  bool    haveBackupLabel;
5622
0
  XLogRecPtr  EndOfLog;
5623
0
  TimeLineID  EndOfLogTLI;
5624
0
  TimeLineID  newTLI;
5625
0
  bool    performedWalRecovery;
5626
0
  EndOfWalRecoveryInfo *endOfRecoveryInfo;
5627
0
  XLogRecPtr  abortedRecPtr;
5628
0
  XLogRecPtr  missingContrecPtr;
5629
0
  TransactionId oldestActiveXID;
5630
0
  bool    promoted = false;
5631
5632
  /*
5633
   * We should have an aux process resource owner to use, and we should not
5634
   * be in a transaction that's installed some other resowner.
5635
   */
5636
0
  Assert(AuxProcessResourceOwner != NULL);
5637
0
  Assert(CurrentResourceOwner == NULL ||
5638
0
       CurrentResourceOwner == AuxProcessResourceOwner);
5639
0
  CurrentResourceOwner = AuxProcessResourceOwner;
5640
5641
  /*
5642
   * Check that contents look valid.
5643
   */
5644
0
  if (!XRecOffIsValid(ControlFile->checkPoint))
5645
0
    ereport(FATAL,
5646
0
        (errcode(ERRCODE_DATA_CORRUPTED),
5647
0
         errmsg("control file contains invalid checkpoint location")));
5648
5649
0
  switch (ControlFile->state)
5650
0
  {
5651
0
    case DB_SHUTDOWNED:
5652
5653
      /*
5654
       * This is the expected case, so don't be chatty in standalone
5655
       * mode
5656
       */
5657
0
      ereport(IsPostmasterEnvironment ? LOG : NOTICE,
5658
0
          (errmsg("database system was shut down at %s",
5659
0
              str_time(ControlFile->time))));
5660
0
      break;
5661
5662
0
    case DB_SHUTDOWNED_IN_RECOVERY:
5663
0
      ereport(LOG,
5664
0
          (errmsg("database system was shut down in recovery at %s",
5665
0
              str_time(ControlFile->time))));
5666
0
      break;
5667
5668
0
    case DB_SHUTDOWNING:
5669
0
      ereport(LOG,
5670
0
          (errmsg("database system shutdown was interrupted; last known up at %s",
5671
0
              str_time(ControlFile->time))));
5672
0
      break;
5673
5674
0
    case DB_IN_CRASH_RECOVERY:
5675
0
      ereport(LOG,
5676
0
          (errmsg("database system was interrupted while in recovery at %s",
5677
0
              str_time(ControlFile->time)),
5678
0
           errhint("This probably means that some data is corrupted and"
5679
0
               " you will have to use the last backup for recovery.")));
5680
0
      break;
5681
5682
0
    case DB_IN_ARCHIVE_RECOVERY:
5683
0
      ereport(LOG,
5684
0
          (errmsg("database system was interrupted while in recovery at log time %s",
5685
0
              str_time(ControlFile->checkPointCopy.time)),
5686
0
           errhint("If this has occurred more than once some data might be corrupted"
5687
0
               " and you might need to choose an earlier recovery target.")));
5688
0
      break;
5689
5690
0
    case DB_IN_PRODUCTION:
5691
0
      ereport(LOG,
5692
0
          (errmsg("database system was interrupted; last known up at %s",
5693
0
              str_time(ControlFile->time))));
5694
0
      break;
5695
5696
0
    default:
5697
0
      ereport(FATAL,
5698
0
          (errcode(ERRCODE_DATA_CORRUPTED),
5699
0
           errmsg("control file contains invalid database cluster state")));
5700
0
  }
5701
5702
  /* This is just to allow attaching to startup process with a debugger */
5703
#ifdef XLOG_REPLAY_DELAY
5704
  if (ControlFile->state != DB_SHUTDOWNED)
5705
    pg_usleep(60000000L);
5706
#endif
5707
5708
  /*
5709
   * Verify that pg_wal, pg_wal/archive_status, and pg_wal/summaries exist.
5710
   * In cases where someone has performed a copy for PITR, these directories
5711
   * may have been excluded and need to be re-created.
5712
   */
5713
0
  ValidateXLOGDirectoryStructure();
5714
5715
  /* Set up timeout handler needed to report startup progress. */
5716
0
  if (!IsBootstrapProcessingMode())
5717
0
    RegisterTimeout(STARTUP_PROGRESS_TIMEOUT,
5718
0
            startup_progress_timeout_handler);
5719
5720
  /*----------
5721
   * If we previously crashed, perform a couple of actions:
5722
   *
5723
   * - The pg_wal directory may still include some temporary WAL segments
5724
   *   used when creating a new segment, so perform some clean up to not
5725
   *   bloat this path.  This is done first as there is no point to sync
5726
   *   this temporary data.
5727
   *
5728
   * - There might be data which we had written, intending to fsync it, but
5729
   *   which we had not actually fsync'd yet.  Therefore, a power failure in
5730
   *   the near future might cause earlier unflushed writes to be lost, even
5731
   *   though more recent data written to disk from here on would be
5732
   *   persisted.  To avoid that, fsync the entire data directory.
5733
   */
5734
0
  if (ControlFile->state != DB_SHUTDOWNED &&
5735
0
    ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
5736
0
  {
5737
0
    RemoveTempXlogFiles();
5738
0
    SyncDataDirectory();
5739
0
    didCrash = true;
5740
0
  }
5741
0
  else
5742
0
    didCrash = false;
5743
5744
  /*
5745
   * Prepare for WAL recovery if needed.
5746
   *
5747
   * InitWalRecovery analyzes the control file and the backup label file, if
5748
   * any.  It updates the in-memory ControlFile buffer according to the
5749
   * starting checkpoint, and sets InRecovery and ArchiveRecoveryRequested.
5750
   * It also applies the tablespace map file, if any.
5751
   */
5752
0
  InitWalRecovery(ControlFile, &wasShutdown,
5753
0
          &haveBackupLabel, &haveTblspcMap);
5754
0
  checkPoint = ControlFile->checkPointCopy;
5755
5756
  /* initialize shared memory variables from the checkpoint record */
5757
0
  TransamVariables->nextXid = checkPoint.nextXid;
5758
0
  TransamVariables->nextOid = checkPoint.nextOid;
5759
0
  TransamVariables->oidCount = 0;
5760
0
  MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5761
0
  AdvanceOldestClogXid(checkPoint.oldestXid);
5762
0
  SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5763
0
  SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
5764
0
  SetCommitTsLimit(checkPoint.oldestCommitTsXid,
5765
0
           checkPoint.newestCommitTsXid);
5766
0
  XLogCtl->ckptFullXid = checkPoint.nextXid;
5767
5768
  /*
5769
   * Clear out any old relcache cache files.  This is *necessary* if we do
5770
   * any WAL replay, since that would probably result in the cache files
5771
   * being out of sync with database reality.  In theory we could leave them
5772
   * in place if the database had been cleanly shut down, but it seems
5773
   * safest to just remove them always and let them be rebuilt during the
5774
   * first backend startup.  These files need to be removed from all
5775
   * directories including pg_tblspc; however, the symlinks are created only
5776
   * after reading the tablespace_map file in case of archive recovery from
5777
   * backup, so the old relcache files must be cleared here, after creating
5778
   * the symlinks.
5779
   */
5780
0
  RelationCacheInitFileRemove();
5781
5782
  /*
5783
   * Initialize replication slots, before there's a chance to remove
5784
   * required resources.
5785
   */
5786
0
  StartupReplicationSlots();
5787
5788
  /*
5789
   * Startup logical state; it needs to be set up now so we have proper data
5790
   * during crash recovery.
5791
   */
5792
0
  StartupReorderBuffer();
5793
5794
  /*
5795
   * Start up CLOG. This must be done after TransamVariables->nextXid has
5796
   * been initialized and before we accept connections or begin WAL replay.
5797
   */
5798
0
  StartupCLOG();
5799
5800
  /*
5801
   * Start up MultiXact. We need to do this early to be able to replay
5802
   * truncations.
5803
   */
5804
0
  StartupMultiXact();
5805
5806
  /*
5807
   * Ditto for commit timestamps.  Activate the facility if the setting is
5808
   * enabled in the control file, as no commit timestamp tracking should have
5809
   * been done while the setting was disabled.  This facility can be
5810
   * started or stopped when replaying an XLOG_PARAMETER_CHANGE record.
5811
   */
5812
0
  if (ControlFile->track_commit_timestamp)
5813
0
    StartupCommitTs();
5814
5815
  /*
5816
   * Recover knowledge about replay progress of known replication partners.
5817
   */
5818
0
  StartupReplicationOrigin();
5819
5820
  /*
5821
   * Initialize unlogged LSN. On a clean shutdown, it's restored from the
5822
   * control file. On recovery, all unlogged relations are blown away, so
5823
   * the unlogged LSN counter can be reset too.
5824
   */
5825
0
  if (ControlFile->state == DB_SHUTDOWNED)
5826
0
    pg_atomic_write_membarrier_u64(&XLogCtl->unloggedLSN,
5827
0
                     ControlFile->unloggedLSN);
5828
0
  else
5829
0
    pg_atomic_write_membarrier_u64(&XLogCtl->unloggedLSN,
5830
0
                     FirstNormalUnloggedLSN);
5831
5832
  /*
5833
   * Copy any missing timeline history files between 'now' and the recovery
5834
   * target timeline from archive to pg_wal. While we don't need those files
5835
   * ourselves - the history file of the recovery target timeline covers all
5836
   * the previous timelines in the history too - a cascading standby server
5837
   * might be interested in them. Or, if you archive the WAL from this
5838
   * server to a different archive than the primary, it'd be good for all
5839
   * the history files to get archived there after failover, so that you can
5840
   * use one of the old timelines as a PITR target. Timeline history files
5841
   * are small, so it's better to copy them unnecessarily than not copy them
5842
   * and regret later.
5843
   */
5844
0
  restoreTimeLineHistoryFiles(checkPoint.ThisTimeLineID, recoveryTargetTLI);
5845
5846
  /*
5847
   * Before running in recovery, scan pg_twophase and fill in its status to
5848
   * be able to work on entries generated by redo.  Doing a scan before
5849
   * taking any recovery action has the merit of discarding any 2PC files that
5850
   * are newer than the first record to replay, avoiding any conflicts at
5851
   * replay.  It also avoids any subsequent scans when doing recovery
5852
   * of the on-disk two-phase data.
5853
   */
5854
0
  restoreTwoPhaseData();
5855
5856
  /*
5857
   * When starting with crash recovery, reset pgstat data - it might not be
5858
   * valid. Otherwise restore pgstat data. It's safe to do this here,
5859
   * because postmaster will not yet have started any other processes.
5860
   *
5861
   * NB: Restoring replication slot stats relies on slot state to have
5862
   * already been restored from disk.
5863
   *
5864
   * TODO: With a bit of extra work we could just start with a pgstat file
5865
   * associated with the checkpoint redo location we're starting from.
5866
   */
5867
0
  if (didCrash)
5868
0
    pgstat_discard_stats();
5869
0
  else
5870
0
    pgstat_restore_stats();
5871
5872
0
  lastFullPageWrites = checkPoint.fullPageWrites;
5873
5874
0
  RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
5875
0
  doPageWrites = lastFullPageWrites;
5876
5877
  /* REDO */
5878
0
  if (InRecovery)
5879
0
  {
5880
    /* Initialize state for RecoveryInProgress() */
5881
0
    SpinLockAcquire(&XLogCtl->info_lck);
5882
0
    if (InArchiveRecovery)
5883
0
      XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
5884
0
    else
5885
0
      XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
5886
0
    SpinLockRelease(&XLogCtl->info_lck);
5887
5888
    /*
5889
     * Update pg_control to show that we are recovering and to show the
5890
     * selected checkpoint as the place we are starting from. We also mark
5891
     * pg_control with any minimum recovery stop point obtained from a
5892
     * backup history file.
5893
     *
5894
     * No need to hold ControlFileLock yet, we aren't up far enough.
5895
     */
5896
0
    UpdateControlFile();
5897
5898
    /*
5899
     * If there was a backup label file, it's done its job and the info
5900
     * has now been propagated into pg_control.  We must get rid of the
5901
     * label file so that if we crash during recovery, we'll pick up at
5902
     * the latest recovery restartpoint instead of going all the way back
5903
     * to the backup start point.  It seems prudent though to just rename
5904
     * the file out of the way rather than delete it completely.
5905
     */
5906
0
    if (haveBackupLabel)
5907
0
    {
5908
0
      unlink(BACKUP_LABEL_OLD);
5909
0
      durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
5910
0
    }
5911
5912
    /*
5913
     * If there was a tablespace_map file, it's done its job and the
5914
     * symlinks have been created.  We must get rid of the map file so
5915
     * that if we crash during recovery, we don't create symlinks again.
5916
     * It seems prudent though to just rename the file out of the way
5917
     * rather than delete it completely.
5918
     */
5919
0
    if (haveTblspcMap)
5920
0
    {
5921
0
      unlink(TABLESPACE_MAP_OLD);
5922
0
      durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
5923
0
    }
5924
5925
    /*
5926
     * Initialize our local copy of minRecoveryPoint.  When doing crash
5927
     * recovery we want to replay up to the end of WAL.  Particularly, in
5928
     * the case of a promoted standby minRecoveryPoint value in the
5929
     * control file is only updated after the first checkpoint.  However,
5930
     * if the instance crashes before the first post-recovery checkpoint
5931
     * is completed then recovery will use a stale location causing the
5932
     * startup process to think that there are still invalid page
5933
     * references when checking for data consistency.
5934
     */
5935
0
    if (InArchiveRecovery)
5936
0
    {
5937
0
      LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
5938
0
      LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
5939
0
    }
5940
0
    else
5941
0
    {
5942
0
      LocalMinRecoveryPoint = InvalidXLogRecPtr;
5943
0
      LocalMinRecoveryPointTLI = 0;
5944
0
    }
5945
5946
    /* Check that the GUCs used to generate the WAL allow recovery */
5947
0
    CheckRequiredParameterValues();
5948
5949
    /*
5950
     * We're in recovery, so unlogged relations may be trashed and must be
5951
     * reset.  This should be done BEFORE allowing Hot Standby
5952
     * connections, so that read-only backends don't try to read whatever
5953
     * garbage is left over from before.
5954
     */
5955
0
    ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
5956
5957
    /*
5958
     * Likewise, delete any saved transaction snapshot files that got left
5959
     * behind by crashed backends.
5960
     */
5961
0
    DeleteAllExportedSnapshotFiles();
5962
5963
    /*
5964
     * Initialize for Hot Standby, if enabled. We won't let backends in
5965
     * yet, not until we've reached the min recovery point specified in
5966
     * control file and we've established a recovery snapshot from a
5967
     * running-xacts WAL record.
5968
     */
5969
0
    if (ArchiveRecoveryRequested && EnableHotStandby)
5970
0
    {
5971
0
      TransactionId *xids;
5972
0
      int     nxids;
5973
5974
0
      ereport(DEBUG1,
5975
0
          (errmsg_internal("initializing for hot standby")));
5976
5977
0
      InitRecoveryTransactionEnvironment();
5978
5979
0
      if (wasShutdown)
5980
0
        oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
5981
0
      else
5982
0
        oldestActiveXID = checkPoint.oldestActiveXid;
5983
0
      Assert(TransactionIdIsValid(oldestActiveXID));
5984
5985
      /* Tell procarray about the range of xids it has to deal with */
5986
0
      ProcArrayInitRecovery(XidFromFullTransactionId(TransamVariables->nextXid));
5987
5988
      /*
5989
       * Start up subtrans only.  CLOG, MultiXact and commit timestamp
5990
       * have already been started up and other SLRUs are not maintained
5991
       * during recovery and need not be started yet.
5992
       */
5993
0
      StartupSUBTRANS(oldestActiveXID);
5994
5995
      /*
5996
       * If we're beginning at a shutdown checkpoint, we know that
5997
       * nothing was running on the primary at this point. So fake up an
5998
       * empty running-xacts record and use that here and now. Recover
5999
       * additional standby state for prepared transactions.
6000
       */
6001
0
      if (wasShutdown)
6002
0
      {
6003
0
        RunningTransactionsData running;
6004
0
        TransactionId latestCompletedXid;
6005
6006
        /* Update pg_subtrans entries for any prepared transactions */
6007
0
        StandbyRecoverPreparedTransactions();
6008
6009
        /*
6010
         * Construct a RunningTransactions snapshot representing a
6011
         * shut down server, with only prepared transactions still
6012
         * alive. We're never overflowed at this point because all
6013
         * subxids are listed with their parent prepared transactions.
6014
         */
6015
0
        running.xcnt = nxids;
6016
0
        running.subxcnt = 0;
6017
0
        running.subxid_status = SUBXIDS_IN_SUBTRANS;
6018
0
        running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
6019
0
        running.oldestRunningXid = oldestActiveXID;
6020
0
        latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
6021
0
        TransactionIdRetreat(latestCompletedXid);
6022
0
        Assert(TransactionIdIsNormal(latestCompletedXid));
6023
0
        running.latestCompletedXid = latestCompletedXid;
6024
0
        running.xids = xids;
6025
6026
0
        ProcArrayApplyRecoveryInfo(&running);
6027
0
      }
6028
0
    }
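
/*
 * The latestCompletedXid computation above retreats one XID from nextXid
 * while skipping the permanent special XIDs below 3.  A self-contained
 * sketch mirroring the shape of the TransactionIdRetreat() macro (the type
 * and test harness here are illustrative):
 */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t SketchXid;

#define SKETCH_FIRST_NORMAL_XID ((SketchXid) 3)

static void
sketch_xid_retreat(SketchXid *xid)
{
  /* unsigned wraparound carries 0 up to 0xFFFFFFFF, past the special XIDs */
  do
  {
    (*xid)--;
  } while (*xid < SKETCH_FIRST_NORMAL_XID);
}

int
main(void)
{
  SketchXid xid = 3;

  sketch_xid_retreat(&xid);   /* 2, 1, 0 are special, so wraps to 0xFFFFFFFF */
  printf("%u\n", xid);
  return 0;
}
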
6029
6030
    /*
6031
     * We're all set for replaying the WAL now. Do it.
6032
     */
6033
0
    PerformWalRecovery();
6034
0
    performedWalRecovery = true;
6035
0
  }
6036
0
  else
6037
0
    performedWalRecovery = false;
6038
6039
  /*
6040
   * Finish WAL recovery.
6041
   */
6042
0
  endOfRecoveryInfo = FinishWalRecovery();
6043
0
  EndOfLog = endOfRecoveryInfo->endOfLog;
6044
0
  EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI;
6045
0
  abortedRecPtr = endOfRecoveryInfo->abortedRecPtr;
6046
0
  missingContrecPtr = endOfRecoveryInfo->missingContrecPtr;
6047
6048
  /*
6049
   * Reset the ps status display, so that no information related to recovery
6050
   * shows up.
6051
   */
6052
0
  set_ps_display("");
6053
6054
  /*
6055
   * When recovering from a backup (we are in recovery, and archive recovery
6056
   * was requested), complain if we did not roll forward far enough to reach
6057
   * the point where the database is consistent.  For regular online
6058
   * backup-from-primary, that means reaching the end-of-backup WAL record
6059
   * (at which point we reset backupStartPoint to be Invalid), for
6060
   * backup-from-replica (which can't inject records into the WAL stream),
6061
   * that point is when we reach the minRecoveryPoint in pg_control (which
6062
   * we purposefully copy last when backing up from a replica).  For
6063
   * pg_rewind (which creates a backup_label with a method of "pg_rewind")
6064
   * or snapshot-style backups (which don't), backupEndRequired will be set
6065
   * to false.
6066
   *
6067
   * Note: it is indeed okay to look at the local variable
6068
   * LocalMinRecoveryPoint here, even though ControlFile->minRecoveryPoint
6069
   * might be further ahead --- ControlFile->minRecoveryPoint cannot have
6070
   * been advanced beyond the WAL we processed.
6071
   */
6072
0
  if (InRecovery &&
6073
0
    (EndOfLog < LocalMinRecoveryPoint ||
6074
0
     !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
6075
0
  {
6076
    /*
6077
     * Ran off end of WAL before reaching end-of-backup WAL record, or
6078
     * minRecoveryPoint. That's a bad sign, indicating that you tried to
6079
     * recover from an online backup but never called pg_backup_stop(), or
6080
     * you didn't archive all the WAL needed.
6081
     */
6082
0
    if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
6083
0
    {
6084
0
      if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) || ControlFile->backupEndRequired)
6085
0
        ereport(FATAL,
6086
0
            (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6087
0
             errmsg("WAL ends before end of online backup"),
6088
0
             errhint("All WAL generated while online backup was taken must be available at recovery.")));
6089
0
      else
6090
0
        ereport(FATAL,
6091
0
            (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6092
0
             errmsg("WAL ends before consistent recovery point")));
6093
0
    }
6094
0
  }
6095
6096
  /*
6097
   * Reset unlogged relations to the contents of their INIT fork. This is
6098
   * done AFTER recovery is complete so as to include any unlogged relations
6099
   * created during recovery, but BEFORE recovery is marked as having
6100
   * completed successfully. Otherwise we'd not retry if any of the post
6101
   * end-of-recovery steps fail.
6102
   */
6103
0
  if (InRecovery)
6104
0
    ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
6105
6106
  /*
6107
   * Pre-scan prepared transactions to find out the range of XIDs present.
6108
   * This information is not quite needed yet, but it is positioned here so
6109
   * that potential problems are detected before any on-disk change is made.
6110
   */
6111
0
  oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
6112
6113
  /*
6114
   * Allow ordinary WAL segment creation before possibly switching to a new
6115
   * timeline, which creates a new segment, and after the last ReadRecord().
6116
   */
6117
0
  SetInstallXLogFileSegmentActive();
6118
6119
  /*
6120
   * Consider whether we need to assign a new timeline ID.
6121
   *
6122
   * If we did archive recovery, we always assign a new ID.  This handles a
6123
   * couple of issues.  If we stopped short of the end of WAL during
6124
   * recovery, then we are clearly generating a new timeline and must assign
6125
   * it a unique new ID.  Even if we ran to the end, modifying the current
6126
   * last segment is problematic because it may result in trying to
6127
   * overwrite an already-archived copy of that segment, and we encourage
6128
   * DBAs to make their archive_commands reject that.  We can dodge the
6129
   * problem by making the new active segment have a new timeline ID.
6130
   *
6131
   * In a normal crash recovery, we can just extend the timeline we were in.
6132
   */
6133
0
  newTLI = endOfRecoveryInfo->lastRecTLI;
6134
0
  if (ArchiveRecoveryRequested)
6135
0
  {
6136
0
    newTLI = findNewestTimeLine(recoveryTargetTLI) + 1;
6137
0
    ereport(LOG,
6138
0
        (errmsg("selected new timeline ID: %u", newTLI)));
6139
6140
    /*
6141
     * Make a writable copy of the last WAL segment.  (Note that we also
6142
     * have a copy of the last block of the old WAL in
6143
     * endOfRecovery->lastPage; we will use that below.)
6144
     */
6145
0
    XLogInitNewTimeline(EndOfLogTLI, EndOfLog, newTLI);
6146
6147
    /*
6148
     * Move the signal files out of the way, so that we don't
6149
     * accidentally re-enter archive recovery mode in a subsequent crash.
6150
     */
6151
0
    if (endOfRecoveryInfo->standby_signal_file_found)
6152
0
      durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
6153
6154
0
    if (endOfRecoveryInfo->recovery_signal_file_found)
6155
0
      durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
6156
6157
    /*
6158
     * Write the timeline history file, and have it archived. After this
6159
     * point (or rather, as soon as the file is archived), the timeline
6160
     * will appear as "taken" in the WAL archive and to any standby
6161
     * servers.  If we crash before actually switching to the new
6162
     * timeline, standby servers will nevertheless think that we switched
6163
     * to the new timeline, and will try to connect to the new timeline.
6164
     * To minimize the window for that, try to do as little as possible
6165
     * between here and writing the end-of-recovery record.
6166
     */
6167
0
    writeTimeLineHistory(newTLI, recoveryTargetTLI,
6168
0
               EndOfLog, endOfRecoveryInfo->recoveryStopReason);
6169
6170
0
    ereport(LOG,
6171
0
        (errmsg("archive recovery complete")));
6172
0
  }
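
/*
 * A toy sketch of the timeline selection above: take one past the newest
 * timeline seen anywhere, so the new ID is guaranteed unused.  The array
 * scan stands in for findNewestTimeLine(), which really inspects timeline
 * history files; all names here are illustrative.
 */
#include <stdio.h>

typedef unsigned int SketchTLI;

static SketchTLI
sketch_newest_timeline(const SketchTLI *known, int n)
{
  SketchTLI newest = 1;

  for (int i = 0; i < n; i++)
    if (known[i] > newest)
      newest = known[i];
  return newest;
}

int
main(void)
{
  SketchTLI known[] = {1, 2, 3};  /* timelines with history files */
  SketchTLI newTLI = sketch_newest_timeline(known, 3) + 1;

  printf("selected new timeline ID: %u\n", newTLI);   /* prints 4 */
  return 0;
}
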
6173
6174
  /* Save the selected TimeLineID in shared memory, too */
6175
0
  SpinLockAcquire(&XLogCtl->info_lck);
6176
0
  XLogCtl->InsertTimeLineID = newTLI;
6177
0
  XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
6178
0
  SpinLockRelease(&XLogCtl->info_lck);
6179
6180
  /*
6181
   * Actually, if WAL ended in an incomplete record, skip the parts that
6182
   * made it through and start writing after the portion that persisted.
6183
   * (It's critical to first write an OVERWRITE_CONTRECORD message, which
6184
   * we'll do as soon as we're open for writing new WAL.)
6185
   */
6186
0
  if (!XLogRecPtrIsInvalid(missingContrecPtr))
6187
0
  {
6188
    /*
6189
     * We should only have a missingContrecPtr if we're not switching to a
6190
     * new timeline. When a timeline switch occurs, WAL is copied from the
6191
     * old timeline to the new only up to the end of the last complete
6192
     * record, so there can't be an incomplete WAL record that we need to
6193
     * disregard.
6194
     */
6195
0
    Assert(newTLI == endOfRecoveryInfo->lastRecTLI);
6196
0
    Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
6197
0
    EndOfLog = missingContrecPtr;
6198
0
  }
6199
6200
  /*
6201
   * Prepare to write WAL starting at EndOfLog location, and init xlog
6202
   * buffer cache using the block containing the last record from the
6203
   * previous incarnation.
6204
   */
6205
0
  Insert = &XLogCtl->Insert;
6206
0
  Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec);
6207
0
  Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
6208
6209
  /*
6210
   * Tricky point here: lastPage contains the *last* block that the LastRec
6211
   * record spans, not the one it starts in.  The last block is indeed the
6212
   * one we want to use.
6213
   */
6214
0
  if (EndOfLog % XLOG_BLCKSZ != 0)
6215
0
  {
6216
0
    char     *page;
6217
0
    int     len;
6218
0
    int     firstIdx;
6219
6220
0
    firstIdx = XLogRecPtrToBufIdx(EndOfLog);
6221
0
    len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr;
6222
0
    Assert(len < XLOG_BLCKSZ);
6223
6224
    /* Copy the valid part of the last block, and zero the rest */
6225
0
    page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
6226
0
    memcpy(page, endOfRecoveryInfo->lastPage, len);
6227
0
    memset(page + len, 0, XLOG_BLCKSZ - len);
6228
6229
0
    pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
6230
0
    pg_atomic_write_u64(&XLogCtl->InitializedUpTo, endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
6231
0
    XLogCtl->InitializedFrom = endOfRecoveryInfo->lastPageBeginPtr;
6232
0
  }
6233
0
  else
6234
0
  {
6235
    /*
6236
     * There is no partial block to copy. Just set InitializedUpTo, and
6237
     * let the first attempt to insert a log record to initialize the next
6238
     * buffer.
6239
     */
6240
0
    pg_atomic_write_u64(&XLogCtl->InitializedUpTo, EndOfLog);
6241
0
    XLogCtl->InitializedFrom = EndOfLog;
6242
0
  }
6243
0
  pg_atomic_write_u64(&XLogCtl->InitializeReserved, pg_atomic_read_u64(&XLogCtl->InitializedUpTo));
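
/*
 * A worked example of the last-page arithmetic above, assuming the default
 * XLOG_BLCKSZ of 8192; the LSN value is made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_BLCKSZ 8192

int
main(void)
{
  uint64_t  endOfLog = UINT64_C(0x1000A2C8);  /* example end-of-WAL LSN */
  uint64_t  pageBegin = endOfLog - (endOfLog % SKETCH_BLCKSZ);
  uint64_t  len = endOfLog - pageBegin;

  /* the first 'len' bytes of the buffer page are copied, the rest zeroed */
  printf("page begins at %#llx; copy %llu bytes, zero %llu\n",
       (unsigned long long) pageBegin,
       (unsigned long long) len,
       (unsigned long long) (SKETCH_BLCKSZ - len));
  return 0;
}
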
6244
6245
  /*
6246
   * Update local and shared status.  This is OK to do without any locks
6247
   * because no other process can be reading or writing WAL yet.
6248
   */
6249
0
  LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
6250
0
  pg_atomic_write_u64(&XLogCtl->logInsertResult, EndOfLog);
6251
0
  pg_atomic_write_u64(&XLogCtl->logWriteResult, EndOfLog);
6252
0
  pg_atomic_write_u64(&XLogCtl->logFlushResult, EndOfLog);
6253
0
  XLogCtl->LogwrtRqst.Write = EndOfLog;
6254
0
  XLogCtl->LogwrtRqst.Flush = EndOfLog;
6255
6256
  /*
6257
   * Preallocate additional log files, if wanted.
6258
   */
6259
0
  PreallocXlogFiles(EndOfLog, newTLI);
6260
6261
  /*
6262
   * Okay, we're officially UP.
6263
   */
6264
0
  InRecovery = false;
6265
6266
  /* start the archive_timeout timer and LSN running */
6267
0
  XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
6268
0
  XLogCtl->lastSegSwitchLSN = EndOfLog;
6269
6270
  /* also initialize latestCompletedXid, to nextXid - 1 */
6271
0
  LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
6272
0
  TransamVariables->latestCompletedXid = TransamVariables->nextXid;
6273
0
  FullTransactionIdRetreat(&TransamVariables->latestCompletedXid);
6274
0
  LWLockRelease(ProcArrayLock);
6275
6276
  /*
6277
   * Start up subtrans, if not already done for hot standby.  (commit
6278
   * timestamps are started below, if necessary.)
6279
   */
6280
0
  if (standbyState == STANDBY_DISABLED)
6281
0
    StartupSUBTRANS(oldestActiveXID);
6282
6283
  /*
6284
   * Perform end of recovery actions for any SLRUs that need it.
6285
   */
6286
0
  TrimCLOG();
6287
0
  TrimMultiXact();
6288
6289
  /*
6290
   * Reload shared-memory state for prepared transactions.  This needs to
6291
   * happen before renaming the last partial segment of the old timeline, as
6292
   * we may have to recover some transactions from it.
6293
   */
6294
0
  RecoverPreparedTransactions();
6295
6296
  /* Shut down xlogreader */
6297
0
  ShutdownWalRecovery();
6298
6299
  /* Enable WAL writes for this backend only. */
6300
0
  LocalSetXLogInsertAllowed();
6301
6302
  /* If necessary, write overwrite-contrecord before doing anything else */
6303
0
  if (!XLogRecPtrIsInvalid(abortedRecPtr))
6304
0
  {
6305
0
    Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
6306
0
    CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI);
6307
0
  }
6308
6309
  /*
6310
   * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
6311
   * record before resource manager writes cleanup WAL records or checkpoint
6312
   * record is written.
6313
   */
6314
0
  Insert->fullPageWrites = lastFullPageWrites;
6315
0
  UpdateFullPageWrites();
6316
6317
  /*
6318
   * Emit checkpoint or end-of-recovery record in XLOG, if required.
6319
   */
6320
0
  if (performedWalRecovery)
6321
0
    promoted = PerformRecoveryXLogAction();
6322
6323
  /*
6324
   * If any of the critical GUCs have changed, log them before we allow
6325
   * backends to write WAL.
6326
   */
6327
0
  XLogReportParameters();
6328
6329
  /* If this is archive recovery, perform post-recovery cleanup actions. */
6330
0
  if (ArchiveRecoveryRequested)
6331
0
    CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog, newTLI);
6332
6333
  /*
6334
   * Local WAL inserts enabled, so it's time to finish initialization of
6335
   * commit timestamp.
6336
   */
6337
0
  CompleteCommitTsInitialization();
6338
6339
  /*
6340
   * All done with end-of-recovery actions.
6341
   *
6342
   * Now allow backends to write WAL and update the control file status
6343
   * accordingly.  SharedRecoveryState, which controls whether backends can
6344
   * write WAL, is updated while holding ControlFileLock to prevent other
6345
   * backends from seeing an inconsistent state of the control file in shared
6346
   * memory.  There is still a small window during which backends can write
6347
   * WAL while the on-disk control file still refers to a system not yet in
6348
   * the DB_IN_PRODUCTION state.
6349
   *
6350
   * Also, we use info_lck to update SharedRecoveryState to ensure that
6351
   * there are no race conditions concerning visibility of other recent
6352
   * updates to shared memory.
6353
   */
6354
0
  LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6355
0
  ControlFile->state = DB_IN_PRODUCTION;
6356
6357
0
  SpinLockAcquire(&XLogCtl->info_lck);
6358
0
  XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
6359
0
  SpinLockRelease(&XLogCtl->info_lck);
6360
6361
0
  UpdateControlFile();
6362
0
  LWLockRelease(ControlFileLock);
6363
6364
  /*
6365
   * Shut down the recovery environment.  This must occur after
6366
   * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
6367
   * and after switching SharedRecoveryState to RECOVERY_STATE_DONE, so that
6368
   * any session building a snapshot will not rely on KnownAssignedXids, as
6369
   * RecoveryInProgress() would return false at this stage.  This is
6370
   * particularly critical for prepared 2PC transactions, which would still
6371
   * need to be included in snapshots once recovery has ended.
6372
   */
6373
0
  if (standbyState != STANDBY_DISABLED)
6374
0
    ShutdownRecoveryTransactionEnvironment();
6375
6376
  /*
6377
   * If there were cascading standby servers connected to us, nudge any wal
6378
   * sender processes to notice that we've been promoted.
6379
   */
6380
0
  WalSndWakeup(true, true);
6381
6382
  /*
6383
   * If this was a promotion, request an (online) checkpoint now. This isn't
6384
   * required for consistency, but the last restartpoint might be far back,
6385
   * and in case of a crash, recovering from it might take a longer than is
6386
   * appropriate now that we're not in standby mode anymore.
6387
   */
6388
0
  if (promoted)
6389
0
    RequestCheckpoint(CHECKPOINT_FORCE);
6390
0
}
6391
6392
/*
6393
 * Callback from PerformWalRecovery(), called when we switch from crash
6394
 * recovery to archive recovery mode.  Updates the control file accordingly.
6395
 */
6396
void
6397
SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr, TimeLineID replayTLI)
6398
0
{
6399
  /* initialize minRecoveryPoint to this record */
6400
0
  LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6401
0
  ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6402
0
  if (ControlFile->minRecoveryPoint < EndRecPtr)
6403
0
  {
6404
0
    ControlFile->minRecoveryPoint = EndRecPtr;
6405
0
    ControlFile->minRecoveryPointTLI = replayTLI;
6406
0
  }
6407
  /* update local copy */
6408
0
  LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
6409
0
  LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6410
6411
  /*
6412
   * The startup process can update its local copy of minRecoveryPoint from
6413
   * this point.
6414
   */
6415
0
  updateMinRecoveryPoint = true;
6416
6417
0
  UpdateControlFile();
6418
6419
  /*
6420
   * We update SharedRecoveryState while holding the lock on ControlFileLock
6421
   * so both states are consistent in shared memory.
6422
   */
6423
0
  SpinLockAcquire(&XLogCtl->info_lck);
6424
0
  XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
6425
0
  SpinLockRelease(&XLogCtl->info_lck);
6426
6427
0
  LWLockRelease(ControlFileLock);
6428
0
}
6429
6430
/*
6431
 * Callback from PerformWalRecovery(), called when we reach the end of backup.
6432
 * Updates the control file accordingly.
6433
 */
6434
void
6435
ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
6436
0
{
6437
  /*
6438
   * We have reached the end of base backup, as indicated by pg_control. The
6439
   * data on disk is now consistent (unless minRecoveryPoint is further
6440
   * ahead, which can happen if we crashed during previous recovery).  Reset
6441
   * backupStartPoint and backupEndPoint, and update minRecoveryPoint to
6442
   * make sure we don't allow starting up at an earlier point even if
6443
   * recovery is stopped and restarted soon after this.
6444
   */
6445
0
  LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6446
6447
0
  if (ControlFile->minRecoveryPoint < EndRecPtr)
6448
0
  {
6449
0
    ControlFile->minRecoveryPoint = EndRecPtr;
6450
0
    ControlFile->minRecoveryPointTLI = tli;
6451
0
  }
6452
6453
0
  ControlFile->backupStartPoint = InvalidXLogRecPtr;
6454
0
  ControlFile->backupEndPoint = InvalidXLogRecPtr;
6455
0
  ControlFile->backupEndRequired = false;
6456
0
  UpdateControlFile();
6457
6458
0
  LWLockRelease(ControlFileLock);
6459
0
}
6460
6461
/*
6462
 * Perform whatever XLOG actions are necessary at end of REDO.
6463
 *
6464
 * The goal here is to make sure that we'll be able to recover properly if
6465
 * we crash again. If we choose to write a checkpoint, we'll write a shutdown
6466
 * checkpoint rather than an on-line one. This is not particularly critical,
6467
 * but since we may be assigning a new TLI, using a shutdown checkpoint allows
6468
 * us to have the rule that TLI only changes in shutdown checkpoints, which
6469
 * allows some extra error checking in xlog_redo.
6470
 */
6471
static bool
6472
PerformRecoveryXLogAction(void)
6473
0
{
6474
0
  bool    promoted = false;
6475
6476
  /*
6477
   * Perform a checkpoint to update all our recovery activity to disk.
6478
   *
6479
   * Note that we write a shutdown checkpoint rather than an on-line one.
6480
   * This is not particularly critical, but since we may be assigning a new
6481
   * TLI, using a shutdown checkpoint allows us to have the rule that TLI
6482
   * only changes in shutdown checkpoints, which allows some extra error
6483
   * checking in xlog_redo.
6484
   *
6485
   * In promotion, only create a lightweight end-of-recovery record instead
6486
   * of a full checkpoint. A checkpoint is requested later, after we're
6487
   * fully out of recovery mode and already accepting queries.
6488
   */
6489
0
  if (ArchiveRecoveryRequested && IsUnderPostmaster &&
6490
0
    PromoteIsTriggered())
6491
0
  {
6492
0
    promoted = true;
6493
6494
    /*
6495
     * Insert a special WAL record to mark the end of recovery, since we
6496
     * aren't doing a checkpoint. That means that the checkpointer process
6497
     * may well be in the middle of a time-smoothed restartpoint and
6498
     * could continue to be for minutes after this.  That sounds strange,
6499
     * but the effect is roughly the same and it would be stranger to try
6500
     * to come out of the restartpoint and then checkpoint. We request a
6501
     * checkpoint later anyway, just for safety.
6502
     */
6503
0
    CreateEndOfRecoveryRecord();
6504
0
  }
6505
0
  else
6506
0
  {
6507
0
    RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
6508
0
              CHECKPOINT_IMMEDIATE |
6509
0
              CHECKPOINT_WAIT);
6510
0
  }
6511
6512
0
  return promoted;
6513
0
}
6514
6515
/*
6516
 * Is the system still in recovery?
6517
 *
6518
 * Unlike testing InRecovery, this works in any process that's connected to
6519
 * shared memory.
6520
 */
6521
bool
6522
RecoveryInProgress(void)
6523
0
{
6524
  /*
6525
   * We check shared state each time only until we leave recovery mode. We
6526
   * can't re-enter recovery, so there's no need to keep checking after the
6527
   * shared variable has once been seen false.
6528
   */
6529
0
  if (!LocalRecoveryInProgress)
6530
0
    return false;
6531
0
  else
6532
0
  {
6533
    /*
6534
     * use volatile pointer to make sure we make a fresh read of the
6535
     * shared variable.
6536
     */
6537
0
    volatile XLogCtlData *xlogctl = XLogCtl;
6538
6539
0
    LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
6540
6541
    /*
6542
     * Note: We don't need a memory barrier when we're still in recovery.
6543
     * We might exit recovery immediately after return, so the caller
6544
     * can't rely on 'true' meaning that we're still in recovery anyway.
6545
     */
6546
6547
0
    return LocalRecoveryInProgress;
6548
0
  }
6549
0
}
6550
6551
/*
6552
 * Returns current recovery state from shared memory.
6553
 *
6554
 * This returned state is kept consistent with the contents of the control
6555
 * file.  See details about the possible values of RecoveryState in xlog.h.
6556
 */
6557
RecoveryState
6558
GetRecoveryState(void)
6559
0
{
6560
0
  RecoveryState retval;
6561
6562
0
  SpinLockAcquire(&XLogCtl->info_lck);
6563
0
  retval = XLogCtl->SharedRecoveryState;
6564
0
  SpinLockRelease(&XLogCtl->info_lck);
6565
6566
0
  return retval;
6567
0
}
6568
6569
/*
6570
 * Is this process allowed to insert new WAL records?
6571
 *
6572
 * Ordinarily this is essentially equivalent to !RecoveryInProgress().
6573
 * But we also have provisions for forcing the result "true" or "false"
6574
 * within specific processes regardless of the global state.
6575
 */
6576
bool
6577
XLogInsertAllowed(void)
6578
0
{
6579
  /*
6580
   * If value is "unconditionally true" or "unconditionally false", just
6581
   * return it.  This provides the normal fast path once recovery is known
6582
   * done.
6583
   */
6584
0
  if (LocalXLogInsertAllowed >= 0)
6585
0
    return (bool) LocalXLogInsertAllowed;
6586
6587
  /*
6588
   * Else, must check to see if we're still in recovery.
6589
   */
6590
0
  if (RecoveryInProgress())
6591
0
    return false;
6592
6593
  /*
6594
   * On exit from recovery, reset to "unconditionally true", since there is
6595
   * no need to keep checking.
6596
   */
6597
0
  LocalXLogInsertAllowed = 1;
6598
0
  return true;
6599
0
}
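
/*
 * A self-contained sketch of the tri-state cache used above: -1 means "ask
 * shared memory", 0 and 1 are per-process overrides.  SharedRecoveryDone
 * below stands in for the shared recovery state; names are illustrative.
 */
#include <stdbool.h>

static volatile bool SharedRecoveryDone = false;
static int  LocalAllowed = -1;  /* -1 = recheck, 0 = forced false, 1 = forced true */

static bool
sketch_insert_allowed(void)
{
  if (LocalAllowed >= 0)
    return (bool) LocalAllowed; /* fast path once latched */
  if (!SharedRecoveryDone)
    return false;               /* still in recovery; keep rechecking */
  LocalAllowed = 1;             /* recovery cannot restart, so latch "true" */
  return true;
}
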
6600
6601
/*
6602
 * Make XLogInsertAllowed() return true in the current process only.
6603
 *
6604
 * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
6605
 * and even call LocalSetXLogInsertAllowed() again after that.
6606
 *
6607
 * Returns the previous value of LocalXLogInsertAllowed.
6608
 */
6609
static int
6610
LocalSetXLogInsertAllowed(void)
6611
0
{
6612
0
  int     oldXLogAllowed = LocalXLogInsertAllowed;
6613
6614
0
  LocalXLogInsertAllowed = 1;
6615
6616
0
  return oldXLogAllowed;
6617
0
}
6618
6619
/*
6620
 * Return the current Redo pointer from shared memory.
6621
 *
6622
 * As a side-effect, the local RedoRecPtr copy is updated.
6623
 */
6624
XLogRecPtr
6625
GetRedoRecPtr(void)
6626
0
{
6627
0
  XLogRecPtr  ptr;
6628
6629
  /*
6630
   * The possibly not up-to-date copy in XLogCtl is enough. Even if we
6631
   * grabbed a WAL insertion lock to read the authoritative value in
6632
   * Insert->RedoRecPtr, someone might update it just after we've released
6633
   * the lock.
6634
   */
6635
0
  SpinLockAcquire(&XLogCtl->info_lck);
6636
0
  ptr = XLogCtl->RedoRecPtr;
6637
0
  SpinLockRelease(&XLogCtl->info_lck);
6638
6639
0
  if (RedoRecPtr < ptr)
6640
0
    RedoRecPtr = ptr;
6641
6642
0
  return RedoRecPtr;
6643
0
}
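
/*
 * The local-cache update above is deliberately monotonic.  A tiny sketch of
 * that pattern (names illustrative): the cached copy only ever advances,
 * which is safe because the shared redo pointer itself never moves backwards.
 */
#include <stdint.h>

static uint64_t cachedRedo = 0;   /* backend-local copy of the redo pointer */

static uint64_t
sketch_refresh_redo(uint64_t sharedValue)
{
  if (cachedRedo < sharedValue)
    cachedRedo = sharedValue;
  return cachedRedo;
}
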
6644
6645
/*
6646
 * Return information needed to decide whether a modified block needs a
6647
 * full-page image to be included in the WAL record.
6648
 *
6649
 * The returned values are cached copies from backend-private memory, and
6650
 * possibly out-of-date or, indeed, uninitialized, in which case they will
6651
 * be InvalidXLogRecPtr and false, respectively.  XLogInsertRecord will
6652
 * re-check them against up-to-date values, while holding the WAL insert lock.
6653
 */
6654
void
6655
GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
6656
0
{
6657
0
  *RedoRecPtr_p = RedoRecPtr;
6658
0
  *doPageWrites_p = doPageWrites;
6659
0
}
6660
6661
/*
6662
 * GetInsertRecPtr -- Returns the current insert position.
6663
 *
6664
 * NOTE: The value *actually* returned is the position of the last full
6665
 * xlog page. It lags behind the real insert position by at most 1 page.
6666
 * For that, we don't need to scan through WAL insertion locks, and an
6667
 * approximation is enough for the current usage of this function.
6668
 */
6669
XLogRecPtr
6670
GetInsertRecPtr(void)
6671
0
{
6672
0
  XLogRecPtr  recptr;
6673
6674
0
  SpinLockAcquire(&XLogCtl->info_lck);
6675
0
  recptr = XLogCtl->LogwrtRqst.Write;
6676
0
  SpinLockRelease(&XLogCtl->info_lck);
6677
6678
0
  return recptr;
6679
0
}
6680
6681
/*
6682
 * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
6683
 * position known to be fsync'd to disk. This should only be used on a
6684
 * system that is known not to be in recovery.
6685
 */
6686
XLogRecPtr
6687
GetFlushRecPtr(TimeLineID *insertTLI)
6688
0
{
6689
0
  Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
6690
6691
0
  RefreshXLogWriteResult(LogwrtResult);
6692
6693
  /*
6694
   * If we're writing and flushing WAL, the time line can't be changing, so
6695
   * no lock is required.
6696
   */
6697
0
  if (insertTLI)
6698
0
    *insertTLI = XLogCtl->InsertTimeLineID;
6699
6700
0
  return LogwrtResult.Flush;
6701
0
}
6702
6703
/*
6704
 * GetWALInsertionTimeLine -- Returns the current timeline of a system that
6705
 * is not in recovery.
6706
 */
6707
TimeLineID
6708
GetWALInsertionTimeLine(void)
6709
0
{
6710
0
  Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
6711
6712
  /* Since the value can't be changing, no lock is required. */
6713
0
  return XLogCtl->InsertTimeLineID;
6714
0
}
6715
6716
/*
6717
 * GetWALInsertionTimeLineIfSet -- If the system is not in recovery, returns
6718
 * the WAL insertion timeline; else, returns 0. Wherever possible, use
6719
 * GetWALInsertionTimeLine() instead, since it's cheaper. Note that this
6720
 * function decides recovery has ended as soon as the insert TLI is set, which
6721
 * happens before we set XLogCtl->SharedRecoveryState to RECOVERY_STATE_DONE.
6722
 */
6723
TimeLineID
6724
GetWALInsertionTimeLineIfSet(void)
6725
0
{
6726
0
  TimeLineID  insertTLI;
6727
6728
0
  SpinLockAcquire(&XLogCtl->info_lck);
6729
0
  insertTLI = XLogCtl->InsertTimeLineID;
6730
0
  SpinLockRelease(&XLogCtl->info_lck);
6731
6732
0
  return insertTLI;
6733
0
}
6734
6735
/*
6736
 * GetLastImportantRecPtr -- Returns the LSN of the last important record
6737
 * inserted. All records not explicitly marked as unimportant are considered
6738
 * important.
6739
 *
6740
 * The LSN is determined by computing the maximum of
6741
 * WALInsertLocks[i].lastImportantAt.
6742
 */
6743
XLogRecPtr
6744
GetLastImportantRecPtr(void)
6745
0
{
6746
0
  XLogRecPtr  res = InvalidXLogRecPtr;
6747
0
  int     i;
6748
6749
0
  for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
6750
0
  {
6751
0
    XLogRecPtr  last_important;
6752
6753
    /*
6754
     * Need to take a lock to prevent torn reads of the LSN, which are
6755
     * possible on some of the supported platforms. WAL insert locks only
6756
     * support exclusive mode, so we have to use that.
6757
     */
6758
0
    LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
6759
0
    last_important = WALInsertLocks[i].l.lastImportantAt;
6760
0
    LWLockRelease(&WALInsertLocks[i].l.lock);
6761
6762
0
    if (res < last_important)
6763
0
      res = last_important;
6764
0
  }
6765
6766
0
  return res;
6767
0
}
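
/*
 * A self-contained sketch of the max-reduction above: take each slot's lock
 * for the read so a 64-bit value cannot be torn on platforms without atomic
 * 64-bit loads.  The slot count and all names here are illustrative.
 */
#include <pthread.h>
#include <stdint.h>

#define SKETCH_NSLOTS 8

static struct
{
  pthread_mutex_t lock;
  uint64_t    lastImportantAt;
} slots[SKETCH_NSLOTS];

static void
sketch_init(void)
{
  for (int i = 0; i < SKETCH_NSLOTS; i++)
    pthread_mutex_init(&slots[i].lock, NULL);
}

static uint64_t
sketch_last_important(void)
{
  uint64_t  res = 0;

  for (int i = 0; i < SKETCH_NSLOTS; i++)
  {
    uint64_t  v;

    pthread_mutex_lock(&slots[i].lock);
    v = slots[i].lastImportantAt;
    pthread_mutex_unlock(&slots[i].lock);
    if (res < v)
      res = v;
  }
  return res;
}
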
6768
6769
/*
6770
 * Get the time and LSN of the last xlog segment switch
6771
 */
6772
pg_time_t
6773
GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
6774
0
{
6775
0
  pg_time_t result;
6776
6777
  /* Need WALWriteLock, but shared lock is sufficient */
6778
0
  LWLockAcquire(WALWriteLock, LW_SHARED);
6779
0
  result = XLogCtl->lastSegSwitchTime;
6780
0
  *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
6781
0
  LWLockRelease(WALWriteLock);
6782
6783
0
  return result;
6784
0
}
6785
6786
/*
6787
 * This must be called ONCE during postmaster or standalone-backend shutdown
6788
 */
6789
void
6790
ShutdownXLOG(int code, Datum arg)
6791
0
{
6792
  /*
6793
   * We should have an aux process resource owner to use, and we should not
6794
   * be in a transaction that's installed some other resowner.
6795
   */
6796
0
  Assert(AuxProcessResourceOwner != NULL);
6797
0
  Assert(CurrentResourceOwner == NULL ||
6798
0
       CurrentResourceOwner == AuxProcessResourceOwner);
6799
0
  CurrentResourceOwner = AuxProcessResourceOwner;
6800
6801
  /* Don't be chatty in standalone mode */
6802
0
  ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6803
0
      (errmsg("shutting down")));
6804
6805
  /*
6806
   * Signal walsenders to move to stopping state.
6807
   */
6808
0
  WalSndInitStopping();
6809
6810
  /*
6811
   * Wait for WAL senders to be in stopping state.  This prevents commands
6812
   * from writing new WAL.
6813
   */
6814
0
  WalSndWaitStopping();
6815
6816
0
  if (RecoveryInProgress())
6817
0
    CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6818
0
  else
6819
0
  {
6820
    /*
6821
     * If archiving is enabled, rotate the last XLOG file so that all the
6822
     * remaining records are archived (postmaster wakes up the archiver
6823
     * process one more time at the end of shutdown). The checkpoint
6824
     * record will go to the next XLOG file and won't be archived (yet).
6825
     */
6826
0
    if (XLogArchivingActive())
6827
0
      RequestXLogSwitch(false);
6828
6829
0
    CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6830
0
  }
6831
0
}
6832
6833
/*
6834
 * Log start of a checkpoint.
6835
 */
6836
static void
6837
LogCheckpointStart(int flags, bool restartpoint)
6838
0
{
6839
0
  if (restartpoint)
6840
0
    ereport(LOG,
6841
    /* translator: the placeholders show checkpoint options */
6842
0
        (errmsg("restartpoint starting:%s%s%s%s%s%s%s%s",
6843
0
            (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
6844
0
            (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
6845
0
            (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
6846
0
            (flags & CHECKPOINT_FORCE) ? " force" : "",
6847
0
            (flags & CHECKPOINT_WAIT) ? " wait" : "",
6848
0
            (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
6849
0
            (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
6850
0
            (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
6851
0
  else
6852
0
    ereport(LOG,
6853
    /* translator: the placeholders show checkpoint options */
6854
0
        (errmsg("checkpoint starting:%s%s%s%s%s%s%s%s",
6855
0
            (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
6856
0
            (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
6857
0
            (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
6858
0
            (flags & CHECKPOINT_FORCE) ? " force" : "",
6859
0
            (flags & CHECKPOINT_WAIT) ? " wait" : "",
6860
0
            (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
6861
0
            (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
6862
0
            (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
6863
0
}
6864
6865
/*
6866
 * Log end of a checkpoint.
6867
 */
6868
static void
6869
LogCheckpointEnd(bool restartpoint)
6870
0
{
6871
0
  long    write_msecs,
6872
0
        sync_msecs,
6873
0
        total_msecs,
6874
0
        longest_msecs,
6875
0
        average_msecs;
6876
0
  uint64    average_sync_time;
6877
6878
0
  CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
6879
6880
0
  write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
6881
0
                          CheckpointStats.ckpt_sync_t);
6882
6883
0
  sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
6884
0
                         CheckpointStats.ckpt_sync_end_t);
6885
6886
  /* Accumulate checkpoint timing summary data, in milliseconds. */
6887
0
  PendingCheckpointerStats.write_time += write_msecs;
6888
0
  PendingCheckpointerStats.sync_time += sync_msecs;
6889
6890
  /*
6891
   * All of the published timing statistics are accounted for.  Only
6892
   * continue if a log message is to be written.
6893
   */
6894
0
  if (!log_checkpoints)
6895
0
    return;
6896
6897
0
  total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
6898
0
                          CheckpointStats.ckpt_end_t);
6899
6900
  /*
6901
   * Timing values returned from CheckpointStats are in microseconds.
6902
   * Convert to milliseconds for consistent printing.
6903
   */
6904
0
  longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);
6905
6906
0
  average_sync_time = 0;
6907
0
  if (CheckpointStats.ckpt_sync_rels > 0)
6908
0
    average_sync_time = CheckpointStats.ckpt_agg_sync_time /
6909
0
      CheckpointStats.ckpt_sync_rels;
6910
0
  average_msecs = (long) ((average_sync_time + 999) / 1000);
6911
6912
  /*
6913
   * ControlFileLock is not required to see ControlFile->checkPoint and
6914
   * ->checkPointCopy here, as we are the only updater of those variables at
6915
   * this moment.
6916
   */
6917
0
  if (restartpoint)
6918
0
    ereport(LOG,
6919
0
        (errmsg("restartpoint complete: wrote %d buffers (%.1f%%), "
6920
0
            "wrote %d SLRU buffers; %d WAL file(s) added, "
6921
0
            "%d removed, %d recycled; write=%ld.%03d s, "
6922
0
            "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
6923
0
            "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
6924
0
            "estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X",
6925
0
            CheckpointStats.ckpt_bufs_written,
6926
0
            (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6927
0
            CheckpointStats.ckpt_slru_written,
6928
0
            CheckpointStats.ckpt_segs_added,
6929
0
            CheckpointStats.ckpt_segs_removed,
6930
0
            CheckpointStats.ckpt_segs_recycled,
6931
0
            write_msecs / 1000, (int) (write_msecs % 1000),
6932
0
            sync_msecs / 1000, (int) (sync_msecs % 1000),
6933
0
            total_msecs / 1000, (int) (total_msecs % 1000),
6934
0
            CheckpointStats.ckpt_sync_rels,
6935
0
            longest_msecs / 1000, (int) (longest_msecs % 1000),
6936
0
            average_msecs / 1000, (int) (average_msecs % 1000),
6937
0
            (int) (PrevCheckPointDistance / 1024.0),
6938
0
            (int) (CheckPointDistanceEstimate / 1024.0),
6939
0
            LSN_FORMAT_ARGS(ControlFile->checkPoint),
6940
0
            LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
6941
0
  else
6942
0
    ereport(LOG,
6943
0
        (errmsg("checkpoint complete: wrote %d buffers (%.1f%%), "
6944
0
            "wrote %d SLRU buffers; %d WAL file(s) added, "
6945
0
            "%d removed, %d recycled; write=%ld.%03d s, "
6946
0
            "sync=%ld.%03d s, total=%ld.%03d s; sync files=%d, "
6947
0
            "longest=%ld.%03d s, average=%ld.%03d s; distance=%d kB, "
6948
0
            "estimate=%d kB; lsn=%X/%X, redo lsn=%X/%X",
6949
0
            CheckpointStats.ckpt_bufs_written,
6950
0
            (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
6951
0
            CheckpointStats.ckpt_slru_written,
6952
0
            CheckpointStats.ckpt_segs_added,
6953
0
            CheckpointStats.ckpt_segs_removed,
6954
0
            CheckpointStats.ckpt_segs_recycled,
6955
0
            write_msecs / 1000, (int) (write_msecs % 1000),
6956
0
            sync_msecs / 1000, (int) (sync_msecs % 1000),
6957
0
            total_msecs / 1000, (int) (total_msecs % 1000),
6958
0
            CheckpointStats.ckpt_sync_rels,
6959
0
            longest_msecs / 1000, (int) (longest_msecs % 1000),
6960
0
            average_msecs / 1000, (int) (average_msecs % 1000),
6961
0
            (int) (PrevCheckPointDistance / 1024.0),
6962
0
            (int) (CheckPointDistanceEstimate / 1024.0),
6963
0
            LSN_FORMAT_ARGS(ControlFile->checkPoint),
6964
0
            LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo))));
6965
0
}
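
/*
 * The duration formatting above uses two small integer tricks: rounding
 * microseconds up to whole milliseconds with (x + 999) / 1000, and splitting
 * milliseconds into "seconds.milliseconds" with / 1000 and % 1000.  A
 * self-contained illustration with invented numbers:
 */
#include <stdio.h>

int
main(void)
{
  long    usecs = 1500;                   /* example sync time in microseconds */
  long    msecs = (usecs + 999) / 1000;   /* ceiling division: 1500 us -> 2 ms */
  long    total = 61234;                  /* example total, in milliseconds */

  printf("longest=%ld ms\n", msecs);
  printf("total=%ld.%03d s\n", total / 1000, (int) (total % 1000));  /* 61.234 s */
  return 0;
}
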
6966
6967
/*
6968
 * Update the estimate of distance between checkpoints.
6969
 *
6970
 * The estimate is used to calculate the number of WAL segments to keep
6971
 * preallocated, see XLOGfileslop().
6972
 */
6973
static void
6974
UpdateCheckPointDistanceEstimate(uint64 nbytes)
6975
0
{
6976
  /*
6977
   * To estimate the number of segments consumed between checkpoints, keep a
6978
   * moving average of the amount of WAL generated in previous checkpoint
6979
   * cycles. However, if the load is bursty, with quiet periods and busy
6980
   * periods, we want to cater for the peak load. So instead of a plain
6981
   * moving average, let the average decline slowly if the previous cycle
6982
   * used less WAL than estimated, but bump it up immediately if it used
6983
   * more.
6984
   *
6985
   * When checkpoints are triggered by max_wal_size, this should converge to
6986
   * CheckpointSegments * wal_segment_size.
6987
   *
6988
   * Note: This doesn't pay any attention to what caused the checkpoint.
6989
   * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
6990
   * starting a base backup, are counted the same as those created
6991
   * automatically. The slow-decline will largely mask them out, if they are
6992
   * not frequent. If they are frequent, it seems reasonable to count them
6993
   * in as any others; if you issue a manual checkpoint every 5 minutes and
6994
   * never let a timed checkpoint happen, it makes sense to base the
6995
   * preallocation on that 5 minute interval rather than whatever
6996
   * checkpoint_timeout is set to.
6997
   */
6998
0
  PrevCheckPointDistance = nbytes;
6999
0
  if (CheckPointDistanceEstimate < nbytes)
7000
0
    CheckPointDistanceEstimate = nbytes;
7001
0
  else
7002
0
    CheckPointDistanceEstimate =
7003
0
      (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
7004
0
}
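
/*
 * A worked example of the asymmetric average above (numbers invented): the
 * estimate decays slowly across quiet cycles but tracks busy cycles at once.
 */
#include <stdio.h>

static double sketchEstimate = 0.0;

static void
sketch_update(double nbytes)
{
  if (sketchEstimate < nbytes)
    sketchEstimate = nbytes;    /* bump up immediately */
  else
    sketchEstimate = 0.90 * sketchEstimate + 0.10 * nbytes;  /* decline slowly */
}

int
main(void)
{
  sketch_update(100e6);   /* estimate = 100 MB */
  sketch_update(80e6);    /* quiet cycle: 0.9*100 + 0.1*80 = 98 MB */
  sketch_update(150e6);   /* busy cycle: jumps straight to 150 MB */
  printf("%.0f bytes\n", sketchEstimate);
  return 0;
}
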
7005
7006
/*
7007
 * Update the ps display for a process running a checkpoint.  Note that
7008
 * this routine should not do any allocations so that it can be called
7009
 * from a critical section.
7010
 */
7011
static void
7012
update_checkpoint_display(int flags, bool restartpoint, bool reset)
7013
0
{
7014
  /*
7015
   * The status is reported only for end-of-recovery and shutdown
7016
   * checkpoints or shutdown restartpoints.  Updating the ps display is
7017
   * useful in those situations as it may not be possible to rely on
7018
   * pg_stat_activity to see the status of the checkpointer or the startup
7019
   * process.
7020
   */
7021
0
  if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
7022
0
    return;
7023
7024
0
  if (reset)
7025
0
    set_ps_display("");
7026
0
  else
7027
0
  {
7028
0
    char    activitymsg[128];
7029
7030
0
    snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
7031
0
         (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
7032
0
         (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
7033
0
         restartpoint ? "restartpoint" : "checkpoint");
7034
0
    set_ps_display(activitymsg);
7035
0
  }
7036
0
}
7037
7038
7039
/*
7040
 * Perform a checkpoint --- either during shutdown, or on-the-fly
7041
 *
7042
 * flags is a bitwise OR of the following:
7043
 *  CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
7044
 *  CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
7045
 *  CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
7046
 *    ignoring checkpoint_completion_target parameter.
7047
 *  CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
7048
 *    since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
7049
 *    CHECKPOINT_END_OF_RECOVERY).
7050
 *  CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
7051
 *
7052
 * Note: flags contains other bits, of interest here only for logging purposes.
7053
 * In particular note that this routine is synchronous and does not pay
7054
 * attention to CHECKPOINT_WAIT.
7055
 *
7056
 * If !shutdown then we are writing an online checkpoint. An XLOG_CHECKPOINT_REDO
7057
 * record is inserted into WAL at the logical location of the checkpoint, before
7058
 * flushing anything to disk, and when the checkpoint is eventually completed,
7059
 * it is from this point that WAL replay will begin in the case of a recovery
7060
 * from this checkpoint. Once everything is written to disk, an
7061
 * XLOG_CHECKPOINT_ONLINE record is written to complete the checkpoint, and
7062
 * points back to the earlier XLOG_CHECKPOINT_REDO record. This mechanism allows
7063
 * other write-ahead log records to be written while the checkpoint is in
7064
 * progress, but we must be very careful about order of operations. This function
7065
 * may take many minutes to execute on a busy system.
7066
 *
7067
 * On the other hand, when shutdown is true, concurrent insertion into the
7068
 * write-ahead log is impossible, so there is no need for two separate records.
7069
 * In this case, we only insert an XLOG_CHECKPOINT_SHUTDOWN record, and it's
7070
 * both the record marking the completion of the checkpoint and the location
7071
 * from which WAL replay would begin if needed.
7072
 *
7073
 * Returns true if a new checkpoint was performed, or false if it was skipped
7074
 * because the system was idle.
7075
 */
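
/*
 * For reference, ordinary callers do not invoke CreateCheckPoint() directly;
 * they ask the checkpointer.  The CHECKPOINT command's call site, for
 * example, looks roughly like this (sketch):
 *
 *    RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT |
 *              (RecoveryInProgress() ? 0 : CHECKPOINT_FORCE));
 */
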
7076
bool
7077
CreateCheckPoint(int flags)
7078
0
{
7079
0
  bool    shutdown;
7080
0
  CheckPoint  checkPoint;
7081
0
  XLogRecPtr  recptr;
7082
0
  XLogSegNo _logSegNo;
7083
0
  XLogCtlInsert *Insert = &XLogCtl->Insert;
7084
0
  uint32    freespace;
7085
0
  XLogRecPtr  PriorRedoPtr;
7086
0
  XLogRecPtr  last_important_lsn;
7087
0
  VirtualTransactionId *vxids;
7088
0
  int     nvxids;
7089
0
  int     oldXLogAllowed = 0;
7090
7091
  /*
7092
   * An end-of-recovery checkpoint is really a shutdown checkpoint, just
7093
   * issued at a different time.
7094
   */
7095
0
  if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
7096
0
    shutdown = true;
7097
0
  else
7098
0
    shutdown = false;
7099
7100
  /* sanity check */
7101
0
  if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
7102
0
    elog(ERROR, "can't create a checkpoint during recovery");
7103
7104
  /*
7105
   * Prepare to accumulate statistics.
7106
   *
7107
   * Note: because it is possible for log_checkpoints to change while a
7108
   * checkpoint proceeds, we always accumulate stats, even if
7109
   * log_checkpoints is currently off.
7110
   */
7111
0
  MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7112
0
  CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7113
7114
  /*
7115
   * Let smgr prepare for checkpoint; this has to happen outside the
7116
   * critical section and before we determine the REDO pointer.  Note that
7117
   * smgr must not do anything that'd have to be undone if we decide no
7118
   * checkpoint is needed.
7119
   */
7120
0
  SyncPreCheckpoint();
7121
7122
  /*
7123
   * Use a critical section to force system panic if we have trouble.
7124
   */
7125
0
  START_CRIT_SECTION();
7126
7127
0
  if (shutdown)
7128
0
  {
7129
0
    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7130
0
    ControlFile->state = DB_SHUTDOWNING;
7131
0
    UpdateControlFile();
7132
0
    LWLockRelease(ControlFileLock);
7133
0
  }
7134
7135
  /* Begin filling in the checkpoint WAL record */
7136
0
  MemSet(&checkPoint, 0, sizeof(checkPoint));
7137
0
  checkPoint.time = (pg_time_t) time(NULL);
7138
7139
  /*
7140
   * For Hot Standby, derive the oldestActiveXid before we fix the redo
7141
   * pointer. This allows us to begin accumulating changes to assemble our
7142
   * starting snapshot of locks and transactions.
7143
   */
7144
0
  if (!shutdown && XLogStandbyInfoActive())
7145
0
    checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
7146
0
  else
7147
0
    checkPoint.oldestActiveXid = InvalidTransactionId;
7148
7149
  /*
7150
   * Get location of last important record before acquiring insert locks (as
7151
   * GetLastImportantRecPtr() also acquires the WAL insertion locks).
7152
   */
7153
0
  last_important_lsn = GetLastImportantRecPtr();
7154
7155
  /*
7156
   * If this isn't a shutdown or forced checkpoint, and if there has been no
7157
   * WAL activity requiring a checkpoint, skip it.  The idea here is to
7158
   * avoid inserting duplicate checkpoints when the system is idle.
7159
   */
7160
0
  if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
7161
0
          CHECKPOINT_FORCE)) == 0)
7162
0
  {
7163
0
    if (last_important_lsn == ControlFile->checkPoint)
7164
0
    {
7165
0
      END_CRIT_SECTION();
7166
0
      ereport(DEBUG1,
7167
0
          (errmsg_internal("checkpoint skipped because system is idle")));
7168
0
      return false;
7169
0
    }
7170
0
  }
7171
7172
  /*
7173
   * An end-of-recovery checkpoint is created before anyone is allowed to
7174
   * write WAL. To allow us to write the checkpoint record, temporarily
7175
   * enable XLogInsertAllowed.
7176
   */
7177
0
  if (flags & CHECKPOINT_END_OF_RECOVERY)
7178
0
    oldXLogAllowed = LocalSetXLogInsertAllowed();
7179
7180
0
  checkPoint.ThisTimeLineID = XLogCtl->InsertTimeLineID;
7181
0
  if (flags & CHECKPOINT_END_OF_RECOVERY)
7182
0
    checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
7183
0
  else
7184
0
    checkPoint.PrevTimeLineID = checkPoint.ThisTimeLineID;
7185
7186
  /*
7187
   * We must block concurrent insertions while examining insert state.
7188
   */
7189
0
  WALInsertLockAcquireExclusive();
7190
7191
0
  checkPoint.fullPageWrites = Insert->fullPageWrites;
7192
0
  checkPoint.wal_level = wal_level;
7193
7194
0
  if (shutdown)
7195
0
  {
7196
0
    XLogRecPtr  curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
7197
7198
    /*
7199
     * Compute new REDO record ptr = location of next XLOG record.
7200
     *
7201
     * Since this is a shutdown checkpoint, there can't be any concurrent
7202
     * WAL insertion.
7203
     */
7204
0
    freespace = INSERT_FREESPACE(curInsert);
7205
0
    if (freespace == 0)
7206
0
    {
7207
0
      if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
7208
0
        curInsert += SizeOfXLogLongPHD;
7209
0
      else
7210
0
        curInsert += SizeOfXLogShortPHD;
7211
0
    }
7212
0
    checkPoint.redo = curInsert;
7213
7214
    /*
7215
     * Here we update the shared RedoRecPtr for future XLogInsert calls;
7216
     * this must be done while holding all the insertion locks.
7217
     *
7218
     * Note: if we fail to complete the checkpoint, RedoRecPtr will be
7219
     * left pointing past where it really needs to point.  This is okay;
7220
     * the only consequence is that XLogInsert might back up whole buffers
7221
     * that it didn't really need to.  We can't postpone advancing
7222
     * RedoRecPtr because XLogInserts that happen while we are dumping
7223
     * buffers must assume that their buffer changes are not included in
7224
     * the checkpoint.
7225
     */
7226
0
    RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
7227
0
  }
7228
7229
  /*
7230
   * Now we can release the WAL insertion locks, allowing other xacts to
7231
   * proceed while we are flushing disk buffers.
7232
   */
7233
0
  WALInsertLockRelease();
7234
7235
  /*
7236
   * If this is an online checkpoint, we have not yet determined the redo
7237
   * point. We do so now by inserting the special XLOG_CHECKPOINT_REDO
7238
   * record; the LSN at which it starts becomes the new redo pointer. We
7239
   * don't do this for a shutdown checkpoint, because in that case no WAL
7240
   * can be written between the redo point and the insertion of the
7241
   * checkpoint record itself, so the checkpoint record itself serves to
7242
   * mark the redo point.
7243
   */
7244
0
  if (!shutdown)
7245
0
  {
7246
    /* Include WAL level in record for WAL summarizer's benefit. */
7247
0
    XLogBeginInsert();
7248
0
    XLogRegisterData(&wal_level, sizeof(wal_level));
7249
0
    (void) XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_REDO);
7250
7251
    /*
7252
     * XLogInsertRecord will have updated XLogCtl->Insert.RedoRecPtr in
7253
     * shared memory and RedoRecPtr in backend-local memory, but we need
7254
     * to copy that into the record that will be inserted when the
7255
     * checkpoint is complete.
7256
     */
7257
0
    checkPoint.redo = RedoRecPtr;
7258
0
  }
7259
7260
  /* Update the info_lck-protected copy of RedoRecPtr as well */
7261
0
  SpinLockAcquire(&XLogCtl->info_lck);
7262
0
  XLogCtl->RedoRecPtr = checkPoint.redo;
7263
0
  SpinLockRelease(&XLogCtl->info_lck);
7264
7265
  /*
7266
   * If enabled, log checkpoint start.  We postpone this until now so as not
7267
   * to log anything if we decided to skip the checkpoint.
7268
   */
7269
0
  if (log_checkpoints)
7270
0
    LogCheckpointStart(flags, false);
7271
7272
  /* Update the process title */
7273
0
  update_checkpoint_display(flags, false, false);
7274
7275
0
  TRACE_POSTGRESQL_CHECKPOINT_START(flags);
7276
7277
  /*
7278
   * Get the other info we need for the checkpoint record.
7279
   *
7280
   * We don't need to save oldestClogXid in the checkpoint, it only matters
7281
   * for the short period in which clog is being truncated, and if we crash
7282
   * during that we'll redo the clog truncation and fix up oldestClogXid
7283
   * there.
7284
   */
7285
0
  LWLockAcquire(XidGenLock, LW_SHARED);
7286
0
  checkPoint.nextXid = TransamVariables->nextXid;
7287
0
  checkPoint.oldestXid = TransamVariables->oldestXid;
7288
0
  checkPoint.oldestXidDB = TransamVariables->oldestXidDB;
7289
0
  LWLockRelease(XidGenLock);
7290
7291
0
  LWLockAcquire(CommitTsLock, LW_SHARED);
7292
0
  checkPoint.oldestCommitTsXid = TransamVariables->oldestCommitTsXid;
7293
0
  checkPoint.newestCommitTsXid = TransamVariables->newestCommitTsXid;
7294
0
  LWLockRelease(CommitTsLock);
7295
7296
0
  LWLockAcquire(OidGenLock, LW_SHARED);
7297
0
  checkPoint.nextOid = TransamVariables->nextOid;
7298
0
  if (!shutdown)
7299
0
    checkPoint.nextOid += TransamVariables->oidCount;
7300
0
  LWLockRelease(OidGenLock);
7301
7302
0
  MultiXactGetCheckptMulti(shutdown,
7303
0
               &checkPoint.nextMulti,
7304
0
               &checkPoint.nextMultiOffset,
7305
0
               &checkPoint.oldestMulti,
7306
0
               &checkPoint.oldestMultiDB);
7307
7308
  /*
7309
   * Having constructed the checkpoint record, ensure all shmem disk buffers
7310
   * and commit-log buffers are flushed to disk.
7311
   *
7312
   * This I/O could fail for various reasons.  If so, we will fail to
7313
   * complete the checkpoint, but there is no reason to force a system
7314
   * panic. Accordingly, exit critical section while doing it.
7315
   */
7316
0
  END_CRIT_SECTION();
7317
7318
  /*
7319
   * In some cases there are groups of actions that must all occur on one
7320
   * side or the other of a checkpoint record. Before flushing the
7321
   * checkpoint record we must explicitly wait for any backend currently
7322
   * performing those groups of actions.
7323
   *
7324
   * One example is end of transaction, so we must wait for any transactions
7325
   * that are currently in commit critical sections.  If an xact inserted
7326
   * its commit record into XLOG just before the REDO point, then a crash
7327
   * restart from the REDO point would not replay that record, which means
7328
   * that our flushing had better include the xact's update of pg_xact.  So
7329
   * we wait till he's out of his commit critical section before proceeding.
7330
   * See notes in RecordTransactionCommit().
7331
   *
7332
   * Because we've already released the insertion locks, this test is a bit
7333
   * fuzzy: it is possible that we will wait for xacts we didn't really need
7334
   * to wait for.  But the delay should be short and it seems better to make
7335
   * checkpoint take a bit longer than to hold off insertions longer than
7336
   * necessary. (In fact, the whole reason we have this issue is that xact.c
7337
   * does commit record XLOG insertion and clog update as two separate steps
7338
   * protected by different locks, but again that seems best on grounds of
7339
   * minimizing lock contention.)
7340
   *
7341
   * A transaction that has not yet set delayChkptFlags when we look cannot
7342
   * be at risk, since it has not inserted its commit record yet; and one
7343
   * that's already cleared it is not at risk either, since it's done fixing
7344
   * clog and we will correctly flush the update below.  So we cannot miss
7345
   * any xacts we need to wait for.
7346
   */
7347
0
  vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
7348
0
  if (nvxids > 0)
7349
0
  {
7350
0
    do
7351
0
    {
7352
      /*
7353
       * Keep absorbing fsync requests while we wait. There could even
7354
       * be a deadlock if we don't, if the process that prevents the
7355
       * checkpoint is trying to add a request to the queue.
7356
       */
7357
0
      AbsorbSyncRequests();
7358
7359
0
      pgstat_report_wait_start(WAIT_EVENT_CHECKPOINT_DELAY_START);
7360
0
      pg_usleep(10000L);  /* wait for 10 msec */
7361
0
      pgstat_report_wait_end();
7362
0
    } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
7363
0
                        DELAY_CHKPT_START));
7364
0
  }
7365
0
  pfree(vxids);
7366
7367
0
  CheckPointGuts(checkPoint.redo, flags);
7368
7369
0
  vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
7370
0
  if (nvxids > 0)
7371
0
  {
7372
0
    do
7373
0
    {
7374
0
      AbsorbSyncRequests();
7375
7376
0
      pgstat_report_wait_start(WAIT_EVENT_CHECKPOINT_DELAY_COMPLETE);
7377
0
      pg_usleep(10000L);  /* wait for 10 msec */
7378
0
      pgstat_report_wait_end();
7379
0
    } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
7380
0
                        DELAY_CHKPT_COMPLETE));
7381
0
  }
7382
0
  pfree(vxids);
7383
7384
  /*
7385
   * Take a snapshot of running transactions and write this to WAL. This
7386
   * allows us to reconstruct the state of running transactions during
7387
   * archive recovery, if required. Skip, if this info disabled.
7388
   *
7389
   * If we are shutting down, or Startup process is completing crash
7390
   * recovery we don't need to write running xact data.
7391
   */
7392
0
  if (!shutdown && XLogStandbyInfoActive())
7393
0
    LogStandbySnapshot();
7394
7395
0
  START_CRIT_SECTION();
7396
7397
  /*
7398
   * Now insert the checkpoint record into XLOG.
7399
   */
7400
0
  XLogBeginInsert();
7401
0
  XLogRegisterData(&checkPoint, sizeof(checkPoint));
7402
0
  recptr = XLogInsert(RM_XLOG_ID,
7403
0
            shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
7404
0
            XLOG_CHECKPOINT_ONLINE);
7405
7406
0
  XLogFlush(recptr);
7407
7408
  /*
7409
   * We mustn't write any new WAL after a shutdown checkpoint, or it will be
7410
   * overwritten at next startup.  No-one should even try, this just allows
7411
   * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
7412
   * to just temporarily disable writing until the system has exited
7413
   * recovery.
7414
   */
7415
0
  if (shutdown)
7416
0
  {
7417
0
    if (flags & CHECKPOINT_END_OF_RECOVERY)
7418
0
      LocalXLogInsertAllowed = oldXLogAllowed;
7419
0
    else
7420
0
      LocalXLogInsertAllowed = 0; /* never again write WAL */
7421
0
  }
7422
7423
  /*
7424
   * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
7425
   * = end of actual checkpoint record.
7426
   */
7427
0
  if (shutdown && checkPoint.redo != ProcLastRecPtr)
7428
0
    ereport(PANIC,
7429
0
        (errmsg("concurrent write-ahead log activity while database system is shutting down")));
7430
7431
  /*
7432
   * Remember the prior checkpoint's redo ptr for
7433
   * UpdateCheckPointDistanceEstimate()
7434
   */
7435
0
  PriorRedoPtr = ControlFile->checkPointCopy.redo;
7436
7437
  /*
7438
   * Update the control file.
7439
   */
7440
0
  LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7441
0
  if (shutdown)
7442
0
    ControlFile->state = DB_SHUTDOWNED;
7443
0
  ControlFile->checkPoint = ProcLastRecPtr;
7444
0
  ControlFile->checkPointCopy = checkPoint;
7445
  /* crash recovery should always recover to the end of WAL */
7446
0
  ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
7447
0
  ControlFile->minRecoveryPointTLI = 0;
7448
7449
  /*
7450
   * Persist unloggedLSN value. It's reset on crash recovery, so this goes
7451
   * unused on non-shutdown checkpoints, but seems useful to store it always
7452
   * for debugging purposes.
7453
   */
7454
0
  ControlFile->unloggedLSN = pg_atomic_read_membarrier_u64(&XLogCtl->unloggedLSN);
7455
7456
0
  UpdateControlFile();
7457
0
  LWLockRelease(ControlFileLock);
7458
7459
  /* Update shared-memory copy of checkpoint XID/epoch */
7460
0
  SpinLockAcquire(&XLogCtl->info_lck);
7461
0
  XLogCtl->ckptFullXid = checkPoint.nextXid;
7462
0
  SpinLockRelease(&XLogCtl->info_lck);
7463
7464
  /*
7465
   * We are now done with critical updates; no need for system panic if we
7466
   * have trouble while fooling with old log segments.
7467
   */
7468
0
  END_CRIT_SECTION();
7469
7470
  /*
7471
   * WAL summaries end when the next XLOG_CHECKPOINT_REDO or
7472
   * XLOG_CHECKPOINT_SHUTDOWN record is reached. This is the first point
7473
   * where (a) we're not inside of a critical section and (b) we can be
7474
   * certain that the relevant record has been flushed to disk, which must
7475
   * happen before it can be summarized.
7476
   *
7477
   * If this is a shutdown checkpoint, then this happens reasonably
7478
   * promptly: we've only just inserted and flushed the
7479
   * XLOG_CHECKPOINT_SHUTDOWN record. If this is not a shutdown checkpoint,
7480
   * then this might not be very prompt at all: the XLOG_CHECKPOINT_REDO
7481
   * record was written before we began flushing data to disk, and that
7482
   * could be many minutes ago at this point. However, we don't XLogFlush()
7483
   * after inserting that record, so we're not guaranteed that it's on disk
7484
   * until after the above call that flushes the XLOG_CHECKPOINT_ONLINE
7485
   * record.
7486
   */
7487
0
  WakeupWalSummarizer();
7488
7489
  /*
7490
   * Let smgr do post-checkpoint cleanup (eg, deleting old files).
7491
   */
7492
0
  SyncPostCheckpoint();
7493
7494
  /*
7495
   * Update the average distance between checkpoints if the prior checkpoint
7496
   * exists.
7497
   */
7498
0
  if (PriorRedoPtr != InvalidXLogRecPtr)
7499
0
    UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
7500
7501
  /*
7502
   * Delete old log files, those no longer needed for last checkpoint to
7503
   * prevent the disk holding the xlog from growing full.
7504
   */
7505
0
  XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
7506
0
  KeepLogSeg(recptr, &_logSegNo);
7507
0
  if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED | RS_INVAL_IDLE_TIMEOUT,
7508
0
                       _logSegNo, InvalidOid,
7509
0
                       InvalidTransactionId))
7510
0
  {
7511
    /*
7512
     * Some slots have been invalidated; recalculate the old-segment
7513
     * horizon, starting again from RedoRecPtr.
7514
     */
7515
0
    XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
7516
0
    KeepLogSeg(recptr, &_logSegNo);
7517
0
  }
7518
0
  _logSegNo--;
7519
0
  RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr,
7520
0
             checkPoint.ThisTimeLineID);
7521
7522
  /*
7523
   * Make more log segments if needed.  (Do this after recycling old log
7524
   * segments, since that may supply some of the needed files.)
7525
   */
7526
0
  if (!shutdown)
7527
0
    PreallocXlogFiles(recptr, checkPoint.ThisTimeLineID);
7528
7529
  /*
7530
   * Truncate pg_subtrans if possible.  We can throw away all data before
7531
   * the oldest XMIN of any running transaction.  No future transaction will
7532
   * attempt to reference any pg_subtrans entry older than that (see Asserts
7533
   * in subtrans.c).  During recovery, though, we mustn't do this because
7534
   * StartupSUBTRANS hasn't been called yet.
7535
   */
7536
0
  if (!RecoveryInProgress())
7537
0
    TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
7538
7539
  /* Real work is done; log and update stats. */
7540
0
  LogCheckpointEnd(false);
7541
7542
  /* Reset the process title */
7543
0
  update_checkpoint_display(flags, false, true);
7544
7545
0
  TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
7546
0
                   NBuffers,
7547
0
                   CheckpointStats.ckpt_segs_added,
7548
0
                   CheckpointStats.ckpt_segs_removed,
7549
0
                   CheckpointStats.ckpt_segs_recycled);
7550
7551
0
  return true;
7552
0
}
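
/*
 * [Illustrative sketch, not part of xlog.c] The shutdown branch above
 * advances the REDO pointer past the page header when the insert position
 * sits exactly on a page boundary.  The standalone program below mirrors
 * that arithmetic with assumed demo constants (8 kB pages, 16 MB segments,
 * made-up header sizes); the real values come from xlog_internal.h.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_BLCKSZ     8192u                  /* assumed XLOG page size */
#define DEMO_SEGSZ      (16u * 1024u * 1024u)  /* assumed WAL segment size */
#define DEMO_LONG_PHD   40u                    /* header on a segment's first page */
#define DEMO_SHORT_PHD  24u                    /* header on every other page */

static uint64_t
demo_skip_page_header(uint64_t insert_pos)
{
  if (insert_pos % DEMO_BLCKSZ == 0)           /* the freespace == 0 case */
  {
    if (insert_pos % DEMO_SEGSZ == 0)          /* also a segment boundary */
      insert_pos += DEMO_LONG_PHD;
    else
      insert_pos += DEMO_SHORT_PHD;
  }
  return insert_pos;
}

int
main(void)
{
  /* segment boundary gets the long header; a plain page boundary the short one */
  printf("%llu\n", (unsigned long long) demo_skip_page_header(DEMO_SEGSZ));
  printf("%llu\n", (unsigned long long) demo_skip_page_header(DEMO_SEGSZ + DEMO_BLCKSZ));
  return 0;
}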

/*
 * Mark the end of recovery in WAL, though without running a full checkpoint.
 * We can expect that a restartpoint is likely to be in progress as we
 * do this, though we are unwilling to wait for it to complete.
 *
 * CreateRestartPoint() allows for the case where recovery may end before
 * the restartpoint completes, so there is no concern of concurrent behaviour.
 */
static void
CreateEndOfRecoveryRecord(void)
{
  xl_end_of_recovery xlrec;
  XLogRecPtr  recptr;

  /* sanity check */
  if (!RecoveryInProgress())
    elog(ERROR, "can only be used to end recovery");

  xlrec.end_time = GetCurrentTimestamp();
  xlrec.wal_level = wal_level;

  WALInsertLockAcquireExclusive();
  xlrec.ThisTimeLineID = XLogCtl->InsertTimeLineID;
  xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
  WALInsertLockRelease();

  START_CRIT_SECTION();

  XLogBeginInsert();
  XLogRegisterData(&xlrec, sizeof(xl_end_of_recovery));
  recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);

  XLogFlush(recptr);

  /*
   * Update the control file so that crash recovery can follow the timeline
   * changes to this point.
   */
  LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  ControlFile->minRecoveryPoint = recptr;
  ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID;
  UpdateControlFile();
  LWLockRelease(ControlFileLock);

  END_CRIT_SECTION();
}
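
/*
 * [Illustrative sketch, not part of xlog.c] The end-of-recovery payload
 * built above is small: a timestamp, the timeline hand-off, and the
 * wal_level in effect.  A rough stand-in for its shape follows; the field
 * order and exact types here are assumptions, and the real
 * xl_end_of_recovery struct lives in the xlog headers.
 */
#include <stdint.h>

typedef struct demo_xl_end_of_recovery
{
  int64_t  end_time;         /* TimestampTz: microseconds since epoch */
  uint32_t ThisTimeLineID;   /* timeline we are switching to */
  uint32_t PrevTimeLineID;   /* timeline we recovered on */
  int      wal_level;        /* wal_level at the end of recovery */
} demo_xl_end_of_recovery;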

/*
 * Write an OVERWRITE_CONTRECORD message.
 *
 * When, during WAL replay, we expect a continuation record at the start of
 * a page but it is not there, recovery ends and WAL writing resumes at that
 * point.  But it's wrong to resume writing new WAL back at the start of the
 * record that was broken, because downstream consumers of that WAL (physical
 * replicas) are not prepared to "rewind".  So the first action after
 * finishing replay of all valid WAL must be to write a record of this type
 * at the point where the contrecord was missing; to support xlogreader
 * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
 * to the page header where the record occurs.  xlogreader has an ad-hoc
 * mechanism to report metadata about the broken record, which is what we
 * use here.
 *
 * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
 * skip the record it was reading, and pass back the LSN of the skipped
 * record, so that its caller can verify (on "replay" of that record) that the
 * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
 *
 * 'aborted_lsn' is the beginning position of the record that was incomplete.
 * It is included in the WAL record.  'pagePtr' and 'newTLI' point to the
 * beginning of the XLOG page where the record is to be inserted.  They must
 * match the current WAL insert position; they're passed here just so that we
 * can verify that.
 */
static XLogRecPtr
CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
                TimeLineID newTLI)
{
  xl_overwrite_contrecord xlrec;
  XLogRecPtr  recptr;
  XLogPageHeader pagehdr;
  XLogRecPtr  startPos;

  /* sanity checks */
  if (!RecoveryInProgress())
    elog(ERROR, "can only be used at end of recovery");
  if (pagePtr % XLOG_BLCKSZ != 0)
    elog(ERROR, "invalid position for missing continuation record %X/%X",
       LSN_FORMAT_ARGS(pagePtr));

  /* The current WAL insert position should be right after the page header */
  startPos = pagePtr;
  if (XLogSegmentOffset(startPos, wal_segment_size) == 0)
    startPos += SizeOfXLogLongPHD;
  else
    startPos += SizeOfXLogShortPHD;
  recptr = GetXLogInsertRecPtr();
  if (recptr != startPos)
    elog(ERROR, "invalid WAL insert position %X/%X for OVERWRITE_CONTRECORD",
       LSN_FORMAT_ARGS(recptr));

  START_CRIT_SECTION();

  /*
   * Initialize the XLOG page header (by GetXLogBuffer), and set the
   * XLP_FIRST_IS_OVERWRITE_CONTRECORD flag.
   *
   * No other backend is allowed to write WAL yet, so acquiring the WAL
   * insertion lock is just pro forma.
   */
  WALInsertLockAcquire();
  pagehdr = (XLogPageHeader) GetXLogBuffer(pagePtr, newTLI);
  pagehdr->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
  WALInsertLockRelease();

  /*
   * Insert the XLOG_OVERWRITE_CONTRECORD record as the first record on the
   * page.  We know it becomes the first record, because no other backend is
   * allowed to write WAL yet.
   */
  XLogBeginInsert();
  xlrec.overwritten_lsn = aborted_lsn;
  xlrec.overwrite_time = GetCurrentTimestamp();
  XLogRegisterData(&xlrec, sizeof(xl_overwrite_contrecord));
  recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);

  /* check that the record was inserted to the right place */
  if (ProcLastRecPtr != startPos)
    elog(ERROR, "OVERWRITE_CONTRECORD was inserted to unexpected position %X/%X",
       LSN_FORMAT_ARGS(ProcLastRecPtr));

  XLogFlush(recptr);

  END_CRIT_SECTION();

  return recptr;
}
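
/*
 * [Illustrative sketch, not part of xlog.c] A reader recognizes the
 * overwrite case through the page flag set above.  The standalone check
 * below uses an assumed flag value; the real
 * XLP_FIRST_IS_OVERWRITE_CONTRECORD bit is defined in xlog_internal.h.
 */
#include <stdbool.h>
#include <stdint.h>

#define DEMO_XLP_FIRST_IS_OVERWRITE_CONTRECORD 0x0008  /* assumed bit value */

typedef struct DemoPageHeader
{
  uint16_t xlp_info;    /* page flag bits */
} DemoPageHeader;

/* Does this page's first record overwrite a broken contrecord? */
static bool
demo_page_overwrites_contrecord(const DemoPageHeader *hdr)
{
  return (hdr->xlp_info & DEMO_XLP_FIRST_IS_OVERWRITE_CONTRECORD) != 0;
}

int
main(void)
{
  DemoPageHeader hdr = { DEMO_XLP_FIRST_IS_OVERWRITE_CONTRECORD };

  return demo_page_overwrites_contrecord(&hdr) ? 0 : 1;
}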

/*
 * Flush all data in shared memory to disk, and fsync
 *
 * This is the common code shared between regular checkpoints and
 * recovery restartpoints.
 */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
  CheckPointRelationMap();
  CheckPointReplicationSlots(flags & CHECKPOINT_IS_SHUTDOWN);
  CheckPointSnapBuild();
  CheckPointLogicalRewriteHeap();
  CheckPointReplicationOrigin();

  /* Write out all dirty data in SLRUs and the main buffer pool */
  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
  CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
  CheckPointCLOG();
  CheckPointCommitTs();
  CheckPointSUBTRANS();
  CheckPointMultiXact();
  CheckPointPredicate();
  CheckPointBuffers(flags);

  /* Perform all queued up fsyncs */
  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
  CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
  ProcessSyncRequests();
  CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
  TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();

  /* We deliberately delay 2PC checkpointing as long as possible */
  CheckPointTwoPhase(checkPointRedo);
}
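
/*
 * [Illustrative sketch, not part of xlog.c] CheckPointGuts() stamps
 * CheckpointStats at the start of the write and sync phases, and
 * LogCheckpointEnd() later turns those stamps into durations.  A simplified
 * stand-in for that bookkeeping, with plain microsecond counters in place
 * of TimestampTz:
 */
#include <stdint.h>
#include <stdio.h>

typedef struct DemoCkptStats
{
  int64_t ckpt_write_t;      /* start of the buffer-write phase */
  int64_t ckpt_sync_t;       /* start of the fsync phase */
  int64_t ckpt_sync_end_t;   /* end of the fsync phase */
} DemoCkptStats;

int
main(void)
{
  DemoCkptStats s = {1000000, 4500000, 4800000};   /* made-up stamps */

  printf("write: %lld us, sync: %lld us\n",
       (long long) (s.ckpt_sync_t - s.ckpt_write_t),
       (long long) (s.ckpt_sync_end_t - s.ckpt_sync_t));
  return 0;
}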

/*
 * Save a checkpoint for recovery restart if appropriate
 *
 * This function is called each time a checkpoint record is read from XLOG.
 * It must determine whether the checkpoint represents a safe restartpoint or
 * not.  If so, the checkpoint record is stashed in shared memory so that
 * CreateRestartPoint can consult it.  (Note that the latter function is
 * executed by the checkpointer, while this one will be executed by the
 * startup process.)
 */
static void
RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record)
{
  /*
   * Also refrain from creating a restartpoint if we have seen any
   * references to non-existent pages. Restarting recovery from the
   * restartpoint would not see the references, so we would lose the
   * cross-check that the pages belonged to a relation that was dropped
   * later.
   */
  if (XLogHaveInvalidPages())
  {
    elog(DEBUG2,
       "could not record restart point at %X/%X because there "
       "are unresolved references to invalid pages",
       LSN_FORMAT_ARGS(checkPoint->redo));
    return;
  }

  /*
   * Copy the checkpoint record to shared memory, so that checkpointer can
   * work out the next time it wants to perform a restartpoint.
   */
  SpinLockAcquire(&XLogCtl->info_lck);
  XLogCtl->lastCheckPointRecPtr = record->ReadRecPtr;
  XLogCtl->lastCheckPointEndPtr = record->EndRecPtr;
  XLogCtl->lastCheckPoint = *checkPoint;
  SpinLockRelease(&XLogCtl->info_lck);
}
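
/*
 * [Illustrative sketch, not part of xlog.c] The stash above is a
 * single-writer (startup process) / single-reader (checkpointer) hand-off
 * of a small struct under a short-lived lock.  The standalone program below
 * shows the same pattern with a pthread mutex standing in for info_lck.
 */
#include <pthread.h>
#include <stdint.h>

typedef struct DemoCheckPoint
{
  uint64_t redo;     /* stand-in for CheckPoint.redo */
  uint32_t tli;      /* stand-in for CheckPoint.ThisTimeLineID */
} DemoCheckPoint;

static pthread_mutex_t demo_lck = PTHREAD_MUTEX_INITIALIZER;
static DemoCheckPoint demo_last_ckpt;
static uint64_t demo_last_ckpt_rec;

/* startup-process side: publish the checkpoint just replayed */
static void
demo_stash(uint64_t rec_ptr, const DemoCheckPoint *ckpt)
{
  pthread_mutex_lock(&demo_lck);
  demo_last_ckpt_rec = rec_ptr;
  demo_last_ckpt = *ckpt;
  pthread_mutex_unlock(&demo_lck);
}

/* checkpointer side: take a consistent local copy before acting on it */
static void
demo_fetch(uint64_t *rec_ptr, DemoCheckPoint *ckpt)
{
  pthread_mutex_lock(&demo_lck);
  *rec_ptr = demo_last_ckpt_rec;
  *ckpt = demo_last_ckpt;
  pthread_mutex_unlock(&demo_lck);
}

int
main(void)
{
  DemoCheckPoint in = {12345, 2}, out;
  uint64_t rec;

  demo_stash(99999, &in);
  demo_fetch(&rec, &out);
  return (rec == 99999 && out.redo == 12345) ? 0 : 1;
}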

/*
 * Establish a restartpoint if possible.
 *
 * This is similar to CreateCheckPoint, but is used during WAL recovery
 * to establish a point from which recovery can roll forward without
 * replaying the entire recovery log.
 *
 * Returns true if a new restartpoint was established. We can only establish
 * a restartpoint if we have replayed a safe checkpoint record since the last
 * restartpoint.
 */
bool
CreateRestartPoint(int flags)
{
  XLogRecPtr  lastCheckPointRecPtr;
  XLogRecPtr  lastCheckPointEndPtr;
  CheckPoint  lastCheckPoint;
  XLogRecPtr  PriorRedoPtr;
  XLogRecPtr  receivePtr;
  XLogRecPtr  replayPtr;
  TimeLineID  replayTLI;
  XLogRecPtr  endptr;
  XLogSegNo _logSegNo;
  TimestampTz xtime;

  /* Concurrent checkpoint/restartpoint cannot happen */
  Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);

  /* Get a local copy of the last safe checkpoint record. */
  SpinLockAcquire(&XLogCtl->info_lck);
  lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
  lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
  lastCheckPoint = XLogCtl->lastCheckPoint;
  SpinLockRelease(&XLogCtl->info_lck);

  /*
   * Check that we're still in recovery mode. It's OK if we exit recovery
   * mode after this check; the restart point is valid anyway.
   */
  if (!RecoveryInProgress())
  {
    ereport(DEBUG2,
        (errmsg_internal("skipping restartpoint, recovery has already ended")));
    return false;
  }

  /*
   * If the last checkpoint record we've replayed is already our last
   * restartpoint, we can't perform a new restart point. We still update
   * minRecoveryPoint in that case, so that if this is a shutdown restart
   * point, we won't start up earlier than before. That's not strictly
   * necessary, but when hot standby is enabled, it would be rather weird if
   * the database opened up for read-only connections at a point-in-time
   * before the last shutdown. Such time travel is still possible in case of
   * immediate shutdown, though.
   *
   * We don't explicitly advance minRecoveryPoint when we do create a
   * restartpoint. It's assumed that flushing the buffers will do that as a
   * side-effect.
   */
  if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
    lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
  {
    ereport(DEBUG2,
        (errmsg_internal("skipping restartpoint, already performed at %X/%X",
                 LSN_FORMAT_ARGS(lastCheckPoint.redo))));

    UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
    if (flags & CHECKPOINT_IS_SHUTDOWN)
    {
      LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
      ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
      UpdateControlFile();
      LWLockRelease(ControlFileLock);
    }
    return false;
  }

  /*
   * Update the shared RedoRecPtr so that the startup process can calculate
   * the number of segments replayed since the last restartpoint, and request
   * a restartpoint if it exceeds CheckPointSegments.
   *
   * Like in CreateCheckPoint(), hold off insertions to update it, although
   * during recovery this is just pro forma, because no WAL insertions are
   * happening.
   */
  WALInsertLockAcquireExclusive();
  RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
  WALInsertLockRelease();

  /* Also update the info_lck-protected copy */
  SpinLockAcquire(&XLogCtl->info_lck);
  XLogCtl->RedoRecPtr = lastCheckPoint.redo;
  SpinLockRelease(&XLogCtl->info_lck);

  /*
   * Prepare to accumulate statistics.
   *
   * Note: because it is possible for log_checkpoints to change while a
   * checkpoint proceeds, we always accumulate stats, even if
   * log_checkpoints is currently off.
   */
  MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
  CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

  if (log_checkpoints)
    LogCheckpointStart(flags, true);

  /* Update the process title */
  update_checkpoint_display(flags, true, false);

  CheckPointGuts(lastCheckPoint.redo, flags);

  /*
   * This location needs to be after CheckPointGuts() to ensure that some
   * work has already happened during this checkpoint.
   */
  INJECTION_POINT("create-restart-point", NULL);

  /*
   * Remember the prior checkpoint's redo ptr for
   * UpdateCheckPointDistanceEstimate()
   */
  PriorRedoPtr = ControlFile->checkPointCopy.redo;

  /*
   * Update pg_control, using current time.  Check that it still shows an
   * older checkpoint, else do nothing; this is a quick hack to make sure
   * nothing really bad happens if somehow we get here after the
   * end-of-recovery checkpoint.
   */
  LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
  {
    /*
     * Update the checkpoint information.  We do this even if the cluster
     * does not show DB_IN_ARCHIVE_RECOVERY to match with the set of WAL
     * segments recycled below.
     */
    ControlFile->checkPoint = lastCheckPointRecPtr;
    ControlFile->checkPointCopy = lastCheckPoint;

    /*
     * Ensure minRecoveryPoint is past the checkpoint record and update it
     * if the control file still shows DB_IN_ARCHIVE_RECOVERY.  Normally,
     * this will have happened already while writing out dirty buffers,
     * but not necessarily - e.g. because no buffers were dirtied.  We do
     * this because a backup performed in recovery uses minRecoveryPoint
     * to determine which WAL files must be included in the backup, and
     * the file (or files) containing the checkpoint record must be
     * included, at a minimum.  Note that for an ordinary restart of
     * recovery there's no value in having the minimum recovery point any
     * earlier than this anyway, because redo will begin just after the
     * checkpoint record.
     */
    if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
    {
      if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
      {
        ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
        ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;

        /* update local copy */
        LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
        LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
      }
      if (flags & CHECKPOINT_IS_SHUTDOWN)
        ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
    }
    UpdateControlFile();
  }
  LWLockRelease(ControlFileLock);

  /*
   * Update the average distance between checkpoints/restartpoints if the
   * prior checkpoint exists.
   */
  if (PriorRedoPtr != InvalidXLogRecPtr)
    UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);

  /*
   * Delete old log files, those no longer needed for the last restartpoint,
   * to prevent the disk holding the xlog from growing full.
   */
  XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);

  /*
   * Retreat _logSegNo using the current end of xlog replayed or received,
   * whichever is later.
   */
  receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
  replayPtr = GetXLogReplayRecPtr(&replayTLI);
  endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
  KeepLogSeg(endptr, &_logSegNo);
  if (InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_REMOVED | RS_INVAL_IDLE_TIMEOUT,
                       _logSegNo, InvalidOid,
                       InvalidTransactionId))
  {
    /*
     * Some slots have been invalidated; recalculate the old-segment
     * horizon, starting again from RedoRecPtr.
     */
    XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
    KeepLogSeg(endptr, &_logSegNo);
  }
  _logSegNo--;

  /*
   * Try to recycle segments on a useful timeline. If we've been promoted
   * since the beginning of this restartpoint, use the new timeline chosen
   * at end of recovery.  If we're still in recovery, use the timeline we're
   * currently replaying.
   *
   * There is no guarantee that the WAL segments will be useful on the
   * current timeline; if recovery proceeds to a new timeline right after
   * this, the pre-allocated WAL segments on this timeline will not be used,
   * and will go wasted until recycled on the next restartpoint. We'll live
   * with that.
   */
  if (!RecoveryInProgress())
    replayTLI = XLogCtl->InsertTimeLineID;

  RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr, replayTLI);

  /*
   * Make more log segments if needed.  (Do this after recycling old log
   * segments, since that may supply some of the needed files.)
   */
  PreallocXlogFiles(endptr, replayTLI);

  /*
   * Truncate pg_subtrans if possible.  We can throw away all data before
   * the oldest XMIN of any running transaction.  No future transaction will
   * attempt to reference any pg_subtrans entry older than that (see Asserts
   * in subtrans.c).  When hot standby is disabled, though, we mustn't do
   * this because StartupSUBTRANS hasn't been called yet.
   */
  if (EnableHotStandby)
    TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());

  /* Real work is done; log and update stats. */
  LogCheckpointEnd(true);

  /* Reset the process title */
  update_checkpoint_display(flags, true, true);

  xtime = GetLatestXTime();
  ereport((log_checkpoints ? LOG : DEBUG2),
      (errmsg("recovery restart point at %X/%X",
          LSN_FORMAT_ARGS(lastCheckPoint.redo)),
       xtime ? errdetail("Last completed transaction was at log time %s.",
                 timestamptz_to_str(xtime)) : 0));

  /*
   * Finally, execute archive_cleanup_command, if any.
   */
  if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
    ExecuteRecoveryCommand(archiveCleanupCommand,
                 "archive_cleanup_command",
                 false,
                 WAIT_EVENT_ARCHIVE_CLEANUP_COMMAND);

  return true;
}
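
/*
 * [Illustrative sketch, not part of xlog.c] The retention horizon above is
 * keyed to whichever of the received and replayed WAL positions is further
 * along, so segments still needed by either side survive.  In isolation:
 */
#include <stdint.h>

/* later of the flushed-receive and replay positions, as fed to KeepLogSeg() */
static inline uint64_t
demo_recovery_endptr(uint64_t receive_ptr, uint64_t replay_ptr)
{
  return (receive_ptr < replay_ptr) ? replay_ptr : receive_ptr;
}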

/*
 * Report availability of WAL for the given target LSN
 *    (typically a slot's restart_lsn)
 *
 * Returns one of the following enum values:
 *
 * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
 *   max_wal_size.
 *
 * * WALAVAIL_EXTENDED means it is still available by preserving extra
 *   segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
 *   than max_wal_size, this state is not returned.
 *
 * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
 *   remove reserved segments. The walsender using this slot may return to the
 *   above.
 *
 * * WALAVAIL_REMOVED means it has been removed. A replication stream on
 *   a slot with this LSN cannot continue.  (Any associated walsender
 *   processes should have been terminated already.)
 *
 * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
 */
WALAvailability
GetWALAvailability(XLogRecPtr targetLSN)
{
  XLogRecPtr  currpos;    /* current write LSN */
  XLogSegNo currSeg;    /* segid of currpos */
  XLogSegNo targetSeg;    /* segid of targetLSN */
  XLogSegNo oldestSeg;    /* actual oldest segid */
  XLogSegNo oldestSegMaxWalSize;  /* oldest segid kept by max_wal_size */
  XLogSegNo oldestSlotSeg;  /* oldest segid kept by slot */
  uint64    keepSegs;

  /*
   * slot does not reserve WAL. Either deactivated, or has never been active
   */
  if (XLogRecPtrIsInvalid(targetLSN))
    return WALAVAIL_INVALID_LSN;

  /*
   * Calculate the oldest segment currently reserved by all slots,
   * considering wal_keep_size and max_slot_wal_keep_size.  Initialize
   * oldestSlotSeg to the current segment.
   */
  currpos = GetXLogWriteRecPtr();
  XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
  KeepLogSeg(currpos, &oldestSlotSeg);

  /*
   * Find the oldest extant segment file. We get 1 until a checkpoint
   * removes the first WAL segment file since startup; this can make the
   * status wrong under certain abnormal conditions, but it does no actual
   * harm.
   */
  oldestSeg = XLogGetLastRemovedSegno() + 1;

  /* calculate oldest segment by max_wal_size */
  XLByteToSeg(currpos, currSeg, wal_segment_size);
  keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;

  if (currSeg > keepSegs)
    oldestSegMaxWalSize = currSeg - keepSegs;
  else
    oldestSegMaxWalSize = 1;

  /* the segment we care about */
  XLByteToSeg(targetLSN, targetSeg, wal_segment_size);

  /*
   * No point in returning reserved or extended status values if the
   * targetSeg is known to be lost.
   */
  if (targetSeg >= oldestSlotSeg)
  {
    /* show "reserved" when targetSeg is within max_wal_size */
    if (targetSeg >= oldestSegMaxWalSize)
      return WALAVAIL_RESERVED;

    /* being retained by slots exceeding max_wal_size */
    return WALAVAIL_EXTENDED;
  }

  /* WAL segments are no longer retained but haven't been removed yet */
  if (targetSeg >= oldestSeg)
    return WALAVAIL_UNRESERVED;

  /* Definitely lost */
  return WALAVAIL_REMOVED;
}
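
/*
 * [Illustrative sketch, not part of xlog.c] The "reserved within
 * max_wal_size" cutoff above is plain segment arithmetic.  The standalone
 * program below redoes it with assumed units (sizes in MB, 1-based segment
 * numbers); the real conversion is ConvertToXSegs(), which rounds slightly
 * differently depending on the segment size.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
demo_oldest_seg_max_wal_size(uint64_t curr_seg, int max_wal_size_mb,
               int wal_segment_mb)
{
  uint64_t keep_segs = (uint64_t) (max_wal_size_mb / wal_segment_mb) + 1;

  return (curr_seg > keep_segs) ? curr_seg - keep_segs : 1;
}

int
main(void)
{
  /* e.g. max_wal_size = 1 GB with 16 MB segments keeps 65 segments,
   * so from segment 100 the oldest "reserved" segment is 35 */
  printf("%llu\n",
       (unsigned long long) demo_oldest_seg_max_wal_size(100, 1024, 16));
  return 0;
}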

/*
 * Retreat *logSegNo to the last segment that we need to retain because of
 * either wal_keep_size or replication slots.
 *
 * This is calculated by subtracting wal_keep_size from the given xlog
 * location, recptr, and by making sure that the result also satisfies the
 * requirements of replication slots.  For the latter criterion we do
 * consider the effects of max_slot_wal_keep_size: reserve at most that much
 * space back from recptr.
 *
 * Note about replication slots: if this function calculates a value
 * that's further ahead than what slots need reserved, then affected
 * slots need to be invalidated and this function invoked again.
 * XXX it might be a good idea to rewrite this function so that
 * invalidation is optionally done here, instead.
 */
static void
KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
{
  XLogSegNo currSegNo;
  XLogSegNo segno;
  XLogRecPtr  keep;

  XLByteToSeg(recptr, currSegNo, wal_segment_size);
  segno = currSegNo;

  /*
   * Calculate how many segments are kept by slots first, adjusting for
   * max_slot_wal_keep_size.
   */
  keep = XLogGetReplicationSlotMinimumLSN();
  if (keep != InvalidXLogRecPtr && keep < recptr)
  {
    XLByteToSeg(keep, segno, wal_segment_size);

    /* Cap by max_slot_wal_keep_size ... */
    if (max_slot_wal_keep_size_mb >= 0)
    {
      uint64    slot_keep_segs;

      slot_keep_segs =
        ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);

      if (currSegNo - segno > slot_keep_segs)
        segno = currSegNo - slot_keep_segs;
    }
  }

  /*
   * If WAL summarization is in use, don't remove WAL that has yet to be
   * summarized.
   */
  keep = GetOldestUnsummarizedLSN(NULL, NULL);
  if (keep != InvalidXLogRecPtr)
  {
    XLogSegNo unsummarized_segno;

    XLByteToSeg(keep, unsummarized_segno, wal_segment_size);
    if (unsummarized_segno < segno)
      segno = unsummarized_segno;
  }

  /* but keep at least wal_keep_size if that's set */
  if (wal_keep_size_mb > 0)
  {
    uint64    keep_segs;

    keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
    if (currSegNo - segno < keep_segs)
    {
      /* avoid underflow, don't go below 1 */
      if (currSegNo <= keep_segs)
        segno = 1;
      else
        segno = currSegNo - keep_segs;
    }
  }

  /* don't delete WAL segments newer than the calculated segment */
  if (segno < *logSegNo)
    *logSegNo = segno;
}
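
/*
 * [Illustrative sketch, not part of xlog.c] KeepLogSeg() applies two
 * independent bounds: slots may hold back at most max_slot_wal_keep_size
 * of WAL, while wal_keep_size forces a minimum retention regardless of
 * slots.  The standalone clamp below repeats that logic on bare segment
 * numbers (1-based, sizes already converted to segment counts).
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
demo_keep_log_seg(uint64_t curr_seg, uint64_t slot_min_seg,
          int64_t max_slot_keep_segs, uint64_t wal_keep_segs)
{
  uint64_t segno = slot_min_seg;

  /* cap how far back slots may reach (negative means "no cap") */
  if (max_slot_keep_segs >= 0 &&
    curr_seg - segno > (uint64_t) max_slot_keep_segs)
    segno = curr_seg - (uint64_t) max_slot_keep_segs;

  /* but keep at least wal_keep_segs, without going below segment 1 */
  if (curr_seg - segno < wal_keep_segs)
    segno = (curr_seg <= wal_keep_segs) ? 1 : curr_seg - wal_keep_segs;

  return segno;
}

int
main(void)
{
  /* a slot wants segment 20 kept, but the cap allows only 40 segments
   * back from segment 100, so the horizon lands on segment 60 */
  printf("%llu\n",
       (unsigned long long) demo_keep_log_seg(100, 20, 40, 10));
  return 0;
}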

/*
 * Write a NEXTOID log record
 */
void
XLogPutNextOid(Oid nextOid)
{
  XLogBeginInsert();
  XLogRegisterData(&nextOid, sizeof(Oid));
  (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);

  /*
   * We need not flush the NEXTOID record immediately, because any of the
   * just-allocated OIDs could only reach disk as part of a tuple insert or
   * update that would have its own XLOG record that must follow the NEXTOID
   * record.  Therefore, the standard buffer LSN interlock applied to those
   * records will ensure no such OID reaches disk before the NEXTOID record
   * does.
   *
   * Note, however, that the above statement only covers state "within" the
   * database.  When we use a generated OID as a file or directory name, we
   * are in a sense violating the basic WAL rule, because that filesystem
   * change may reach disk before the NEXTOID WAL record does.  The impact
   * of this is that if a database crash occurs immediately afterward, we
   * might after restart re-generate the same OID and find that it conflicts
   * with the leftover file or directory.  But since for safety's sake we
   * always loop until finding a nonconflicting filename, this poses no real
   * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
   */
}

/*
 * Write an XLOG SWITCH record.
 *
 * Here we just blindly issue an XLogInsert request for the record.
 * All the magic happens inside XLogInsert.
 *
 * The return value is either the end+1 address of the switch record,
 * or the end+1 address of the prior segment if we did not need to
 * write a switch record because we are already at segment start.
 */
XLogRecPtr
RequestXLogSwitch(bool mark_unimportant)
{
  XLogRecPtr  RecPtr;

  /* XLOG SWITCH has no data */
  XLogBeginInsert();

  if (mark_unimportant)
    XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
  RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);

  return RecPtr;
}

/*
 * Write a RESTORE POINT record
 */
XLogRecPtr
XLogRestorePoint(const char *rpName)
{
  XLogRecPtr  RecPtr;
  xl_restore_point xlrec;

  xlrec.rp_time = GetCurrentTimestamp();
  strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);

  XLogBeginInsert();
  XLogRegisterData(&xlrec, sizeof(xl_restore_point));

  RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);

  ereport(LOG,
      (errmsg("restore point \"%s\" created at %X/%X",
          rpName, LSN_FORMAT_ARGS(RecPtr))));

  return RecPtr;
}

/*
 * Check if any of the GUC parameters that are critical for hot standby
 * have changed, and update the value in the pg_control file if necessary.
 */
static void
XLogReportParameters(void)
{
  if (wal_level != ControlFile->wal_level ||
    wal_log_hints != ControlFile->wal_log_hints ||
    MaxConnections != ControlFile->MaxConnections ||
    max_worker_processes != ControlFile->max_worker_processes ||
    max_wal_senders != ControlFile->max_wal_senders ||
    max_prepared_xacts != ControlFile->max_prepared_xacts ||
    max_locks_per_xact != ControlFile->max_locks_per_xact ||
    track_commit_timestamp != ControlFile->track_commit_timestamp)
  {
    /*
     * The change in number of backend slots doesn't need to be WAL-logged
     * if archiving is not enabled, as you can't start archive recovery
     * with wal_level=minimal anyway. We don't really care about the
     * values in pg_control either if wal_level=minimal, but seems better
     * to keep them up-to-date to avoid confusion.
     */
    if (wal_level != ControlFile->wal_level || XLogIsNeeded())
    {
      xl_parameter_change xlrec;
      XLogRecPtr  recptr;

      xlrec.MaxConnections = MaxConnections;
      xlrec.max_worker_processes = max_worker_processes;
      xlrec.max_wal_senders = max_wal_senders;
      xlrec.max_prepared_xacts = max_prepared_xacts;
      xlrec.max_locks_per_xact = max_locks_per_xact;
      xlrec.wal_level = wal_level;
      xlrec.wal_log_hints = wal_log_hints;
      xlrec.track_commit_timestamp = track_commit_timestamp;

      XLogBeginInsert();
      XLogRegisterData(&xlrec, sizeof(xlrec));

      recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
      XLogFlush(recptr);
    }

    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

    ControlFile->MaxConnections = MaxConnections;
    ControlFile->max_worker_processes = max_worker_processes;
    ControlFile->max_wal_senders = max_wal_senders;
    ControlFile->max_prepared_xacts = max_prepared_xacts;
    ControlFile->max_locks_per_xact = max_locks_per_xact;
    ControlFile->wal_level = wal_level;
    ControlFile->wal_log_hints = wal_log_hints;
    ControlFile->track_commit_timestamp = track_commit_timestamp;
    UpdateControlFile();

    LWLockRelease(ControlFileLock);
  }
}
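
/*
 * [Illustrative sketch, not part of xlog.c] The function above follows a
 * detect-then-record pattern: compare each hot-standby-relevant setting
 * against its pg_control copy, WAL-log the new values if anything differs,
 * then refresh pg_control.  A cut-down stand-in for the detect step:
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct DemoParams
{
  int  max_connections;
  int  wal_level;
  bool wal_log_hints;
} DemoParams;

static bool
demo_params_changed(const DemoParams *now, const DemoParams *control)
{
  return now->max_connections != control->max_connections ||
       now->wal_level != control->wal_level ||
       now->wal_log_hints != control->wal_log_hints;
}

int
main(void)
{
  DemoParams control = {100, 1, false};
  DemoParams now = {200, 1, false};    /* max_connections was raised */

  printf("changed: %d\n", demo_params_changed(&now, &control));
  return 0;
}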

/*
 * Update full_page_writes in shared memory, and write an
 * XLOG_FPW_CHANGE record if necessary.
 *
 * Note: this function assumes there is no other process running
 * concurrently that could update it.
 */
void
UpdateFullPageWrites(void)
{
  XLogCtlInsert *Insert = &XLogCtl->Insert;
  bool    recoveryInProgress;

  /*
   * Do nothing if full_page_writes has not been changed.
   *
   * It's safe to check the shared full_page_writes without the lock,
   * because we assume that there is no concurrently running process which
   * can update it.
   */
  if (fullPageWrites == Insert->fullPageWrites)
    return;

  /*
   * Perform this outside critical section so that the WAL insert
   * initialization done by RecoveryInProgress() doesn't trigger an
   * assertion failure.
   */
  recoveryInProgress = RecoveryInProgress();

  START_CRIT_SECTION();

  /*
   * It's always safe to take full page images, even when not strictly
   * required, but not the other way round. So if we're setting
   * full_page_writes to true, first set it true and then write the WAL
   * record. If we're setting it to false, first write the WAL record and
   * then set the global flag.
   */
  if (fullPageWrites)
  {
    WALInsertLockAcquireExclusive();
    Insert->fullPageWrites = true;
    WALInsertLockRelease();
  }

  /*
   * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
   * full_page_writes during archive recovery, if required.
   */
  if (XLogStandbyInfoActive() && !recoveryInProgress)
  {
    XLogBeginInsert();
    XLogRegisterData(&fullPageWrites, sizeof(bool));

    XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
  }

  if (!fullPageWrites)
  {
    WALInsertLockAcquireExclusive();
    Insert->fullPageWrites = false;
    WALInsertLockRelease();
  }
  END_CRIT_SECTION();
}
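
/*
 * [Illustrative sketch, not part of xlog.c] The enable/disable asymmetry
 * above (flag before record when enabling, record before flag when
 * disabling) suits any flag whose "safe" direction is to over-apply:
 * taking an unnecessary full-page image is harmless, while skipping a
 * necessary one is not.  The standalone program below repeats the ordering
 * with a hypothetical logging stub in place of the WAL machinery.
 */
#include <stdbool.h>
#include <stdio.h>

static bool demo_shared_fpw = false;   /* stands in for Insert->fullPageWrites */

/* hypothetical stand-in for emitting an XLOG_FPW_CHANGE record */
static void
demo_log_fpw_change(bool newval)
{
  printf("logged fpw=%d while shared flag is %d\n", newval, demo_shared_fpw);
}

static void
demo_update_fpw(bool newval)
{
  if (newval)
    demo_shared_fpw = true;     /* enable first: extra FPIs are harmless */

  demo_log_fpw_change(newval);  /* then make the change durable */

  if (!newval)
    demo_shared_fpw = false;    /* disable only after the record exists */
}

int
main(void)
{
  demo_update_fpw(true);
  demo_update_fpw(false);
  return 0;
}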

/*
 * XLOG resource manager's routines
 *
 * Definitions of info values are in include/catalog/pg_control.h, though
 * not all record types are related to control file updates.
 *
 * NOTE: Some XLOG record types that are directly related to WAL recovery
 * are handled in xlogrecovery_redo().
 */
void
xlog_redo(XLogReaderState *record)
{
  uint8   info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
  XLogRecPtr  lsn = record->EndRecPtr;

  /*
   * In XLOG rmgr, backup blocks are only used by XLOG_FPI and
   * XLOG_FPI_FOR_HINT records.
   */
  Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
       !XLogRecHasAnyBlockRefs(record));

  if (info == XLOG_NEXTOID)
  {
    Oid     nextOid;

    /*
     * We used to try to take the maximum of TransamVariables->nextOid and
     * the recorded nextOid, but that fails if the OID counter wraps
     * around.  Since no OID allocation should be happening during replay
     * anyway, better to just believe the record exactly.  We still take
     * OidGenLock while setting the variable, just in case.
     */
    memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
    LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    TransamVariables->nextOid = nextOid;
    TransamVariables->oidCount = 0;
    LWLockRelease(OidGenLock);
  }
  else if (info == XLOG_CHECKPOINT_SHUTDOWN)
  {
    CheckPoint  checkPoint;
    TimeLineID  replayTLI;

    memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    /* In a SHUTDOWN checkpoint, believe the counters exactly */
    LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    TransamVariables->nextXid = checkPoint.nextXid;
    LWLockRelease(XidGenLock);
    LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
    TransamVariables->nextOid = checkPoint.nextOid;
    TransamVariables->oidCount = 0;
    LWLockRelease(OidGenLock);
    MultiXactSetNextMXact(checkPoint.nextMulti,
                checkPoint.nextMultiOffset);

    MultiXactAdvanceOldest(checkPoint.oldestMulti,
                 checkPoint.oldestMultiDB);

    /*
     * No need to set oldestClogXid here as well; it'll be set when we
     * redo an xl_clog_truncate if it changed since initialization.
     */
    SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);

    /*
     * If we see a shutdown checkpoint while waiting for an end-of-backup
     * record, the backup was canceled and the end-of-backup record will
     * never arrive.
     */
    if (ArchiveRecoveryRequested &&
      !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
      XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
      ereport(PANIC,
          (errmsg("online backup was canceled, recovery cannot continue")));

    /*
     * If we see a shutdown checkpoint, we know that nothing was running
     * on the primary at this point. So fake-up an empty running-xacts
     * record and use that here and now. Recover additional standby state
     * for prepared transactions.
     */
    if (standbyState >= STANDBY_INITIALIZED)
    {
      TransactionId *xids;
      int     nxids;
      TransactionId oldestActiveXID;
      TransactionId latestCompletedXid;
      RunningTransactionsData running;

      oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);

      /* Update pg_subtrans entries for any prepared transactions */
      StandbyRecoverPreparedTransactions();

      /*
       * Construct a RunningTransactions snapshot representing a shut
       * down server, with only prepared transactions still alive. We're
       * never overflowed at this point because all subxids are listed
       * with their parent prepared transactions.
       */
      running.xcnt = nxids;
      running.subxcnt = 0;
      running.subxid_status = SUBXIDS_IN_SUBTRANS;
      running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
      running.oldestRunningXid = oldestActiveXID;
      latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
      TransactionIdRetreat(latestCompletedXid);
      Assert(TransactionIdIsNormal(latestCompletedXid));
      running.latestCompletedXid = latestCompletedXid;
      running.xids = xids;

      ProcArrayApplyRecoveryInfo(&running);
    }

    /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
    LWLockRelease(ControlFileLock);

    /* Update shared-memory copy of checkpoint XID/epoch */
    SpinLockAcquire(&XLogCtl->info_lck);
    XLogCtl->ckptFullXid = checkPoint.nextXid;
    SpinLockRelease(&XLogCtl->info_lck);

    /*
     * We should've already switched to the new TLI before replaying this
     * record.
     */
    (void) GetCurrentReplayRecPtr(&replayTLI);
    if (checkPoint.ThisTimeLineID != replayTLI)
      ereport(PANIC,
          (errmsg("unexpected timeline ID %u (should be %u) in shutdown checkpoint record",
              checkPoint.ThisTimeLineID, replayTLI)));

    RecoveryRestartPoint(&checkPoint, record);
  }
  else if (info == XLOG_CHECKPOINT_ONLINE)
  {
    CheckPoint  checkPoint;
    TimeLineID  replayTLI;

    memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
    /* In an ONLINE checkpoint, treat the XID counter as a minimum */
    LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
    if (FullTransactionIdPrecedes(TransamVariables->nextXid,
                    checkPoint.nextXid))
      TransamVariables->nextXid = checkPoint.nextXid;
    LWLockRelease(XidGenLock);

    /*
     * We ignore the nextOid counter in an ONLINE checkpoint, preferring
     * to track OID assignment through XLOG_NEXTOID records.  The nextOid
     * counter is from the start of the checkpoint and might well be stale
     * compared to later XLOG_NEXTOID records.  We could try to take the
     * maximum of the nextOid counter and our latest value, but since
     * there's no particular guarantee about the speed with which the OID
     * counter wraps around, that's a risky thing to do.  In any case,
     * users of the nextOid counter are required to avoid assignment of
     * duplicates, so that a somewhat out-of-date value should be safe.
     */

    /* Handle multixact */
    MultiXactAdvanceNextMXact(checkPoint.nextMulti,
                  checkPoint.nextMultiOffset);

    /*
     * NB: This may perform multixact truncation when replaying WAL
     * generated by an older primary.
     */
    MultiXactAdvanceOldest(checkPoint.oldestMulti,
                 checkPoint.oldestMultiDB);
    if (TransactionIdPrecedes(TransamVariables->oldestXid,
                  checkPoint.oldestXid))
      SetTransactionIdLimit(checkPoint.oldestXid,
                  checkPoint.oldestXidDB);
    /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
    ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
    LWLockRelease(ControlFileLock);

    /* Update shared-memory copy of checkpoint XID/epoch */
    SpinLockAcquire(&XLogCtl->info_lck);
    XLogCtl->ckptFullXid = checkPoint.nextXid;
    SpinLockRelease(&XLogCtl->info_lck);

    /* TLI should not change in an on-line checkpoint */
    (void) GetCurrentReplayRecPtr(&replayTLI);
    if (checkPoint.ThisTimeLineID != replayTLI)
      ereport(PANIC,
          (errmsg("unexpected timeline ID %u (should be %u) in online checkpoint record",
              checkPoint.ThisTimeLineID, replayTLI)));

    RecoveryRestartPoint(&checkPoint, record);
  }
  else if (info == XLOG_OVERWRITE_CONTRECORD)
  {
    /* nothing to do here, handled in xlogrecovery_redo() */
  }
  else if (info == XLOG_END_OF_RECOVERY)
  {
    xl_end_of_recovery xlrec;
    TimeLineID  replayTLI;

    memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));

    /*
     * For Hot Standby, we could treat this like a Shutdown Checkpoint,
     * but this case is rarer and harder to test, so the benefit doesn't
     * outweigh the potential extra cost of maintenance.
     */

    /*
     * We should've already switched to the new TLI before replaying this
     * record.
     */
    (void) GetCurrentReplayRecPtr(&replayTLI);
    if (xlrec.ThisTimeLineID != replayTLI)
      ereport(PANIC,
          (errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
              xlrec.ThisTimeLineID, replayTLI)));
  }
  else if (info == XLOG_NOOP)
  {
    /* nothing to do here */
  }
  else if (info == XLOG_SWITCH)
  {
    /* nothing to do here */
  }
  else if (info == XLOG_RESTORE_POINT)
  {
    /* nothing to do here, handled in xlogrecovery.c */
  }
  else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
  {
    /*
     * XLOG_FPI records contain nothing else but one or more block
     * references. Every block reference must include a full-page image
     * even if full_page_writes was disabled when the record was generated
     * - otherwise there would be no point in this record.
     *
     * XLOG_FPI_FOR_HINT records are generated when a page needs to be
     * WAL-logged because of a hint bit update. They are only generated
     * when checksums and/or wal_log_hints are enabled. They may include
     * no full-page images if full_page_writes was disabled when they were
     * generated. In this case there is nothing to do here.
     *
     * No recovery conflicts are generated by these generic records - if a
     * resource manager needs to generate conflicts, it has to define a
     * separate WAL record type and redo routine.
     */
    for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
    {
      Buffer    buffer;

      if (!XLogRecHasBlockImage(record, block_id))
      {
        if (info == XLOG_FPI)
          elog(ERROR, "XLOG_FPI record did not contain a full-page image");
8673
0
        continue;
8674
0
      }
8675
8676
0
      if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
8677
0
        elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
8678
0
      UnlockReleaseBuffer(buffer);
8679
0
    }
8680
0
  }
8681
0
  else if (info == XLOG_BACKUP_END)
8682
0
  {
8683
    /* nothing to do here, handled in xlogrecovery_redo() */
8684
0
  }
8685
0
  else if (info == XLOG_PARAMETER_CHANGE)
8686
0
  {
8687
0
    xl_parameter_change xlrec;
8688
8689
    /* Update our copy of the parameters in pg_control */
8690
0
    memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
8691
8692
    /*
8693
     * Invalidate logical slots if we are in hot standby and the primary
8694
     * does not have a WAL level sufficient for logical decoding. No need
8695
     * to search for potentially conflicting logical slots if standby is
8696
     * running with wal_level lower than logical, because in that case, we
8697
     * would have either disallowed creation of logical slots or
8698
     * invalidated existing ones.
8699
     */
8700
0
    if (InRecovery && InHotStandby &&
8701
0
      xlrec.wal_level < WAL_LEVEL_LOGICAL &&
8702
0
      wal_level >= WAL_LEVEL_LOGICAL)
8703
0
      InvalidateObsoleteReplicationSlots(RS_INVAL_WAL_LEVEL,
8704
0
                         0, InvalidOid,
8705
0
                         InvalidTransactionId);
8706
8707
0
    LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8708
0
    ControlFile->MaxConnections = xlrec.MaxConnections;
8709
0
    ControlFile->max_worker_processes = xlrec.max_worker_processes;
8710
0
    ControlFile->max_wal_senders = xlrec.max_wal_senders;
8711
0
    ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
8712
0
    ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
8713
0
    ControlFile->wal_level = xlrec.wal_level;
8714
0
    ControlFile->wal_log_hints = xlrec.wal_log_hints;
8715
8716
    /*
8717
     * Update minRecoveryPoint to ensure that if recovery is aborted, we
8718
     * recover back up to this point before allowing hot standby again.
8719
     * This is important if the max_* settings are decreased, to ensure
8720
     * you don't run queries against the WAL preceding the change. The
8721
     * local copies cannot be updated as long as crash recovery is
8722
     * happening and we expect all the WAL to be replayed.
8723
     */
8724
0
    if (InArchiveRecovery)
8725
0
    {
8726
0
      LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
8727
0
      LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
8728
0
    }
8729
0
    if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn)
8730
0
    {
8731
0
      TimeLineID  replayTLI;
8732
8733
0
      (void) GetCurrentReplayRecPtr(&replayTLI);
8734
0
      ControlFile->minRecoveryPoint = lsn;
8735
0
      ControlFile->minRecoveryPointTLI = replayTLI;
8736
0
    }
8737
8738
0
    CommitTsParameterChange(xlrec.track_commit_timestamp,
8739
0
                ControlFile->track_commit_timestamp);
8740
0
    ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
8741
8742
0
    UpdateControlFile();
8743
0
    LWLockRelease(ControlFileLock);
8744
8745
    /* Check to see if any parameter change gives a problem on recovery */
8746
0
    CheckRequiredParameterValues();
8747
0
  }
8748
0
  else if (info == XLOG_FPW_CHANGE)
8749
0
  {
8750
0
    bool    fpw;
8751
8752
0
    memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
8753
8754
    /*
8755
     * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
8756
     * do_pg_backup_start() and do_pg_backup_stop() can check whether
8757
     * full_page_writes has been disabled during online backup.
8758
     */
8759
0
    if (!fpw)
8760
0
    {
8761
0
      SpinLockAcquire(&XLogCtl->info_lck);
8762
0
      if (XLogCtl->lastFpwDisableRecPtr < record->ReadRecPtr)
8763
0
        XLogCtl->lastFpwDisableRecPtr = record->ReadRecPtr;
8764
0
      SpinLockRelease(&XLogCtl->info_lck);
8765
0
    }
8766
8767
    /* Keep track of full_page_writes */
8768
0
    lastFullPageWrites = fpw;
8769
0
  }
8770
0
  else if (info == XLOG_CHECKPOINT_REDO)
8771
0
  {
8772
    /* nothing to do here, just for informational purposes */
8773
0
  }
8774
0
}
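
For reference, the nextOid comment in the ONLINE-checkpoint branch above defers
OID tracking to XLOG_NEXTOID records.  A condensed sketch of that replay path,
following the XLOG_NEXTOID branch earlier in xlog_redo (no OID allocation
happens during replay, so the recorded value is simply believed):

    if (info == XLOG_NEXTOID)
    {
        Oid         nextOid;

        /* believe the record exactly; take OidGenLock as elsewhere */
        memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
        LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
        TransamVariables->nextOid = nextOid;
        TransamVariables->oidCount = 0;
        LWLockRelease(OidGenLock);
    }
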
8775
8776
/*
8777
 * Return the extra open flags used for opening a file, depending on the
8778
 * value of the GUCs wal_sync_method, fsync and debug_io_direct.
8779
 */
8780
static int
8781
get_sync_bit(int method)
8782
0
{
8783
0
  int     o_direct_flag = 0;
8784
8785
  /*
8786
   * Use O_DIRECT if requested, except in walreceiver process.  The WAL
8787
   * written by walreceiver is normally read by the startup process soon
8788
   * after it's written.  Also, walreceiver performs unaligned writes, which
8789
   * don't work with O_DIRECT, so skipping it there is required for correctness too.
8790
   */
8791
0
  if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess())
8792
0
    o_direct_flag = PG_O_DIRECT;
8793
8794
  /* If fsync is disabled, never open in sync mode */
8795
0
  if (!enableFsync)
8796
0
    return o_direct_flag;
8797
8798
0
  switch (method)
8799
0
  {
8800
      /*
8801
       * enum values for all sync options are defined even if they are
8802
       * not supported on the current platform.  But if not, they are
8803
       * not included in the enum option array, and therefore will never
8804
       * be seen here.
8805
       */
8806
0
    case WAL_SYNC_METHOD_FSYNC:
8807
0
    case WAL_SYNC_METHOD_FSYNC_WRITETHROUGH:
8808
0
    case WAL_SYNC_METHOD_FDATASYNC:
8809
0
      return o_direct_flag;
8810
0
#ifdef O_SYNC
8811
0
    case WAL_SYNC_METHOD_OPEN:
8812
0
      return O_SYNC | o_direct_flag;
8813
0
#endif
8814
0
#ifdef O_DSYNC
8815
0
    case WAL_SYNC_METHOD_OPEN_DSYNC:
8816
0
      return O_DSYNC | o_direct_flag;
8817
0
#endif
8818
0
    default:
8819
      /* can't happen (unless we are out of sync with option array) */
8820
0
      elog(ERROR, "unrecognized \"wal_sync_method\": %d", method);
8821
0
      return 0;     /* silence warning */
8822
0
  }
8823
0
}
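
Callers OR these bits into the base flags when opening a WAL segment.  A
minimal usage sketch, assuming tli and segno identify the target segment, and
using the XLogFilePath/BasicOpenFile helpers from elsewhere in the tree:

    char    path[MAXPGPATH];
    int     fd;

    XLogFilePath(path, tli, segno, wal_segment_size);
    fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(wal_sync_method));
    if (fd < 0)
        ereport(PANIC,
                (errcode_for_file_access(),
                 errmsg("could not open file \"%s\": %m", path)));
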
8824
8825
/*
8826
 * GUC support
8827
 */
8828
void
8829
assign_wal_sync_method(int new_wal_sync_method, void *extra)
8830
0
{
8831
0
  if (wal_sync_method != new_wal_sync_method)
8832
0
  {
8833
    /*
8834
     * To ensure that no blocks escape unsynced, force an fsync on the
8835
     * currently open log segment (if any).  Also, if the open flag is
8836
     * changing, close the log file so it will be reopened (with new flag
8837
     * bit) at next use.
8838
     */
8839
0
    if (openLogFile >= 0)
8840
0
    {
8841
0
      pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
8842
0
      if (pg_fsync(openLogFile) != 0)
8843
0
      {
8844
0
        char    xlogfname[MAXFNAMELEN];
8845
0
        int     save_errno;
8846
8847
0
        save_errno = errno;
8848
0
        XLogFileName(xlogfname, openLogTLI, openLogSegNo,
8849
0
               wal_segment_size);
8850
0
        errno = save_errno;
8851
0
        ereport(PANIC,
8852
0
            (errcode_for_file_access(),
8853
0
             errmsg("could not fsync file \"%s\": %m", xlogfname)));
8854
0
      }
8855
8856
0
      pgstat_report_wait_end();
8857
0
      if (get_sync_bit(wal_sync_method) != get_sync_bit(new_wal_sync_method))
8858
0
        XLogFileClose();
8859
0
    }
8860
0
  }
8861
0
}
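
The hook is wired up through the enum GUC table in guc_tables.c; a rough,
abbreviated sketch of the wal_sync_method entry (exact fields and grouping may
differ across versions):

    {
        {"wal_sync_method", PGC_SIGHUP, WAL_SETTINGS,
            gettext_noop("Selects the method used for forcing WAL updates to disk."),
            NULL
        },
        &wal_sync_method,
        DEFAULT_WAL_SYNC_METHOD, wal_sync_method_options,
        NULL, assign_wal_sync_method, NULL
    },
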
8862
8863
8864
/*
8865
 * Issue appropriate kind of fsync (if any) for an XLOG output file.
8866
 *
8867
 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
8868
 * 'segno' is for error reporting purposes.
8869
 */
8870
void
8871
issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli)
8872
0
{
8873
0
  char     *msg = NULL;
8874
0
  instr_time  start;
8875
8876
0
  Assert(tli != 0);
8877
8878
  /*
8879
   * Quick exit if fsync is disabled or write() has already synced the WAL
8880
   * file.
8881
   */
8882
0
  if (!enableFsync ||
8883
0
    wal_sync_method == WAL_SYNC_METHOD_OPEN ||
8884
0
    wal_sync_method == WAL_SYNC_METHOD_OPEN_DSYNC)
8885
0
    return;
8886
8887
  /*
8888
   * Measure I/O timing to sync the WAL file for pg_stat_io.
8889
   */
8890
0
  start = pgstat_prepare_io_time(track_wal_io_timing);
8891
8892
0
  pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
8893
0
  switch (wal_sync_method)
8894
0
  {
8895
0
    case WAL_SYNC_METHOD_FSYNC:
8896
0
      if (pg_fsync_no_writethrough(fd) != 0)
8897
0
        msg = _("could not fsync file \"%s\": %m");
8898
0
      break;
8899
#ifdef HAVE_FSYNC_WRITETHROUGH
8900
    case WAL_SYNC_METHOD_FSYNC_WRITETHROUGH:
8901
      if (pg_fsync_writethrough(fd) != 0)
8902
        msg = _("could not fsync write-through file \"%s\": %m");
8903
      break;
8904
#endif
8905
0
    case WAL_SYNC_METHOD_FDATASYNC:
8906
0
      if (pg_fdatasync(fd) != 0)
8907
0
        msg = _("could not fdatasync file \"%s\": %m");
8908
0
      break;
8909
0
    case WAL_SYNC_METHOD_OPEN:
8910
0
    case WAL_SYNC_METHOD_OPEN_DSYNC:
8911
      /* not reachable */
8912
0
      Assert(false);
8913
0
      break;
8914
0
    default:
8915
0
      ereport(PANIC,
8916
0
          errcode(ERRCODE_INVALID_PARAMETER_VALUE),
8917
0
          errmsg_internal("unrecognized \"wal_sync_method\": %d", wal_sync_method));
8918
0
      break;
8919
0
  }
8920
8921
  /* PANIC if failed to fsync */
8922
0
  if (msg)
8923
0
  {
8924
0
    char    xlogfname[MAXFNAMELEN];
8925
0
    int     save_errno = errno;
8926
8927
0
    XLogFileName(xlogfname, tli, segno, wal_segment_size);
8928
0
    errno = save_errno;
8929
0
    ereport(PANIC,
8930
0
        (errcode_for_file_access(),
8931
0
         errmsg(msg, xlogfname)));
8932
0
  }
8933
8934
0
  pgstat_report_wait_end();
8935
8936
0
  pgstat_count_io_op_time(IOOBJECT_WAL, IOCONTEXT_NORMAL, IOOP_FSYNC,
8937
0
              start, 1, 0);
8938
0
}
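
A sketch of the flush path as an XLogWrite-style caller would use it (tli
assumed to be the timeline of the open segment); with one of the open_* sync
methods, get_sync_bit() already made write() durable and this call returns
immediately:

    /* after write()ing WAL into the segment open on openLogFile ... */
    issue_xlog_fsync(openLogFile, openLogSegNo, tli);
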
8939
8940
/*
8941
 * do_pg_backup_start is the workhorse of the user-visible pg_backup_start()
8942
 * function. It creates the necessary starting checkpoint and constructs the
8943
 * backup state and tablespace map.
8944
 *
8945
 * Input parameters are "state" (the backup state), "fast" (if true, we do
8946
 * the checkpoint in immediate mode to make it faster), and "tablespaces"
8947
 * (if non-NULL, indicates a list of tablespaceinfo structs describing the
8948
  * cluster's tablespaces).
8949
 *
8950
 * The tablespace map contents are appended to passed-in parameter
8951
 * tablespace_map and the caller is responsible for including it in the backup
8952
 * archive as 'tablespace_map'. The tablespace_map file is required mainly for
8953
  * tar format on Windows, as native Windows utilities are not able to create
8954
 * symlinks while extracting files from tar. However for consistency and
8955
 * platform-independence, we do it the same way everywhere.
8956
 *
8957
 * It fills in "state" with the information required for the backup, such
8958
 * as the minimum WAL location that must be present to restore from this
8959
  * backup (startpoint) and the corresponding timeline ID (starttli).
8960
 *
8961
 * Every successfully started backup must be stopped by calling
8962
 * do_pg_backup_stop() or do_pg_abort_backup(). There can be many
8963
 * backups active at the same time.
8964
 *
8965
 * It is the responsibility of the caller of this function to verify the
8966
 * permissions of the calling user!
8967
 */
8968
void
8969
do_pg_backup_start(const char *backupidstr, bool fast, List **tablespaces,
8970
           BackupState *state, StringInfo tblspcmapfile)
8971
0
{
8972
0
  bool    backup_started_in_recovery;
8973
8974
0
  Assert(state != NULL);
8975
0
  backup_started_in_recovery = RecoveryInProgress();
8976
8977
  /*
8978
   * During recovery, we don't need to check WAL level, because if WAL
8979
   * level is not sufficient, it's impossible to get here during recovery.
8980
   */
8981
0
  if (!backup_started_in_recovery && !XLogIsNeeded())
8982
0
    ereport(ERROR,
8983
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8984
0
         errmsg("WAL level not sufficient for making an online backup"),
8985
0
         errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")));
8986
8987
0
  if (strlen(backupidstr) > MAXPGPATH)
8988
0
    ereport(ERROR,
8989
0
        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
8990
0
         errmsg("backup label too long (max %d bytes)",
8991
0
            MAXPGPATH)));
8992
8993
0
  strlcpy(state->name, backupidstr, sizeof(state->name));
8994
8995
  /*
8996
   * Mark backup active in shared memory.  We must do full-page WAL writes
8997
   * during an on-line backup even if not doing so at other times, because
8998
   * it's quite possible for the backup dump to obtain a "torn" (partially
8999
   * written) copy of a database page if it reads the page concurrently with
9000
   * our write to the same page.  This can be fixed as long as the first
9001
   * write to the page in the WAL sequence is a full-page write. Hence, we
9002
   * increment runningBackups then force a CHECKPOINT, to ensure there are
9003
   * no dirty pages in shared memory that might get dumped while the backup
9004
   * is in progress without having a corresponding WAL record.  (Once the
9005
   * backup is complete, we need not force full-page writes anymore, since
9006
   * we expect that any pages not modified during the backup interval must
9007
   * have been correctly captured by the backup.)
9008
   *
9009
   * Note that forcing full-page writes has no effect during an online
9010
   * backup from the standby.
9011
   *
9012
   * We must hold all the insertion locks to change the value of
9013
   * runningBackups, to ensure adequate interlocking against
9014
   * XLogInsertRecord().
9015
   */
9016
0
  WALInsertLockAcquireExclusive();
9017
0
  XLogCtl->Insert.runningBackups++;
9018
0
  WALInsertLockRelease();
9019
9020
  /*
9021
   * Ensure we decrement runningBackups if we fail below. NB -- for this to
9022
   * work correctly, it is critical that sessionBackupState is only updated
9023
   * after this block is over.
9024
   */
9025
0
  PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(true));
9026
0
  {
9027
0
    bool    gotUniqueStartpoint = false;
9028
0
    DIR      *tblspcdir;
9029
0
    struct dirent *de;
9030
0
    tablespaceinfo *ti;
9031
0
    int     datadirpathlen;
9032
9033
    /*
9034
     * Force an XLOG file switch before the checkpoint, to ensure that the
9035
     * WAL segment the checkpoint is written to doesn't contain pages with
9036
     * old timeline IDs.  That would otherwise happen if you called
9037
     * pg_backup_start() right after restoring from a PITR archive: the
9038
     * first WAL segment containing the startup checkpoint has pages in
9039
     * the beginning with the old timeline ID.  That can cause trouble at
9040
     * recovery: we won't have a history file covering the old timeline if
9041
     * pg_wal directory was not included in the base backup and the WAL
9042
     * archive was cleared too before starting the backup.
9043
     *
9044
     * This also ensures that we have emitted a WAL page header that has
9045
     * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
9046
     * Therefore, if a WAL archiver (such as pglesslog) is trying to
9047
     * compress out removable backup blocks, it won't remove any that
9048
     * occur after this point.
9049
     *
9050
     * During recovery, we skip forcing XLOG file switch, which means that
9051
     * the backup taken during recovery is not available for the special
9052
     * recovery case described above.
9053
     */
9054
0
    if (!backup_started_in_recovery)
9055
0
      RequestXLogSwitch(false);
9056
9057
0
    do
9058
0
    {
9059
0
      bool    checkpointfpw;
9060
9061
      /*
9062
       * Force a CHECKPOINT.  Aside from being necessary to prevent torn
9063
       * page problems, this guarantees that two successive backup runs
9064
       * will have different checkpoint positions and hence different
9065
       * history file names, even if nothing happened in between.
9066
       *
9067
       * During recovery, establish a restartpoint if possible. We use
9068
       * the last restartpoint as the backup starting checkpoint. This
9069
       * means that two successive backup runs can have the same checkpoint
9070
       * positions.
9071
       *
9072
       * Since the fact that we are executing do_pg_backup_start()
9073
       * during recovery means that checkpointer is running, we can use
9074
       * RequestCheckpoint() to establish a restartpoint.
9075
       *
9076
       * We use CHECKPOINT_IMMEDIATE only if requested by user (via
9077
       * passing fast = true).  Otherwise this can take a while.
9078
       */
9079
0
      RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
9080
0
                (fast ? CHECKPOINT_IMMEDIATE : 0));
9081
9082
      /*
9083
       * Now we need to fetch the checkpoint record location, and also
9084
       * its REDO pointer.  The oldest point in WAL that would be needed
9085
       * to restore starting from the checkpoint is precisely the REDO
9086
       * pointer.
9087
       */
9088
0
      LWLockAcquire(ControlFileLock, LW_SHARED);
9089
0
      state->checkpointloc = ControlFile->checkPoint;
9090
0
      state->startpoint = ControlFile->checkPointCopy.redo;
9091
0
      state->starttli = ControlFile->checkPointCopy.ThisTimeLineID;
9092
0
      checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
9093
0
      LWLockRelease(ControlFileLock);
9094
9095
0
      if (backup_started_in_recovery)
9096
0
      {
9097
0
        XLogRecPtr  recptr;
9098
9099
        /*
9100
         * Check to see if all WAL replayed during online backup
9101
         * (i.e., since last restartpoint used as backup starting
9102
         * checkpoint) contain full-page writes.
9103
         */
9104
0
        SpinLockAcquire(&XLogCtl->info_lck);
9105
0
        recptr = XLogCtl->lastFpwDisableRecPtr;
9106
0
        SpinLockRelease(&XLogCtl->info_lck);
9107
9108
0
        if (!checkpointfpw || state->startpoint <= recptr)
9109
0
          ereport(ERROR,
9110
0
              (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9111
0
               errmsg("WAL generated with \"full_page_writes=off\" was replayed "
9112
0
                  "since last restartpoint"),
9113
0
               errhint("This means that the backup being taken on the standby "
9114
0
                   "is corrupt and should not be used. "
9115
0
                   "Enable \"full_page_writes\" and run CHECKPOINT on the primary, "
9116
0
                   "and then try an online backup again.")));
9117
9118
        /*
9119
         * During recovery, since we don't use the end-of-backup WAL
9120
         * record and don't write the backup history file, the
9121
         * starting WAL location doesn't need to be unique. This means
9122
         * that two base backups started at the same time might use
9123
         * the same checkpoint as starting locations.
9124
         */
9125
0
        gotUniqueStartpoint = true;
9126
0
      }
9127
9128
      /*
9129
       * If two base backups are started at the same time (in WAL sender
9130
       * processes), we need to make sure that they use different
9131
       * checkpoints as starting locations, because we use the starting
9132
       * WAL location as a unique identifier for the base backup in the
9133
       * end-of-backup WAL record and when we write the backup history
9134
       * file. Perhaps it would be better to generate a separate unique ID
9135
       * for each backup instead of forcing another checkpoint, but
9136
       * taking a checkpoint right after another is not that expensive
9137
       * either because only a few buffers have been dirtied yet.
9138
       */
9139
0
      WALInsertLockAcquireExclusive();
9140
0
      if (XLogCtl->Insert.lastBackupStart < state->startpoint)
9141
0
      {
9142
0
        XLogCtl->Insert.lastBackupStart = state->startpoint;
9143
0
        gotUniqueStartpoint = true;
9144
0
      }
9145
0
      WALInsertLockRelease();
9146
0
    } while (!gotUniqueStartpoint);
9147
9148
    /*
9149
     * Construct tablespace_map file.
9150
     */
9151
0
    datadirpathlen = strlen(DataDir);
9152
9153
    /* Collect information about all tablespaces */
9154
0
    tblspcdir = AllocateDir(PG_TBLSPC_DIR);
9155
0
    while ((de = ReadDir(tblspcdir, PG_TBLSPC_DIR)) != NULL)
9156
0
    {
9157
0
      char    fullpath[MAXPGPATH + sizeof(PG_TBLSPC_DIR)];
9158
0
      char    linkpath[MAXPGPATH];
9159
0
      char     *relpath = NULL;
9160
0
      char     *s;
9161
0
      PGFileType  de_type;
9162
0
      char     *badp;
9163
0
      Oid     tsoid;
9164
9165
      /*
9166
       * Try to parse the directory name as an unsigned integer.
9167
       *
9168
       * Tablespace directories should be positive integers that can be
9169
       * represented in 32 bits, with no leading zeroes or trailing
9170
       * garbage. If we come across a name that doesn't meet those
9171
       * criteria, skip it.
9172
       */
9173
0
      if (de->d_name[0] < '1' || de->d_name[0] > '9')
9174
0
        continue;
9175
0
      errno = 0;
9176
0
      tsoid = strtoul(de->d_name, &badp, 10);
9177
0
      if (*badp != '\0' || errno == EINVAL || errno == ERANGE)
9178
0
        continue;
9179
9180
0
      snprintf(fullpath, sizeof(fullpath), "%s/%s", PG_TBLSPC_DIR, de->d_name);
9181
9182
0
      de_type = get_dirent_type(fullpath, de, false, ERROR);
9183
9184
0
      if (de_type == PGFILETYPE_LNK)
9185
0
      {
9186
0
        StringInfoData escapedpath;
9187
0
        int     rllen;
9188
9189
0
        rllen = readlink(fullpath, linkpath, sizeof(linkpath));
9190
0
        if (rllen < 0)
9191
0
        {
9192
0
          ereport(WARNING,
9193
0
              (errmsg("could not read symbolic link \"%s\": %m",
9194
0
                  fullpath)));
9195
0
          continue;
9196
0
        }
9197
0
        else if (rllen >= sizeof(linkpath))
9198
0
        {
9199
0
          ereport(WARNING,
9200
0
              (errmsg("symbolic link \"%s\" target is too long",
9201
0
                  fullpath)));
9202
0
          continue;
9203
0
        }
9204
0
        linkpath[rllen] = '\0';
9205
9206
        /*
9207
         * Relpath holds the relative path of the tablespace directory
9208
         * when it's located within PGDATA, or NULL if it's located
9209
         * elsewhere.
9210
         */
9211
0
        if (rllen > datadirpathlen &&
9212
0
          strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
9213
0
          IS_DIR_SEP(linkpath[datadirpathlen]))
9214
0
          relpath = pstrdup(linkpath + datadirpathlen + 1);
9215
9216
        /*
9217
         * Add a backslash-escaped version of the link path to the
9218
         * tablespace map file.
9219
         */
9220
0
        initStringInfo(&escapedpath);
9221
0
        for (s = linkpath; *s; s++)
9222
0
        {
9223
0
          if (*s == '\n' || *s == '\r' || *s == '\\')
9224
0
            appendStringInfoChar(&escapedpath, '\\');
9225
0
          appendStringInfoChar(&escapedpath, *s);
9226
0
        }
9227
0
        appendStringInfo(tblspcmapfile, "%s %s\n",
9228
0
                 de->d_name, escapedpath.data);
9229
0
        pfree(escapedpath.data);
9230
0
      }
9231
0
      else if (de_type == PGFILETYPE_DIR)
9232
0
      {
9233
        /*
9234
         * It's possible to use allow_in_place_tablespaces to create
9235
         * directories directly under pg_tblspc, for testing purposes
9236
         * only.
9237
         *
9238
         * In this case, we store a relative path rather than an
9239
         * absolute path into the tablespaceinfo.
9240
         */
9241
0
        snprintf(linkpath, sizeof(linkpath), "%s/%s",
9242
0
             PG_TBLSPC_DIR, de->d_name);
9243
0
        relpath = pstrdup(linkpath);
9244
0
      }
9245
0
      else
9246
0
      {
9247
        /* Skip any other file type that appears here. */
9248
0
        continue;
9249
0
      }
9250
9251
0
      ti = palloc(sizeof(tablespaceinfo));
9252
0
      ti->oid = tsoid;
9253
0
      ti->path = pstrdup(linkpath);
9254
0
      ti->rpath = relpath;
9255
0
      ti->size = -1;
9256
9257
0
      if (tablespaces)
9258
0
        *tablespaces = lappend(*tablespaces, ti);
9259
0
    }
9260
0
    FreeDir(tblspcdir);
9261
9262
0
    state->starttime = (pg_time_t) time(NULL);
9263
0
  }
9264
0
  PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(true));
9265
9266
0
  state->started_in_recovery = backup_started_in_recovery;
9267
9268
  /*
9269
   * Mark that the start phase has correctly finished for the backup.
9270
   */
9271
0
  sessionBackupState = SESSION_BACKUP_RUNNING;
9272
0
}
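
A sketch of the caller-side start flow, loosely following pg_backup_start() in
xlogfuncs.c (the real wrapper also keeps the state in a longer-lived memory
context; backupidstr and fast stand for the user-supplied arguments):

    BackupState *backup_state = (BackupState *) palloc0(sizeof(BackupState));
    StringInfo  tablespace_map = makeStringInfo();

    register_persistent_abort_backup_handler();
    do_pg_backup_start(backupidstr, fast, NULL, backup_state, tablespace_map);

    /* tablespace_map now holds one "<oid> <escaped link target>" line per
     * tablespace, e.g. "16395 /mnt/ts1" (OID and path invented here) */
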
9273
9274
/*
9275
 * Utility routine to fetch the session-level status of a running backup.
9276
 */
9277
SessionBackupState
9278
get_backup_status(void)
9279
0
{
9280
0
  return sessionBackupState;
9281
0
}
9282
9283
/*
9284
 * do_pg_backup_stop
9285
 *
9286
 * Utility function called at the end of an online backup.  It creates the
9287
 * history file (if required), resets sessionBackupState, and so on.  It can
9288
 * optionally wait for WAL segments to be archived.
9289
 *
9290
 * "state" is filled with the information necessary to restore from this
9291
 * backup with its stop LSN (stoppoint), its timeline ID (stoptli), etc.
9292
 *
9293
 * It is the responsibility of the caller of this function to verify the
9294
 * permissions of the calling user!
9295
 */
9296
void
9297
do_pg_backup_stop(BackupState *state, bool waitforarchive)
9298
0
{
9299
0
  bool    backup_stopped_in_recovery = false;
9300
0
  char    histfilepath[MAXPGPATH];
9301
0
  char    lastxlogfilename[MAXFNAMELEN];
9302
0
  char    histfilename[MAXFNAMELEN];
9303
0
  XLogSegNo _logSegNo;
9304
0
  FILE     *fp;
9305
0
  int     seconds_before_warning;
9306
0
  int     waits = 0;
9307
0
  bool    reported_waiting = false;
9308
9309
0
  Assert(state != NULL);
9310
9311
0
  backup_stopped_in_recovery = RecoveryInProgress();
9312
9313
  /*
9314
   * During recovery, we don't need to check WAL level. Because, if WAL
9315
   * level is not sufficient, it's impossible to get here during recovery.
9316
   */
9317
0
  if (!backup_stopped_in_recovery && !XLogIsNeeded())
9318
0
    ereport(ERROR,
9319
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9320
0
         errmsg("WAL level not sufficient for making an online backup"),
9321
0
         errhint("\"wal_level\" must be set to \"replica\" or \"logical\" at server start.")));
9322
9323
  /*
9324
   * OK to update backup counter and session-level lock.
9325
   *
9326
   * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them,
9327
   * otherwise they can be updated inconsistently, which might cause
9328
   * do_pg_abort_backup() to fail.
9329
   */
9330
0
  WALInsertLockAcquireExclusive();
9331
9332
  /*
9333
   * It is expected that each do_pg_backup_start() call is matched by
9334
   * exactly one do_pg_backup_stop() call.
9335
   */
9336
0
  Assert(XLogCtl->Insert.runningBackups > 0);
9337
0
  XLogCtl->Insert.runningBackups--;
9338
9339
  /*
9340
   * Clean up session-level lock.
9341
   *
9342
   * You might think that WALInsertLockRelease() can be called before
9343
   * cleaning up session-level lock because session-level lock doesn't need
9344
   * to be protected with WAL insertion lock. But since
9345
   * CHECK_FOR_INTERRUPTS() can occur inside WALInsertLockRelease(), the
9346
   * session-level lock must be cleaned up before calling it.
9347
   */
9348
0
  sessionBackupState = SESSION_BACKUP_NONE;
9349
9350
0
  WALInsertLockRelease();
9351
9352
  /*
9353
   * If we are taking an online backup from the standby, we confirm that the
9354
   * standby has not been promoted during the backup.
9355
   */
9356
0
  if (state->started_in_recovery && !backup_stopped_in_recovery)
9357
0
    ereport(ERROR,
9358
0
        (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9359
0
         errmsg("the standby was promoted during online backup"),
9360
0
         errhint("This means that the backup being taken is corrupt "
9361
0
             "and should not be used. "
9362
0
             "Try taking another online backup.")));
9363
9364
  /*
9365
   * During recovery, we don't write an end-of-backup record. We assume that
9366
   * pg_control was backed up last and its minimum recovery point can be
9367
   * available as the backup end location. Since we don't have an
9368
   * end-of-backup record, we use the pg_control value to check whether
9369
   * we've reached the end of backup when starting recovery from this
9370
   * backup. We have no way of checking if pg_control wasn't backed up last,
9371
   * however.
9372
   *
9373
   * We don't force a switch to new WAL file but it is still possible to
9374
   * wait for all the required files to be archived if waitforarchive is
9375
   * true. This is okay if we use the backup to start a standby and fetch
9376
   * the missing WAL using streaming replication. But in the case of an
9377
   * archive recovery, a user should set waitforarchive to true and wait for
9378
   * them to be archived to ensure that all the required files are
9379
   * available.
9380
   *
9381
   * We return the current minimum recovery point as the backup end
9382
   * location. Note that it can be greater than the exact backup end
9383
   * location if the minimum recovery point is updated after the backup of
9384
   * pg_control. This is harmless for current uses.
9385
   *
9386
   * XXX currently a backup history file is for informational and debug
9387
   * purposes only. It's not essential for an online backup. Furthermore,
9388
   * even if it's created, it will not be archived during recovery because
9389
   * an archiver is not invoked. So it doesn't seem worthwhile to write a
9390
   * backup history file during recovery.
9391
   */
9392
0
  if (backup_stopped_in_recovery)
9393
0
  {
9394
0
    XLogRecPtr  recptr;
9395
9396
    /*
9397
     * Check to see if all WAL replayed during online backup contain
9398
     * full-page writes.
9399
     */
9400
0
    SpinLockAcquire(&XLogCtl->info_lck);
9401
0
    recptr = XLogCtl->lastFpwDisableRecPtr;
9402
0
    SpinLockRelease(&XLogCtl->info_lck);
9403
9404
0
    if (state->startpoint <= recptr)
9405
0
      ereport(ERROR,
9406
0
          (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9407
0
           errmsg("WAL generated with \"full_page_writes=off\" was replayed "
9408
0
              "during online backup"),
9409
0
           errhint("This means that the backup being taken on the standby "
9410
0
               "is corrupt and should not be used. "
9411
0
               "Enable \"full_page_writes\" and run CHECKPOINT on the primary, "
9412
0
               "and then try an online backup again.")));
9413
9414
9415
0
    LWLockAcquire(ControlFileLock, LW_SHARED);
9416
0
    state->stoppoint = ControlFile->minRecoveryPoint;
9417
0
    state->stoptli = ControlFile->minRecoveryPointTLI;
9418
0
    LWLockRelease(ControlFileLock);
9419
0
  }
9420
0
  else
9421
0
  {
9422
0
    char     *history_file;
9423
9424
    /*
9425
     * Write the backup-end xlog record
9426
     */
9427
0
    XLogBeginInsert();
9428
0
    XLogRegisterData(&state->startpoint,
9429
0
             sizeof(state->startpoint));
9430
0
    state->stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
9431
9432
    /*
9433
     * Given that we're not in recovery, InsertTimeLineID is set and can't
9434
     * change, so we can read it without a lock.
9435
     */
9436
0
    state->stoptli = XLogCtl->InsertTimeLineID;
9437
9438
    /*
9439
     * Force a switch to a new xlog segment file, so that the backup is
9440
     * valid as soon as archiver moves out the current segment file.
9441
     */
9442
0
    RequestXLogSwitch(false);
9443
9444
0
    state->stoptime = (pg_time_t) time(NULL);
9445
9446
    /*
9447
     * Write the backup history file
9448
     */
9449
0
    XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
9450
0
    BackupHistoryFilePath(histfilepath, state->stoptli, _logSegNo,
9451
0
                state->startpoint, wal_segment_size);
9452
0
    fp = AllocateFile(histfilepath, "w");
9453
0
    if (!fp)
9454
0
      ereport(ERROR,
9455
0
          (errcode_for_file_access(),
9456
0
           errmsg("could not create file \"%s\": %m",
9457
0
              histfilepath)));
9458
9459
    /* Build and save the contents of the backup history file */
9460
0
    history_file = build_backup_content(state, true);
9461
0
    fprintf(fp, "%s", history_file);
9462
0
    pfree(history_file);
9463
9464
0
    if (fflush(fp) || ferror(fp) || FreeFile(fp))
9465
0
      ereport(ERROR,
9466
0
          (errcode_for_file_access(),
9467
0
           errmsg("could not write file \"%s\": %m",
9468
0
              histfilepath)));
9469
9470
    /*
9471
     * Clean out any no-longer-needed history files.  As a side effect,
9472
     * this will post a .ready file for the newly created history file,
9473
     * notifying the archiver that the history file may be archived
9474
     * immediately.
9475
     */
9476
0
    CleanupBackupHistory();
9477
0
  }
9478
9479
  /*
9480
   * If archiving is enabled, wait for all the required WAL files to be
9481
   * archived before returning. If archiving isn't enabled, the required WAL
9482
   * needs to be transported via streaming replication (hopefully with
9483
   * wal_keep_size set high enough), or some more exotic mechanism like
9484
   * polling and copying files from pg_wal with a script. We have no knowledge
9485
   * of those mechanisms, so it's up to the user to ensure that they get all
9486
   * the required WAL.
9487
   *
9488
   * We wait until both the last WAL file filled during backup and the
9489
   * history file have been archived, and assume that the alphabetic sorting
9490
   * property of the WAL files ensures any earlier WAL files are safely
9491
   * archived as well.
9492
   *
9493
   * We wait forever, since archive_command is supposed to work and we
9494
   * assume the admin wanted their backup to work completely. If you don't
9495
   * wish to wait, then either waitforarchive should be passed in as false,
9496
   * or you can set statement_timeout.  Also, some notices are issued to
9497
   * clue in anyone who might be doing this interactively.
9498
   */
9499
9500
0
  if (waitforarchive &&
9501
0
    ((!backup_stopped_in_recovery && XLogArchivingActive()) ||
9502
0
     (backup_stopped_in_recovery && XLogArchivingAlways())))
9503
0
  {
9504
0
    XLByteToPrevSeg(state->stoppoint, _logSegNo, wal_segment_size);
9505
0
    XLogFileName(lastxlogfilename, state->stoptli, _logSegNo,
9506
0
           wal_segment_size);
9507
9508
0
    XLByteToSeg(state->startpoint, _logSegNo, wal_segment_size);
9509
0
    BackupHistoryFileName(histfilename, state->stoptli, _logSegNo,
9510
0
                state->startpoint, wal_segment_size);
9511
9512
0
    seconds_before_warning = 60;
9513
0
    waits = 0;
9514
9515
0
    while (XLogArchiveIsBusy(lastxlogfilename) ||
9516
0
         XLogArchiveIsBusy(histfilename))
9517
0
    {
9518
0
      CHECK_FOR_INTERRUPTS();
9519
9520
0
      if (!reported_waiting && waits > 5)
9521
0
      {
9522
0
        ereport(NOTICE,
9523
0
            (errmsg("base backup done, waiting for required WAL segments to be archived")));
9524
0
        reported_waiting = true;
9525
0
      }
9526
9527
0
      (void) WaitLatch(MyLatch,
9528
0
               WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
9529
0
               1000L,
9530
0
               WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
9531
0
      ResetLatch(MyLatch);
9532
9533
0
      if (++waits >= seconds_before_warning)
9534
0
      {
9535
0
        seconds_before_warning *= 2;  /* This wraps in >10 years... */
9536
0
        ereport(WARNING,
9537
0
            (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
9538
0
                waits),
9539
0
             errhint("Check that your \"archive_command\" is executing properly.  "
9540
0
                 "You can safely cancel this backup, "
9541
0
                 "but the database backup will not be usable without all the WAL segments.")));
9542
0
      }
9543
0
    }
9544
9545
0
    ereport(NOTICE,
9546
0
        (errmsg("all required WAL segments have been archived")));
9547
0
  }
9548
0
  else if (waitforarchive)
9549
0
    ereport(NOTICE,
9550
0
        (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
9551
0
}
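
And the matching stop-side sketch, loosely following pg_backup_stop();
build_backup_content() with ishistoryfile = false yields the backup_label
contents that the caller must store with the backup:

    char       *backup_label;

    do_pg_backup_stop(backup_state, waitforarchive);
    backup_label = build_backup_content(backup_state, false);
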
9552
9553
9554
/*
9555
 * do_pg_abort_backup: abort a running backup
9556
 *
9557
 * This does just the most basic steps of do_pg_backup_stop(), by taking the
9558
 * system out of backup mode, thus making it a lot safer to call from
9559
 * an error handler.
9560
 *
9561
 * A true 'arg' indicates that it's being called during backup setup; so
9562
 * sessionBackupState has not been modified yet, but runningBackups has
9563
 * already been incremented.  When it's false, then it's invoked as a
9564
 * before_shmem_exit handler, and therefore we must not change state
9565
 * unless sessionBackupState indicates that a backup is actually running.
9566
 *
9567
 * NB: This gets used as a PG_ENSURE_ERROR_CLEANUP callback and
9568
 * before_shmem_exit handler, hence the odd-looking signature.
9569
 */
9570
void
9571
do_pg_abort_backup(int code, Datum arg)
9572
0
{
9573
0
  bool    during_backup_start = DatumGetBool(arg);
9574
9575
  /* If called during backup start, there shouldn't be one already running */
9576
0
  Assert(!during_backup_start || sessionBackupState == SESSION_BACKUP_NONE);
9577
9578
0
  if (during_backup_start || sessionBackupState != SESSION_BACKUP_NONE)
9579
0
  {
9580
0
    WALInsertLockAcquireExclusive();
9581
0
    Assert(XLogCtl->Insert.runningBackups > 0);
9582
0
    XLogCtl->Insert.runningBackups--;
9583
9584
0
    sessionBackupState = SESSION_BACKUP_NONE;
9585
0
    WALInsertLockRelease();
9586
9587
0
    if (!during_backup_start)
9588
0
      ereport(WARNING,
9589
0
          errmsg("aborting backup due to backend exiting before pg_backup_stop was called"));
9590
0
  }
9591
0
}
9592
9593
/*
9594
 * Register a handler that will warn about unterminated backups at end of
9595
 * session, unless this has already been done.
9596
 */
9597
void
9598
register_persistent_abort_backup_handler(void)
9599
0
{
9600
0
  static bool already_done = false;
9601
9602
0
  if (already_done)
9603
0
    return;
9604
0
  before_shmem_exit(do_pg_abort_backup, BoolGetDatum(false));
9605
0
  already_done = true;
9606
0
}
9607
9608
/*
9609
 * Get latest WAL insert pointer
9610
 */
9611
XLogRecPtr
9612
GetXLogInsertRecPtr(void)
9613
0
{
9614
0
  XLogCtlInsert *Insert = &XLogCtl->Insert;
9615
0
  uint64    current_bytepos;
9616
9617
0
  SpinLockAcquire(&Insert->insertpos_lck);
9618
0
  current_bytepos = Insert->CurrBytePos;
9619
0
  SpinLockRelease(&Insert->insertpos_lck);
9620
9621
0
  return XLogBytePosToRecPtr(current_bytepos);
9622
0
}
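
For example, the SQL-callable pg_current_wal_insert_lsn() is essentially a
thin wrapper around this function; a sketch omitting the real function's
recovery-in-progress check:

    Datum
    pg_current_wal_insert_lsn(PG_FUNCTION_ARGS)
    {
        PG_RETURN_LSN(GetXLogInsertRecPtr());
    }
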
9623
9624
/*
9625
 * Get latest WAL write pointer
9626
 */
9627
XLogRecPtr
9628
GetXLogWriteRecPtr(void)
9629
0
{
9630
0
  RefreshXLogWriteResult(LogwrtResult);
9631
9632
0
  return LogwrtResult.Write;
9633
0
}
9634
9635
/*
9636
 * Returns the redo pointer of the last checkpoint or restartpoint. This is
9637
 * the oldest point in WAL that we still need, if we have to restart recovery.
9638
 */
9639
void
9640
GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
9641
0
{
9642
0
  LWLockAcquire(ControlFileLock, LW_SHARED);
9643
0
  *oldrecptr = ControlFile->checkPointCopy.redo;
9644
0
  *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
9645
0
  LWLockRelease(ControlFileLock);
9646
0
}
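
Among other things, this feeds the %r substitution in restore_command; a
sketch of the pattern used by RestoreArchivedFile() in xlogarchive.c, with
identifier names assumed from that file:

    XLogRecPtr  restartRedoPtr;
    TimeLineID  restartTli;
    XLogSegNo   restartSegNo;
    char        lastRestartPointFname[MAXFNAMELEN];

    GetOldestRestartPoint(&restartRedoPtr, &restartTli);
    XLByteToSeg(restartRedoPtr, restartSegNo, wal_segment_size);
    XLogFileName(lastRestartPointFname, restartTli, restartSegNo,
                 wal_segment_size);
    /* WAL files sorting before lastRestartPointFname are safe to remove */
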
9647
9648
/* Thin wrapper around ShutdownWalRcv(). */
9649
void
9650
XLogShutdownWalRcv(void)
9651
0
{
9652
0
  ShutdownWalRcv();
9653
9654
0
  LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9655
0
  XLogCtl->InstallXLogFileSegmentActive = false;
9656
0
  LWLockRelease(ControlFileLock);
9657
0
}
9658
9659
/* Enable WAL file recycling and preallocation. */
9660
void
9661
SetInstallXLogFileSegmentActive(void)
9662
0
{
9663
0
  LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9664
0
  XLogCtl->InstallXLogFileSegmentActive = true;
9665
0
  LWLockRelease(ControlFileLock);
9666
0
}
9667
9668
bool
9669
IsInstallXLogFileSegmentActive(void)
9670
0
{
9671
0
  bool    result;
9672
9673
0
  LWLockAcquire(ControlFileLock, LW_SHARED);
9674
0
  result = XLogCtl->InstallXLogFileSegmentActive;
9675
0
  LWLockRelease(ControlFileLock);
9676
9677
0
  return result;
9678
0
}
9679
9680
/*
9681
 * Update the WalWriterSleeping flag.
9682
 */
9683
void
9684
SetWalWriterSleeping(bool sleeping)
9685
0
{
9686
0
  SpinLockAcquire(&XLogCtl->info_lck);
9687
0
  XLogCtl->WalWriterSleeping = sleeping;
9688
0
  SpinLockRelease(&XLogCtl->info_lck);
9689
0
}