Coverage Report

Created: 2025-06-13 06:06

/src/postgres/src/backend/postmaster/walsummarizer.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * walsummarizer.c
4
 *
5
 * Background process to perform WAL summarization, if it is enabled.
6
 * It continuously scans the write-ahead log and periodically emits a
7
 * summary file which indicates which blocks in which relation forks
8
 * were modified by WAL records in the LSN range covered by the summary
9
 * file. See walsummary.c and blkreftable.c for more details on the
10
 * naming and contents of WAL summary files.
11
 *
12
 * If configured to do so, this background process will also remove WAL
13
 * summary files when the file timestamp is older than a configurable
14
 * threshold (but only if the WAL has been removed first).
15
 *
16
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
17
 *
18
 * IDENTIFICATION
19
 *    src/backend/postmaster/walsummarizer.c
20
 *
21
 *-------------------------------------------------------------------------
22
 */
23
#include "postgres.h"
24
25
#include "access/timeline.h"
26
#include "access/xlog.h"
27
#include "access/xlog_internal.h"
28
#include "access/xlogrecovery.h"
29
#include "access/xlogutils.h"
30
#include "backup/walsummary.h"
31
#include "catalog/storage_xlog.h"
32
#include "commands/dbcommands_xlog.h"
33
#include "common/blkreftable.h"
34
#include "libpq/pqsignal.h"
35
#include "miscadmin.h"
36
#include "pgstat.h"
37
#include "postmaster/auxprocess.h"
38
#include "postmaster/interrupt.h"
39
#include "postmaster/walsummarizer.h"
40
#include "replication/walreceiver.h"
41
#include "storage/aio_subsys.h"
42
#include "storage/fd.h"
43
#include "storage/ipc.h"
44
#include "storage/latch.h"
45
#include "storage/lwlock.h"
46
#include "storage/proc.h"
47
#include "storage/procsignal.h"
48
#include "storage/shmem.h"
49
#include "utils/guc.h"
50
#include "utils/memutils.h"
51
#include "utils/wait_event.h"
52
53
/*
54
 * Data in shared memory related to WAL summarization.
55
 */
56
typedef struct
57
{
58
  /*
59
   * These fields are protected by WALSummarizerLock.
60
   *
61
   * Until we've discovered what summary files already exist on disk and
62
   * stored that information in shared memory, initialized is false and the
63
   * other fields here contain no meaningful information. After that has
64
   * been done, initialized is true.
65
   *
66
   * summarized_tli and summarized_lsn indicate the TLI and LSN at
67
   * which the next summary file will start. Normally, these are the LSN and
68
   * TLI at which the last file ended; in such case, lsn_is_exact is true.
69
   * If, however, the LSN is just an approximation, then lsn_is_exact is
70
   * false. This can happen if, for example, there are no existing WAL
71
   * summary files at startup. In that case, we have to derive the position
72
   * at which to start summarizing from the WAL files that exist on disk,
73
   * and so the LSN might point to the start of the next file even though
74
   * that might happen to be in the middle of a WAL record.
75
   *
76
   * summarizer_pgprocno is the proc number of the summarizer process, if
77
   * one is running, or else INVALID_PROC_NUMBER.
78
   *
79
   * pending_lsn is used by the summarizer to advertise the ending LSN of a
80
   * record it has recently read. It shouldn't ever be less than
81
   * summarized_lsn, but might be greater, because the summarizer buffers
82
   * data for a range of LSNs in memory before writing out a new file.
83
   */
84
  bool    initialized;
85
  TimeLineID  summarized_tli;
86
  XLogRecPtr  summarized_lsn;
87
  bool    lsn_is_exact;
88
  ProcNumber  summarizer_pgprocno;
89
  XLogRecPtr  pending_lsn;
90
91
  /*
92
   * This field handles its own synchronization.
93
   */
94
  ConditionVariable summary_file_cv;
95
} WalSummarizerData;
96
97
/*
98
 * Private data for our xlogreader's page read callback.
99
 */
100
typedef struct
101
{
102
  TimeLineID  tli;
103
  bool    historic;
104
  XLogRecPtr  read_upto;
105
  bool    end_of_wal;
106
} SummarizerReadLocalXLogPrivate;
107
108
/* Pointer to shared memory state. */
109
static WalSummarizerData *WalSummarizerCtl;
110
111
/*
112
 * When we reach end of WAL and need to read more, we sleep for a number of
113
 * milliseconds that is an integer multiple of MS_PER_SLEEP_QUANTUM. This is
114
 * the multiplier. It should vary between 1 and MAX_SLEEP_QUANTA, depending
115
 * on system activity. See summarizer_wait_for_wal() for how we adjust this.
116
 */
117
static long sleep_quanta = 1;
118
119
/*
120
 * The sleep time will always be a multiple of 200ms and will not exceed
121
 * thirty seconds (150 * 200 = 30 * 1000). Note that the timeout here needs
122
 * to be substantially less than the maximum amount of time for which an
123
 * incremental backup will wait for this process to catch up. Otherwise, an
124
 * incremental backup might time out on an idle system just because we sleep
125
 * for too long.
126
 */
127
#define MAX_SLEEP_QUANTA    150
128
0
#define MS_PER_SLEEP_QUANTUM  200
129
130
/*
131
 * This is a count of the number of pages of WAL that we've read since the
132
 * last time we waited for more WAL to appear.
133
 */
134
static long pages_read_since_last_sleep = 0;
135
136
/*
137
 * Most recent RedoRecPtr value observed by MaybeRemoveOldWalSummaries.
138
 */
139
static XLogRecPtr redo_pointer_at_last_summary_removal = InvalidXLogRecPtr;
140
141
/*
142
 * GUC parameters
143
 */
144
bool    summarize_wal = false;
145
int     wal_summary_keep_time = 10 * HOURS_PER_DAY * MINS_PER_HOUR;
146
147
static void WalSummarizerShutdown(int code, Datum arg);
148
static XLogRecPtr GetLatestLSN(TimeLineID *tli);
149
static void ProcessWalSummarizerInterrupts(void);
150
static XLogRecPtr SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn,
151
                 bool exact, XLogRecPtr switch_lsn,
152
                 XLogRecPtr maximum_lsn);
153
static void SummarizeDbaseRecord(XLogReaderState *xlogreader,
154
                 BlockRefTable *brtab);
155
static void SummarizeSmgrRecord(XLogReaderState *xlogreader,
156
                BlockRefTable *brtab);
157
static void SummarizeXactRecord(XLogReaderState *xlogreader,
158
                BlockRefTable *brtab);
159
static bool SummarizeXlogRecord(XLogReaderState *xlogreader,
160
                bool *new_fast_forward);
161
static int  summarizer_read_local_xlog_page(XLogReaderState *state,
162
                      XLogRecPtr targetPagePtr,
163
                      int reqLen,
164
                      XLogRecPtr targetRecPtr,
165
                      char *cur_page);
166
static void summarizer_wait_for_wal(void);
167
static void MaybeRemoveOldWalSummaries(void);
168
169
/*
170
 * Amount of shared memory required for this module.
171
 */
172
Size
173
WalSummarizerShmemSize(void)
174
0
{
175
0
  return sizeof(WalSummarizerData);
176
0
}
177
178
/*
179
 * Create or attach to shared memory segment for this module.
180
 */
181
void
182
WalSummarizerShmemInit(void)
183
0
{
184
0
  bool    found;
185
186
0
  WalSummarizerCtl = (WalSummarizerData *)
187
0
    ShmemInitStruct("Wal Summarizer Ctl", WalSummarizerShmemSize(),
188
0
            &found);
189
190
0
  if (!found)
191
0
  {
192
    /*
193
     * First time through, so initialize.
194
     *
195
     * We're just filling in dummy values here -- the real initialization
196
     * will happen when GetOldestUnsummarizedLSN() is called for the first
197
     * time.
198
     */
199
0
    WalSummarizerCtl->initialized = false;
200
0
    WalSummarizerCtl->summarized_tli = 0;
201
0
    WalSummarizerCtl->summarized_lsn = InvalidXLogRecPtr;
202
0
    WalSummarizerCtl->lsn_is_exact = false;
203
0
    WalSummarizerCtl->summarizer_pgprocno = INVALID_PROC_NUMBER;
204
0
    WalSummarizerCtl->pending_lsn = InvalidXLogRecPtr;
205
0
    ConditionVariableInit(&WalSummarizerCtl->summary_file_cv);
206
0
  }
207
0
}
208
209
/*
210
 * Entry point for walsummarizer process.
211
 */
212
void
213
WalSummarizerMain(const void *startup_data, size_t startup_data_len)
214
0
{
215
0
  sigjmp_buf  local_sigjmp_buf;
216
0
  MemoryContext context;
217
218
  /*
219
   * Within this function, 'current_lsn' and 'current_tli' refer to the
220
   * point from which the next WAL summary file should start. 'exact' is
221
   * true if 'current_lsn' is known to be the start of a WAL record or WAL
222
   * segment, and false if it might be in the middle of a record someplace.
223
   *
224
   * 'switch_lsn' and 'switch_tli', if set, are the LSN at which we need to
225
   * switch to a new timeline and the timeline to which we need to switch.
226
   * If not set, we either haven't figured out the answers yet or we're
227
   * already on the latest timeline.
228
   */
229
0
  XLogRecPtr  current_lsn;
230
0
  TimeLineID  current_tli;
231
0
  bool    exact;
232
0
  XLogRecPtr  switch_lsn = InvalidXLogRecPtr;
233
0
  TimeLineID  switch_tli = 0;
234
235
0
  Assert(startup_data_len == 0);
236
237
0
  MyBackendType = B_WAL_SUMMARIZER;
238
0
  AuxiliaryProcessMainCommon();
239
240
0
  ereport(DEBUG1,
241
0
      (errmsg_internal("WAL summarizer started")));
242
243
  /*
244
   * Properly accept or ignore signals the postmaster might send us
245
   *
246
   * We have no particular use for SIGINT at the moment, but seems
247
   * reasonable to treat like SIGTERM.
248
   */
249
0
  pqsignal(SIGHUP, SignalHandlerForConfigReload);
250
0
  pqsignal(SIGINT, SignalHandlerForShutdownRequest);
251
0
  pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
252
  /* SIGQUIT handler was already set up by InitPostmasterChild */
253
0
  pqsignal(SIGALRM, SIG_IGN);
254
0
  pqsignal(SIGPIPE, SIG_IGN);
255
0
  pqsignal(SIGUSR1, procsignal_sigusr1_handler);
256
0
  pqsignal(SIGUSR2, SIG_IGN); /* not used */
257
258
  /* Advertise ourselves. */
259
0
  on_shmem_exit(WalSummarizerShutdown, (Datum) 0);
260
0
  LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
261
0
  WalSummarizerCtl->summarizer_pgprocno = MyProcNumber;
262
0
  LWLockRelease(WALSummarizerLock);
263
264
  /* Create and switch to a memory context that we can reset on error. */
265
0
  context = AllocSetContextCreate(TopMemoryContext,
266
0
                  "Wal Summarizer",
267
0
                  ALLOCSET_DEFAULT_SIZES);
268
0
  MemoryContextSwitchTo(context);
269
270
  /*
271
   * Reset some signals that are accepted by postmaster but not here
272
   */
273
0
  pqsignal(SIGCHLD, SIG_DFL);
274
275
  /*
276
   * If an exception is encountered, processing resumes here.
277
   */
278
0
  if (sigsetjmp(local_sigjmp_buf, 1) != 0)
279
0
  {
280
    /* Since not using PG_TRY, must reset error stack by hand */
281
0
    error_context_stack = NULL;
282
283
    /* Prevent interrupts while cleaning up */
284
0
    HOLD_INTERRUPTS();
285
286
    /* Report the error to the server log */
287
0
    EmitErrorReport();
288
289
    /* Release resources we might have acquired. */
290
0
    LWLockReleaseAll();
291
0
    ConditionVariableCancelSleep();
292
0
    pgstat_report_wait_end();
293
0
    pgaio_error_cleanup();
294
0
    ReleaseAuxProcessResources(false);
295
0
    AtEOXact_Files(false);
296
0
    AtEOXact_HashTables(false);
297
298
    /*
299
     * Now return to normal top-level context and clear ErrorContext for
300
     * next time.
301
     */
302
0
    MemoryContextSwitchTo(context);
303
0
    FlushErrorState();
304
305
    /* Flush any leaked data in the top-level context */
306
0
    MemoryContextReset(context);
307
308
    /* Now we can allow interrupts again */
309
0
    RESUME_INTERRUPTS();
310
311
    /*
312
     * Sleep for 10 seconds before attempting to resume operations in
313
     * order to avoid excessive logging.
314
     *
315
     * Many of the likely error conditions are things that will repeat
316
     * every time. For example, if the WAL can't be read or the summary
317
     * can't be written, only administrator action will cure the problem.
318
     * So a really fast retry time doesn't seem to be especially
319
     * beneficial, and it will clutter the logs.
320
     */
321
0
    (void) WaitLatch(NULL,
322
0
             WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
323
0
             10000,
324
0
             WAIT_EVENT_WAL_SUMMARIZER_ERROR);
325
0
  }
326
327
  /* We can now handle ereport(ERROR) */
328
0
  PG_exception_stack = &local_sigjmp_buf;
329
330
  /*
331
   * Unblock signals (they were blocked when the postmaster forked us)
332
   */
333
0
  sigprocmask(SIG_SETMASK, &UnBlockSig, NULL);
334
335
  /*
336
   * Fetch information about previous progress from shared memory, and ask
337
   * GetOldestUnsummarizedLSN to reset pending_lsn to summarized_lsn. We
338
   * might be recovering from an error, and if so, pending_lsn might have
339
   * advanced past summarized_lsn, but any WAL we read previously has been
340
   * lost and will need to be reread.
341
   *
342
   * If we discover that WAL summarization is not enabled, just exit.
343
   */
344
0
  current_lsn = GetOldestUnsummarizedLSN(&current_tli, &exact);
345
0
  if (XLogRecPtrIsInvalid(current_lsn))
346
0
    proc_exit(0);
347
348
  /*
349
   * Loop forever
350
   */
351
0
  for (;;)
352
0
  {
353
0
    XLogRecPtr  latest_lsn;
354
0
    TimeLineID  latest_tli;
355
0
    XLogRecPtr  end_of_summary_lsn;
356
357
    /* Flush any leaked data in the top-level context */
358
0
    MemoryContextReset(context);
359
360
    /* Process any signals received recently. */
361
0
    ProcessWalSummarizerInterrupts();
362
363
    /* If it's time to remove any old WAL summaries, do that now. */
364
0
    MaybeRemoveOldWalSummaries();
365
366
    /* Find the LSN and TLI up to which we can safely summarize. */
367
0
    latest_lsn = GetLatestLSN(&latest_tli);
368
369
    /*
370
     * If we're summarizing a historic timeline and we haven't yet
371
     * computed the point at which to switch to the next timeline, do that
372
     * now.
373
     *
374
     * Note that if this is a standby, what was previously the current
375
     * timeline could become historic at any time.
376
     *
377
     * We could try to make this more efficient by caching the results of
378
     * readTimeLineHistory when latest_tli has not changed, but since we
379
     * only have to do this once per timeline switch, we probably wouldn't
380
     * save any significant amount of work in practice.
381
     */
382
0
    if (current_tli != latest_tli && XLogRecPtrIsInvalid(switch_lsn))
383
0
    {
384
0
      List     *tles = readTimeLineHistory(latest_tli);
385
386
0
      switch_lsn = tliSwitchPoint(current_tli, tles, &switch_tli);
387
0
      ereport(DEBUG1,
388
0
          errmsg_internal("switch point from TLI %u to TLI %u is at %X/%X",
389
0
                  current_tli, switch_tli, LSN_FORMAT_ARGS(switch_lsn)));
390
0
    }
391
392
    /*
393
     * If we've reached the switch LSN, we can't summarize anything else
394
     * on this timeline. Switch to the next timeline and go around again,
395
     * backing up to the exact switch point if we passed it.
396
     */
397
0
    if (!XLogRecPtrIsInvalid(switch_lsn) && current_lsn >= switch_lsn)
398
0
    {
399
      /* Restart summarization from switch point. */
400
0
      current_tli = switch_tli;
401
0
      current_lsn = switch_lsn;
402
403
      /* Next timeline and switch point, if any, not yet known. */
404
0
      switch_lsn = InvalidXLogRecPtr;
405
0
      switch_tli = 0;
406
407
      /* Update (really, rewind, if needed) state in shared memory. */
408
0
      LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
409
0
      WalSummarizerCtl->summarized_lsn = current_lsn;
410
0
      WalSummarizerCtl->summarized_tli = current_tli;
411
0
      WalSummarizerCtl->lsn_is_exact = true;
412
0
      WalSummarizerCtl->pending_lsn = current_lsn;
413
0
      LWLockRelease(WALSummarizerLock);
414
415
0
      continue;
416
0
    }
417
418
    /* Summarize WAL. */
419
0
    end_of_summary_lsn = SummarizeWAL(current_tli,
420
0
                      current_lsn, exact,
421
0
                      switch_lsn, latest_lsn);
422
0
    Assert(!XLogRecPtrIsInvalid(end_of_summary_lsn));
423
0
    Assert(end_of_summary_lsn >= current_lsn);
424
425
    /*
426
     * Update state for next loop iteration.
427
     *
428
     * Next summary file should start from exactly where this one ended.
429
     */
430
0
    current_lsn = end_of_summary_lsn;
431
0
    exact = true;
432
433
    /* Update state in shared memory. */
434
0
    LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
435
0
    WalSummarizerCtl->summarized_lsn = end_of_summary_lsn;
436
0
    WalSummarizerCtl->summarized_tli = current_tli;
437
0
    WalSummarizerCtl->lsn_is_exact = true;
438
0
    WalSummarizerCtl->pending_lsn = end_of_summary_lsn;
439
0
    LWLockRelease(WALSummarizerLock);
440
441
    /* Wake up anyone waiting for more summary files to be written. */
442
0
    ConditionVariableBroadcast(&WalSummarizerCtl->summary_file_cv);
443
0
  }
444
0
}
445
446
/*
447
 * Get information about the state of the WAL summarizer.
448
 */
449
void
450
GetWalSummarizerState(TimeLineID *summarized_tli, XLogRecPtr *summarized_lsn,
451
            XLogRecPtr *pending_lsn, int *summarizer_pid)
452
0
{
453
0
  LWLockAcquire(WALSummarizerLock, LW_SHARED);
454
0
  if (!WalSummarizerCtl->initialized)
455
0
  {
456
    /*
457
     * If initialized is false, the rest of the structure contents are
458
     * undefined.
459
     */
460
0
    *summarized_tli = 0;
461
0
    *summarized_lsn = InvalidXLogRecPtr;
462
0
    *pending_lsn = InvalidXLogRecPtr;
463
0
    *summarizer_pid = -1;
464
0
  }
465
0
  else
466
0
  {
467
0
    int     summarizer_pgprocno = WalSummarizerCtl->summarizer_pgprocno;
468
469
0
    *summarized_tli = WalSummarizerCtl->summarized_tli;
470
0
    *summarized_lsn = WalSummarizerCtl->summarized_lsn;
471
0
    if (summarizer_pgprocno == INVALID_PROC_NUMBER)
472
0
    {
473
      /*
474
       * If the summarizer has exited, the fact that it had processed
475
       * beyond summarized_lsn is irrelevant now.
476
       */
477
0
      *pending_lsn = WalSummarizerCtl->summarized_lsn;
478
0
      *summarizer_pid = -1;
479
0
    }
480
0
    else
481
0
    {
482
0
      *pending_lsn = WalSummarizerCtl->pending_lsn;
483
484
      /*
485
       * We're not fussed about inexact answers here, since they could
486
       * become stale instantly, so we don't bother taking the lock, but
487
       * make sure that invalid PID values are normalized to -1.
488
       */
489
0
      *summarizer_pid = GetPGProcByNumber(summarizer_pgprocno)->pid;
490
0
      if (*summarizer_pid <= 0)
491
0
        *summarizer_pid = -1;
492
0
    }
493
0
  }
494
0
  LWLockRelease(WALSummarizerLock);
495
0
}
496
497
/*
498
 * Get the oldest LSN in this server's timeline history that has not yet been
499
 * summarized, and update shared memory state as appropriate.
500
 *
501
 * If tli != NULL, *tli will be set to the TLI for the LSN that is returned.
502
 *
503
 * If lsn_is_exact != NULL, *lsn_is_exact will be set to true if the returned LSN is
504
 * necessarily the start of a WAL record and false if it's just the beginning
505
 * of a WAL segment.
506
 */
507
XLogRecPtr
508
GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact)
509
0
{
510
0
  TimeLineID  latest_tli;
511
0
  int     n;
512
0
  List     *tles;
513
0
  XLogRecPtr  unsummarized_lsn = InvalidXLogRecPtr;
514
0
  TimeLineID  unsummarized_tli = 0;
515
0
  bool    should_make_exact = false;
516
0
  List     *existing_summaries;
517
0
  ListCell   *lc;
518
0
  bool    am_wal_summarizer = AmWalSummarizerProcess();
519
520
  /* If not summarizing WAL, do nothing. */
521
0
  if (!summarize_wal)
522
0
    return InvalidXLogRecPtr;
523
524
  /*
525
   * If we are not the WAL summarizer process, then we normally just want to
526
   * read the values from shared memory. However, as an exception, if shared
527
   * memory hasn't been initialized yet, then we need to do that so that we
528
   * can read legal values and not remove any WAL too early.
529
   */
530
0
  if (!am_wal_summarizer)
531
0
  {
532
0
    LWLockAcquire(WALSummarizerLock, LW_SHARED);
533
534
0
    if (WalSummarizerCtl->initialized)
535
0
    {
536
0
      unsummarized_lsn = WalSummarizerCtl->summarized_lsn;
537
0
      if (tli != NULL)
538
0
        *tli = WalSummarizerCtl->summarized_tli;
539
0
      if (lsn_is_exact != NULL)
540
0
        *lsn_is_exact = WalSummarizerCtl->lsn_is_exact;
541
0
      LWLockRelease(WALSummarizerLock);
542
0
      return unsummarized_lsn;
543
0
    }
544
545
0
    LWLockRelease(WALSummarizerLock);
546
0
  }
547
548
  /*
549
   * Find the oldest timeline on which WAL still exists, and the earliest
550
   * segment for which it exists.
551
   *
552
   * Note that we do this every time the WAL summarizer process restarts or
553
   * recovers from an error, in case the contents of pg_wal have changed
554
   * under us e.g. if some files were removed, either manually - which
555
   * shouldn't really happen, but might - or by postgres itself, if
556
   * summarize_wal was turned off and then back on again.
557
   */
558
0
  (void) GetLatestLSN(&latest_tli);
559
0
  tles = readTimeLineHistory(latest_tli);
560
0
  for (n = list_length(tles) - 1; n >= 0; --n)
561
0
  {
562
0
    TimeLineHistoryEntry *tle = list_nth(tles, n);
563
0
    XLogSegNo oldest_segno;
564
565
0
    oldest_segno = XLogGetOldestSegno(tle->tli);
566
0
    if (oldest_segno != 0)
567
0
    {
568
      /* Compute oldest LSN that still exists on disk. */
569
0
      XLogSegNoOffsetToRecPtr(oldest_segno, 0, wal_segment_size,
570
0
                  unsummarized_lsn);
571
572
0
      unsummarized_tli = tle->tli;
573
0
      break;
574
0
    }
575
0
  }
576
577
  /*
578
   * Don't try to summarize anything older than the end LSN of the newest
579
   * summary file that exists for this timeline.
580
   */
581
0
  existing_summaries =
582
0
    GetWalSummaries(unsummarized_tli,
583
0
            InvalidXLogRecPtr, InvalidXLogRecPtr);
584
0
  foreach(lc, existing_summaries)
585
0
  {
586
0
    WalSummaryFile *ws = lfirst(lc);
587
588
0
    if (ws->end_lsn > unsummarized_lsn)
589
0
    {
590
0
      unsummarized_lsn = ws->end_lsn;
591
0
      should_make_exact = true;
592
0
    }
593
0
  }
594
595
  /* It really should not be possible for us to find no WAL. */
596
0
  if (unsummarized_tli == 0)
597
0
    ereport(ERROR,
598
0
        errcode(ERRCODE_INTERNAL_ERROR),
599
0
        errmsg_internal("no WAL found on timeline %u", latest_tli));
600
601
  /*
602
   * If we're the WAL summarizer, we always want to store the values we just
603
   * computed into shared memory, because those are the values we're going
604
   * to use to drive our operation, and so they are the authoritative
605
   * values. Otherwise, we only store values into shared memory if shared
606
   * memory is uninitialized. Our values are not canonical in such a case,
607
   * but it's better to have something than nothing, to guide WAL retention.
608
   */
609
0
  LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
610
0
  if (am_wal_summarizer || !WalSummarizerCtl->initialized)
611
0
  {
612
0
    WalSummarizerCtl->initialized = true;
613
0
    WalSummarizerCtl->summarized_lsn = unsummarized_lsn;
614
0
    WalSummarizerCtl->summarized_tli = unsummarized_tli;
615
0
    WalSummarizerCtl->lsn_is_exact = should_make_exact;
616
0
    WalSummarizerCtl->pending_lsn = unsummarized_lsn;
617
0
  }
618
0
  else
619
0
    unsummarized_lsn = WalSummarizerCtl->summarized_lsn;
620
621
  /* Also return the values to the caller as required. */
622
0
  if (tli != NULL)
623
0
    *tli = WalSummarizerCtl->summarized_tli;
624
0
  if (lsn_is_exact != NULL)
625
0
    *lsn_is_exact = WalSummarizerCtl->lsn_is_exact;
626
0
  LWLockRelease(WALSummarizerLock);
627
628
0
  return unsummarized_lsn;
629
0
}
630
631
/*
632
 * Wake up the WAL summarizer process.
633
 *
634
 * This might not work, because there's no guarantee that the WAL summarizer
635
 * process was successfully started, and it also might have started but
636
 * subsequently terminated. So, under normal circumstances, this will get the
637
 * latch set, but there's no guarantee.
638
 */
639
void
640
WakeupWalSummarizer(void)
641
0
{
642
0
  ProcNumber  pgprocno;
643
644
0
  if (WalSummarizerCtl == NULL)
645
0
    return;
646
647
0
  LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
648
0
  pgprocno = WalSummarizerCtl->summarizer_pgprocno;
649
0
  LWLockRelease(WALSummarizerLock);
650
651
0
  if (pgprocno != INVALID_PROC_NUMBER)
652
0
    SetLatch(&ProcGlobal->allProcs[pgprocno].procLatch);
653
0
}
654
655
/*
656
 * Wait until WAL summarization reaches the given LSN, but time out with an
657
 * error if the summarizer seems to be stuck.
658
 *
659
 * Returns immediately if summarize_wal is turned off while we wait. Caller
660
 * is expected to handle this case, if necessary.
661
 */
662
void
663
WaitForWalSummarization(XLogRecPtr lsn)
664
0
{
665
0
  TimestampTz initial_time,
666
0
        cycle_time,
667
0
        current_time;
668
0
  XLogRecPtr  prior_pending_lsn = InvalidXLogRecPtr;
669
0
  int     deadcycles = 0;
670
671
0
  initial_time = cycle_time = GetCurrentTimestamp();
672
673
0
  while (1)
674
0
  {
675
0
    long    timeout_in_ms = 10000;
676
0
    XLogRecPtr  summarized_lsn;
677
0
    XLogRecPtr  pending_lsn;
678
679
0
    CHECK_FOR_INTERRUPTS();
680
681
    /* If WAL summarization is disabled while we're waiting, give up. */
682
0
    if (!summarize_wal)
683
0
      return;
684
685
    /*
686
     * If the LSN summarized on disk has reached the target value, stop.
687
     */
688
0
    LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
689
0
    summarized_lsn = WalSummarizerCtl->summarized_lsn;
690
0
    pending_lsn = WalSummarizerCtl->pending_lsn;
691
0
    LWLockRelease(WALSummarizerLock);
692
693
    /* If WAL summarization has progressed sufficiently, stop waiting. */
694
0
    if (summarized_lsn >= lsn)
695
0
      break;
696
697
    /* Recheck current time. */
698
0
    current_time = GetCurrentTimestamp();
699
700
    /* Have we finished the current cycle of waiting? */
701
0
    if (TimestampDifferenceMilliseconds(cycle_time,
702
0
                      current_time) >= timeout_in_ms)
703
0
    {
704
0
      long    elapsed_seconds;
705
706
      /* Begin new wait cycle. */
707
0
      cycle_time = TimestampTzPlusMilliseconds(cycle_time,
708
0
                           timeout_in_ms);
709
710
      /*
711
       * Keep track of the number of cycles during which there has been
712
       * no progression of pending_lsn. If pending_lsn is not advancing,
713
       * that means that not only are no new files appearing on disk,
714
       * but we're not even incorporating new records into the in-memory
715
       * state.
716
       */
717
0
      if (pending_lsn > prior_pending_lsn)
718
0
      {
719
0
        prior_pending_lsn = pending_lsn;
720
0
        deadcycles = 0;
721
0
      }
722
0
      else
723
0
        ++deadcycles;
724
725
      /*
726
       * If we've managed to wait for an entire minute without the WAL
727
       * summarizer absorbing a single WAL record, error out; probably
728
       * something is wrong.
729
       *
730
       * We could consider also erroring out if the summarizer is taking
731
       * too long to catch up, but it's not clear what rate of progress
732
       * would be acceptable and what would be too slow. So instead, we
733
       * just try to error out in the case where there's no progress at
734
       * all. That seems likely to catch a reasonable number of the
735
       * things that can go wrong in practice (e.g. the summarizer
736
       * process is completely hung, say because somebody hooked up a
737
       * debugger to it or something) without giving up too quickly when
738
       * the system is just slow.
739
       */
740
0
      if (deadcycles >= 6)
741
0
        ereport(ERROR,
742
0
            (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
743
0
             errmsg("WAL summarization is not progressing"),
744
0
             errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.",
745
0
                   LSN_FORMAT_ARGS(lsn),
746
0
                   LSN_FORMAT_ARGS(summarized_lsn),
747
0
                   LSN_FORMAT_ARGS(pending_lsn))));
748
749
750
      /*
751
       * Otherwise, just let the user know what's happening.
752
       */
753
0
      elapsed_seconds =
754
0
        TimestampDifferenceMilliseconds(initial_time,
755
0
                        current_time) / 1000;
756
0
      ereport(WARNING,
757
0
          (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
758
0
           errmsg_plural("still waiting for WAL summarization through %X/%X after %ld second",
759
0
                   "still waiting for WAL summarization through %X/%X after %ld seconds",
760
0
                   elapsed_seconds,
761
0
                   LSN_FORMAT_ARGS(lsn),
762
0
                   elapsed_seconds),
763
0
           errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.",
764
0
                 LSN_FORMAT_ARGS(summarized_lsn),
765
0
                 LSN_FORMAT_ARGS(pending_lsn))));
766
0
    }
767
768
    /*
769
     * Align the wait time to prevent drift. This doesn't really matter,
770
     * but we'd like the warnings about how long we've been waiting to say
771
     * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever
772
     * drifting to something that is not a multiple of ten.
773
     */
774
0
    timeout_in_ms -=
775
0
      TimestampDifferenceMilliseconds(cycle_time, current_time);
776
777
    /* Wait and see. */
778
0
    ConditionVariableTimedSleep(&WalSummarizerCtl->summary_file_cv,
779
0
                  timeout_in_ms,
780
0
                  WAIT_EVENT_WAL_SUMMARY_READY);
781
0
  }
782
783
0
  ConditionVariableCancelSleep();
784
0
}
785
786
/*
787
 * On exit, update shared memory to make it clear that we're no longer
788
 * running.
789
 */
790
static void
791
WalSummarizerShutdown(int code, Datum arg)
792
0
{
793
0
  LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
794
0
  WalSummarizerCtl->summarizer_pgprocno = INVALID_PROC_NUMBER;
795
0
  LWLockRelease(WALSummarizerLock);
796
0
}
797
798
/*
799
 * Get the latest LSN that is eligible to be summarized, and set *tli to the
800
 * corresponding timeline.
801
 */
802
static XLogRecPtr
803
GetLatestLSN(TimeLineID *tli)
804
0
{
805
0
  if (!RecoveryInProgress())
806
0
  {
807
    /* Don't summarize WAL before it's flushed. */
808
0
    return GetFlushRecPtr(tli);
809
0
  }
810
0
  else
811
0
  {
812
0
    XLogRecPtr  flush_lsn;
813
0
    TimeLineID  flush_tli;
814
0
    XLogRecPtr  replay_lsn;
815
0
    TimeLineID  replay_tli;
816
0
    TimeLineID  insert_tli;
817
818
    /*
819
     * After the insert TLI has been set and before the control file has
820
     * been updated to show the DB in production, RecoveryInProgress()
821
     * will return true, because it's not yet safe for all backends to
822
     * begin writing WAL. However, replay has already ceased, so from our
823
     * point of view, recovery is already over. We should summarize up to
824
     * where replay stopped and then prepare to resume at the start of the
825
     * insert timeline.
826
     */
827
0
    if ((insert_tli = GetWALInsertionTimeLineIfSet()) != 0)
828
0
    {
829
0
      *tli = insert_tli;
830
0
      return GetXLogReplayRecPtr(NULL);
831
0
    }
832
833
    /*
834
     * What we really want to know is how much WAL has been flushed to
835
     * disk, but the only flush position available is the one provided by
836
     * the walreceiver, which may not be running, because this could be
837
     * crash recovery or recovery via restore_command. So use either the
838
     * WAL receiver's flush position or the replay position, whichever is
839
     * further ahead, on the theory that if the WAL has been replayed then
840
     * it must also have been flushed to disk.
841
     */
842
0
    flush_lsn = GetWalRcvFlushRecPtr(NULL, &flush_tli);
843
0
    replay_lsn = GetXLogReplayRecPtr(&replay_tli);
844
0
    if (flush_lsn > replay_lsn)
845
0
    {
846
0
      *tli = flush_tli;
847
0
      return flush_lsn;
848
0
    }
849
0
    else
850
0
    {
851
0
      *tli = replay_tli;
852
0
      return replay_lsn;
853
0
    }
854
0
  }
855
0
}
856
857
/*
858
 * Interrupt handler for main loop of WAL summarizer process.
859
 */
860
static void
861
ProcessWalSummarizerInterrupts(void)
862
0
{
863
0
  if (ProcSignalBarrierPending)
864
0
    ProcessProcSignalBarrier();
865
866
0
  if (ConfigReloadPending)
867
0
  {
868
0
    ConfigReloadPending = false;
869
0
    ProcessConfigFile(PGC_SIGHUP);
870
0
  }
871
872
0
  if (ShutdownRequestPending || !summarize_wal)
873
0
  {
874
0
    ereport(DEBUG1,
875
0
        errmsg_internal("WAL summarizer shutting down"));
876
0
    proc_exit(0);
877
0
  }
878
879
  /* Perform logging of memory contexts of this process */
880
0
  if (LogMemoryContextPending)
881
0
    ProcessLogMemoryContextInterrupt();
882
0
}
883
884
/*
885
 * Summarize a range of WAL records on a single timeline.
886
 *
887
 * 'tli' is the timeline to be summarized.
888
 *
889
 * 'start_lsn' is the point at which we should start summarizing. If this
890
 * value comes from the end LSN of the previous record as returned by the
891
 * xlogreader machinery, 'exact' should be true; otherwise, 'exact' should
892
 * be false, and this function will search forward for the start of a valid
893
 * WAL record.
894
 *
895
 * 'switch_lsn' is the point at which we should switch to a later timeline,
896
 * if we're summarizing a historic timeline.
897
 *
898
 * 'maximum_lsn' identifies the point beyond which we can't count on being
899
 * able to read any more WAL. It should be the switch point when reading a
900
 * historic timeline, or the most-recently-measured end of WAL when reading
901
 * the current timeline.
902
 *
903
 * The return value is the LSN at which the WAL summary actually ends. Most
904
 * often, a summary file ends because we notice that a checkpoint has
905
 * occurred and reach the redo pointer of that checkpoint, but sometimes
906
 * we stop for other reasons, such as a timeline switch.
907
 */
908
static XLogRecPtr
909
SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
910
       XLogRecPtr switch_lsn, XLogRecPtr maximum_lsn)
911
0
{
912
0
  SummarizerReadLocalXLogPrivate *private_data;
913
0
  XLogReaderState *xlogreader;
914
0
  XLogRecPtr  summary_start_lsn;
915
0
  XLogRecPtr  summary_end_lsn = switch_lsn;
916
0
  char    temp_path[MAXPGPATH];
917
0
  char    final_path[MAXPGPATH];
918
0
  WalSummaryIO io;
919
0
  BlockRefTable *brtab = CreateEmptyBlockRefTable();
920
0
  bool    fast_forward = true;
921
922
  /* Initialize private data for xlogreader. */
923
0
  private_data = (SummarizerReadLocalXLogPrivate *)
924
0
    palloc0(sizeof(SummarizerReadLocalXLogPrivate));
925
0
  private_data->tli = tli;
926
0
  private_data->historic = !XLogRecPtrIsInvalid(switch_lsn);
927
0
  private_data->read_upto = maximum_lsn;
928
929
  /* Create xlogreader. */
930
0
  xlogreader = XLogReaderAllocate(wal_segment_size, NULL,
931
0
                  XL_ROUTINE(.page_read = &summarizer_read_local_xlog_page,
932
0
                         .segment_open = &wal_segment_open,
933
0
                         .segment_close = &wal_segment_close),
934
0
                  private_data);
935
0
  if (xlogreader == NULL)
936
0
    ereport(ERROR,
937
0
        (errcode(ERRCODE_OUT_OF_MEMORY),
938
0
         errmsg("out of memory"),
939
0
         errdetail("Failed while allocating a WAL reading processor.")));
940
941
  /*
942
   * When exact = false, we're starting from an arbitrary point in the WAL
943
   * and must search forward for the start of the next record.
944
   *
945
   * When exact = true, start_lsn should be either the LSN where a record
946
   * begins, or the LSN of a page where the page header is immediately
947
   * followed by the start of a new record. XLogBeginRead should tolerate
948
   * either case.
949
   *
950
   * We need to allow for both cases because the behavior of xlogreader
951
   * varies. When a record spans two or more xlog pages, the ending LSN
952
   * reported by xlogreader will be the starting LSN of the following
953
   * record, but when an xlog page boundary falls between two records, the
954
   * end LSN for the first will be reported as the first byte of the
955
   * following page. We can't know until we read that page how large the
956
   * header will be, but we'll have to skip over it to find the next record.
957
   */
958
0
  if (exact)
959
0
  {
960
    /*
961
     * Even if start_lsn is the beginning of a page rather than the
962
     * beginning of the first record on that page, we should still use it
963
     * as the start LSN for the summary file. That's because we detect
964
     * missing summary files by looking for cases where the end LSN of one
965
     * file is less than the start LSN of the next file. When only a page
966
     * header is skipped, nothing has been missed.
967
     */
968
0
    XLogBeginRead(xlogreader, start_lsn);
969
0
    summary_start_lsn = start_lsn;
970
0
  }
971
0
  else
972
0
  {
973
0
    summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn);
974
0
    if (XLogRecPtrIsInvalid(summary_start_lsn))
975
0
    {
976
      /*
977
       * If we hit end-of-WAL while trying to find the next valid
978
       * record, we must be on a historic timeline that has no valid
979
       * records that begin after start_lsn and before end of WAL.
980
       */
981
0
      if (private_data->end_of_wal)
982
0
      {
983
0
        ereport(DEBUG1,
984
0
            errmsg_internal("could not read WAL from timeline %u at %X/%X: end of WAL at %X/%X",
985
0
                    tli,
986
0
                    LSN_FORMAT_ARGS(start_lsn),
987
0
                    LSN_FORMAT_ARGS(private_data->read_upto)));
988
989
        /*
990
         * The timeline ends at or after start_lsn, without containing
991
         * any records. Thus, we must make sure the main loop does not
992
         * iterate. If start_lsn is the end of the timeline, then we
993
         * won't actually emit an empty summary file, but otherwise,
994
         * we must, to capture the fact that the LSN range in question
995
         * contains no interesting WAL records.
996
         */
997
0
        summary_start_lsn = start_lsn;
998
0
        summary_end_lsn = private_data->read_upto;
999
0
        switch_lsn = xlogreader->EndRecPtr;
1000
0
      }
1001
0
      else
1002
0
        ereport(ERROR,
1003
0
            (errmsg("could not find a valid record after %X/%X",
1004
0
                LSN_FORMAT_ARGS(start_lsn))));
1005
0
    }
1006
1007
    /* We shouldn't go backward. */
1008
0
    Assert(summary_start_lsn >= start_lsn);
1009
0
  }
1010
1011
  /*
1012
   * Main loop: read xlog records one by one.
1013
   */
1014
0
  while (1)
1015
0
  {
1016
0
    int     block_id;
1017
0
    char     *errormsg;
1018
0
    XLogRecord *record;
1019
0
    uint8   rmid;
1020
1021
0
    ProcessWalSummarizerInterrupts();
1022
1023
    /* We shouldn't go backward. */
1024
0
    Assert(summary_start_lsn <= xlogreader->EndRecPtr);
1025
1026
    /* Now read the next record. */
1027
0
    record = XLogReadRecord(xlogreader, &errormsg);
1028
0
    if (record == NULL)
1029
0
    {
1030
0
      if (private_data->end_of_wal)
1031
0
      {
1032
        /*
1033
         * This timeline must be historic and must end before we were
1034
         * able to read a complete record.
1035
         */
1036
0
        ereport(DEBUG1,
1037
0
            errmsg_internal("could not read WAL from timeline %u at %X/%X: end of WAL at %X/%X",
1038
0
                    tli,
1039
0
                    LSN_FORMAT_ARGS(xlogreader->EndRecPtr),
1040
0
                    LSN_FORMAT_ARGS(private_data->read_upto)));
1041
        /* Summary ends at end of WAL. */
1042
0
        summary_end_lsn = private_data->read_upto;
1043
0
        break;
1044
0
      }
1045
0
      if (errormsg)
1046
0
        ereport(ERROR,
1047
0
            (errcode_for_file_access(),
1048
0
             errmsg("could not read WAL from timeline %u at %X/%X: %s",
1049
0
                tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr),
1050
0
                errormsg)));
1051
0
      else
1052
0
        ereport(ERROR,
1053
0
            (errcode_for_file_access(),
1054
0
             errmsg("could not read WAL from timeline %u at %X/%X",
1055
0
                tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr))));
1056
0
    }
1057
1058
    /* We shouldn't go backward. */
1059
0
    Assert(summary_start_lsn <= xlogreader->EndRecPtr);
1060
1061
0
    if (!XLogRecPtrIsInvalid(switch_lsn) &&
1062
0
      xlogreader->ReadRecPtr >= switch_lsn)
1063
0
    {
1064
      /*
1065
       * Whoops! We've read a record that *starts* after the switch LSN,
1066
       * contrary to our goal of reading only until we hit the first
1067
       * record that ends at or after the switch LSN. Pretend we didn't
1068
       * read it after all by bailing out of this loop right here,
1069
       * before we do anything with this record.
1070
       *
1071
       * This can happen because the last record before the switch LSN
1072
       * might be continued across multiple pages, and then we might
1073
       * come to a page with XLP_FIRST_IS_OVERWRITE_CONTRECORD set. In
1074
       * that case, the record that was continued across multiple pages
1075
       * is incomplete and will be disregarded, and the read will
1076
       * restart from the beginning of the page that is flagged
1077
       * XLP_FIRST_IS_OVERWRITE_CONTRECORD.
1078
       *
1079
       * If this case occurs, we can fairly say that the current summary
1080
       * file ends at the switch LSN exactly. The first record on the
1081
       * page marked XLP_FIRST_IS_OVERWRITE_CONTRECORD will be
1082
       * discovered when generating the next summary file.
1083
       */
1084
0
      summary_end_lsn = switch_lsn;
1085
0
      break;
1086
0
    }
1087
1088
    /*
1089
     * Certain types of records require special handling. Redo points and
1090
     * shutdown checkpoints trigger creation of new summary files and can
1091
     * also cause us to enter or exit "fast forward" mode. Other types of
1092
     * records can require special updates to the block reference table.
1093
     */
1094
0
    rmid = XLogRecGetRmid(xlogreader);
1095
0
    if (rmid == RM_XLOG_ID)
1096
0
    {
1097
0
      bool    new_fast_forward;
1098
1099
      /*
1100
       * If we've already processed some WAL records when we hit a redo
1101
       * point or shutdown checkpoint, then we stop summarization before
1102
       * including this record in the current file, so that it will be
1103
       * the first record in the next file.
1104
       *
1105
       * When we hit one of those record types as the first record in a
1106
       * file, we adjust our notion of whether we're fast-forwarding.
1107
       * Any WAL generated with wal_level=minimal must be skipped
1108
       * without actually generating any summary file, because an
1109
       * incremental backup that crosses such WAL would be unsafe.
1110
       */
1111
0
      if (SummarizeXlogRecord(xlogreader, &new_fast_forward))
1112
0
      {
1113
0
        if (xlogreader->ReadRecPtr > summary_start_lsn)
1114
0
        {
1115
0
          summary_end_lsn = xlogreader->ReadRecPtr;
1116
0
          break;
1117
0
        }
1118
0
        else
1119
0
          fast_forward = new_fast_forward;
1120
0
      }
1121
0
    }
1122
0
    else if (!fast_forward)
1123
0
    {
1124
      /*
1125
       * This switch handles record types that require extra updates to
1126
       * the contents of the block reference table.
1127
       */
1128
0
      switch (rmid)
1129
0
      {
1130
0
        case RM_DBASE_ID:
1131
0
          SummarizeDbaseRecord(xlogreader, brtab);
1132
0
          break;
1133
0
        case RM_SMGR_ID:
1134
0
          SummarizeSmgrRecord(xlogreader, brtab);
1135
0
          break;
1136
0
        case RM_XACT_ID:
1137
0
          SummarizeXactRecord(xlogreader, brtab);
1138
0
          break;
1139
0
      }
1140
0
    }
1141
1142
    /*
1143
     * If we're in fast-forward mode, we don't really need to do anything.
1144
     * Otherwise, feed block references from xlog record to block
1145
     * reference table.
1146
     */
1147
0
    if (!fast_forward)
1148
0
    {
1149
0
      for (block_id = 0; block_id <= XLogRecMaxBlockId(xlogreader);
1150
0
         block_id++)
1151
0
      {
1152
0
        RelFileLocator rlocator;
1153
0
        ForkNumber  forknum;
1154
0
        BlockNumber blocknum;
1155
1156
0
        if (!XLogRecGetBlockTagExtended(xlogreader, block_id, &rlocator,
1157
0
                        &forknum, &blocknum, NULL))
1158
0
          continue;
1159
1160
        /*
1161
         * As we do elsewhere, ignore the FSM fork, because it's not
1162
         * fully WAL-logged.
1163
         */
1164
0
        if (forknum != FSM_FORKNUM)
1165
0
          BlockRefTableMarkBlockModified(brtab, &rlocator, forknum,
1166
0
                           blocknum);
1167
0
      }
1168
0
    }
1169
1170
    /* Update our notion of where this summary file ends. */
1171
0
    summary_end_lsn = xlogreader->EndRecPtr;
1172
1173
    /* Also update shared memory. */
1174
0
    LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE);
1175
0
    Assert(summary_end_lsn >= WalSummarizerCtl->summarized_lsn);
1176
0
    WalSummarizerCtl->pending_lsn = summary_end_lsn;
1177
0
    LWLockRelease(WALSummarizerLock);
1178
1179
    /*
1180
     * If we have a switch LSN and have reached it, stop before reading
1181
     * the next record.
1182
     */
1183
0
    if (!XLogRecPtrIsInvalid(switch_lsn) &&
1184
0
      xlogreader->EndRecPtr >= switch_lsn)
1185
0
      break;
1186
0
  }
1187
1188
  /* Destroy xlogreader. */
1189
0
  pfree(xlogreader->private_data);
1190
0
  XLogReaderFree(xlogreader);
1191
1192
  /*
1193
   * If a timeline switch occurs, we may fail to make any progress at all
1194
   * before exiting the loop above. If that happens, we don't write a WAL
1195
   * summary file at all. We can also skip writing a file if we're in
1196
   * fast-forward mode.
1197
   */
1198
0
  if (summary_end_lsn > summary_start_lsn && !fast_forward)
1199
0
  {
1200
    /* Generate temporary and final path name. */
1201
0
    snprintf(temp_path, MAXPGPATH,
1202
0
         XLOGDIR "/summaries/temp.summary");
1203
0
    snprintf(final_path, MAXPGPATH,
1204
0
         XLOGDIR "/summaries/%08X%08X%08X%08X%08X.summary",
1205
0
         tli,
1206
0
         LSN_FORMAT_ARGS(summary_start_lsn),
1207
0
         LSN_FORMAT_ARGS(summary_end_lsn));
1208
1209
    /* Open the temporary file for writing. */
1210
0
    io.filepos = 0;
1211
0
    io.file = PathNameOpenFile(temp_path, O_WRONLY | O_CREAT | O_TRUNC);
1212
0
    if (io.file < 0)
1213
0
      ereport(ERROR,
1214
0
          (errcode_for_file_access(),
1215
0
           errmsg("could not create file \"%s\": %m", temp_path)));
1216
1217
    /* Write the data. */
1218
0
    WriteBlockRefTable(brtab, WriteWalSummary, &io);
1219
1220
    /* Close temporary file and shut down xlogreader. */
1221
0
    FileClose(io.file);
1222
1223
    /* Tell the user what we did. */
1224
0
    ereport(DEBUG1,
1225
0
        errmsg_internal("summarized WAL on TLI %u from %X/%X to %X/%X",
1226
0
                tli,
1227
0
                LSN_FORMAT_ARGS(summary_start_lsn),
1228
0
                LSN_FORMAT_ARGS(summary_end_lsn)));
1229
1230
    /* Durably rename the new summary into place. */
1231
0
    durable_rename(temp_path, final_path, ERROR);
1232
0
  }
1233
1234
  /* If we skipped a non-zero amount of WAL, log a debug message. */
1235
0
  if (summary_end_lsn > summary_start_lsn && fast_forward)
1236
0
    ereport(DEBUG1,
1237
0
        errmsg_internal("skipped summarizing WAL on TLI %u from %X/%X to %X/%X",
1238
0
                tli,
1239
0
                LSN_FORMAT_ARGS(summary_start_lsn),
1240
0
                LSN_FORMAT_ARGS(summary_end_lsn)));
1241
1242
0
  return summary_end_lsn;
1243
0
}
1244
1245
/*
1246
 * Special handling for WAL records with RM_DBASE_ID.
1247
 */
1248
static void
1249
SummarizeDbaseRecord(XLogReaderState *xlogreader, BlockRefTable *brtab)
1250
0
{
1251
0
  uint8   info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1252
1253
  /*
1254
   * We use relfilenode zero for a given database OID and tablespace OID to
1255
   * indicate that all relations with that pair of IDs have been recreated
1256
   * if they exist at all. Effectively, we're setting a limit block of 0 for
1257
   * all such relfilenodes.
1258
   *
1259
   * Technically, this special handling is only needed in the case of
1260
   * XLOG_DBASE_CREATE_FILE_COPY, because that can create a whole bunch of
1261
   * relation files in a directory without logging anything specific to each
1262
   * one. If we didn't mark the whole DB OID/TS OID combination in some way,
1263
   * then a tablespace that was dropped after the reference backup and
1264
   * recreated using the FILE_COPY method prior to the incremental backup
1265
   * would look just like one that was never touched at all, which would be
1266
   * catastrophic.
1267
   *
1268
   * But it seems best to adopt this treatment for all records that drop or
1269
   * create a DB OID/TS OID combination. That's similar to how we treat the
1270
   * limit block for individual relations, and it's an extra layer of safety
1271
   * here. We can never lose data by marking more stuff as needing to be
1272
   * backed up in full.
1273
   */
1274
0
  if (info == XLOG_DBASE_CREATE_FILE_COPY)
1275
0
  {
1276
0
    xl_dbase_create_file_copy_rec *xlrec;
1277
0
    RelFileLocator rlocator;
1278
1279
0
    xlrec =
1280
0
      (xl_dbase_create_file_copy_rec *) XLogRecGetData(xlogreader);
1281
0
    rlocator.spcOid = xlrec->tablespace_id;
1282
0
    rlocator.dbOid = xlrec->db_id;
1283
0
    rlocator.relNumber = 0;
1284
0
    BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0);
1285
0
  }
1286
0
  else if (info == XLOG_DBASE_CREATE_WAL_LOG)
1287
0
  {
1288
0
    xl_dbase_create_wal_log_rec *xlrec;
1289
0
    RelFileLocator rlocator;
1290
1291
0
    xlrec = (xl_dbase_create_wal_log_rec *) XLogRecGetData(xlogreader);
1292
0
    rlocator.spcOid = xlrec->tablespace_id;
1293
0
    rlocator.dbOid = xlrec->db_id;
1294
0
    rlocator.relNumber = 0;
1295
0
    BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0);
1296
0
  }
1297
0
  else if (info == XLOG_DBASE_DROP)
1298
0
  {
1299
0
    xl_dbase_drop_rec *xlrec;
1300
0
    RelFileLocator rlocator;
1301
0
    int     i;
1302
1303
0
    xlrec = (xl_dbase_drop_rec *) XLogRecGetData(xlogreader);
1304
0
    rlocator.dbOid = xlrec->db_id;
1305
0
    rlocator.relNumber = 0;
1306
0
    for (i = 0; i < xlrec->ntablespaces; ++i)
1307
0
    {
1308
0
      rlocator.spcOid = xlrec->tablespace_ids[i];
1309
0
      BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0);
1310
0
    }
1311
0
  }
1312
0
}
1313
1314
/*
1315
 * Special handling for WAL records with RM_SMGR_ID.
1316
 */
1317
static void
1318
SummarizeSmgrRecord(XLogReaderState *xlogreader, BlockRefTable *brtab)
1319
0
{
1320
0
  uint8   info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1321
1322
0
  if (info == XLOG_SMGR_CREATE)
1323
0
  {
1324
0
    xl_smgr_create *xlrec;
1325
1326
    /*
1327
     * If a new relation fork is created on disk, there is no point
1328
     * tracking anything about which blocks have been modified, because
1329
     * the whole thing will be new. Hence, set the limit block for this
1330
     * fork to 0.
1331
     *
1332
     * Ignore the FSM fork, which is not fully WAL-logged.
1333
     */
1334
0
    xlrec = (xl_smgr_create *) XLogRecGetData(xlogreader);
1335
1336
0
    if (xlrec->forkNum != FSM_FORKNUM)
1337
0
      BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator,
1338
0
                     xlrec->forkNum, 0);
1339
0
  }
1340
0
  else if (info == XLOG_SMGR_TRUNCATE)
1341
0
  {
1342
0
    xl_smgr_truncate *xlrec;
1343
1344
0
    xlrec = (xl_smgr_truncate *) XLogRecGetData(xlogreader);
1345
1346
    /*
1347
     * If a relation fork is truncated on disk, there is no point in
1348
     * tracking anything about block modifications beyond the truncation
1349
     * point.
1350
     *
1351
     * We ignore SMGR_TRUNCATE_FSM here because the FSM isn't fully
1352
     * WAL-logged and thus we can't track modified blocks for it anyway.
1353
     */
1354
0
    if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0)
1355
0
      BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator,
1356
0
                     MAIN_FORKNUM, xlrec->blkno);
1357
0
    if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0)
1358
0
      BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator,
1359
0
                     VISIBILITYMAP_FORKNUM, xlrec->blkno);
1360
0
  }
1361
0
}
1362
1363
/*
1364
 * Special handling for WAL records with RM_XACT_ID.
1365
 */
1366
static void
1367
SummarizeXactRecord(XLogReaderState *xlogreader, BlockRefTable *brtab)
1368
0
{
1369
0
  uint8   info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1370
0
  uint8   xact_info = info & XLOG_XACT_OPMASK;
1371
1372
0
  if (xact_info == XLOG_XACT_COMMIT ||
1373
0
    xact_info == XLOG_XACT_COMMIT_PREPARED)
1374
0
  {
1375
0
    xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(xlogreader);
1376
0
    xl_xact_parsed_commit parsed;
1377
0
    int     i;
1378
1379
    /*
1380
     * Don't track modified blocks for any relations that were removed on
1381
     * commit.
1382
     */
1383
0
    ParseCommitRecord(XLogRecGetInfo(xlogreader), xlrec, &parsed);
1384
0
    for (i = 0; i < parsed.nrels; ++i)
1385
0
    {
1386
0
      ForkNumber  forknum;
1387
1388
0
      for (forknum = 0; forknum <= MAX_FORKNUM; ++forknum)
1389
0
        if (forknum != FSM_FORKNUM)
1390
0
          BlockRefTableSetLimitBlock(brtab, &parsed.xlocators[i],
1391
0
                         forknum, 0);
1392
0
    }
1393
0
  }
1394
0
  else if (xact_info == XLOG_XACT_ABORT ||
1395
0
       xact_info == XLOG_XACT_ABORT_PREPARED)
1396
0
  {
1397
0
    xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(xlogreader);
1398
0
    xl_xact_parsed_abort parsed;
1399
0
    int     i;
1400
1401
    /*
1402
     * Don't track modified blocks for any relations that were removed on
1403
     * abort.
1404
     */
1405
0
    ParseAbortRecord(XLogRecGetInfo(xlogreader), xlrec, &parsed);
1406
0
    for (i = 0; i < parsed.nrels; ++i)
1407
0
    {
1408
0
      ForkNumber  forknum;
1409
1410
0
      for (forknum = 0; forknum <= MAX_FORKNUM; ++forknum)
1411
0
        if (forknum != FSM_FORKNUM)
1412
0
          BlockRefTableSetLimitBlock(brtab, &parsed.xlocators[i],
1413
0
                         forknum, 0);
1414
0
    }
1415
0
  }
1416
0
}
1417
1418
/*
1419
 * Special handling for WAL records with RM_XLOG_ID.
1420
 *
1421
 * The return value is true if WAL summarization should stop before this
1422
 * record and false otherwise. When the return value is true,
1423
 * *new_fast_forward indicates whether future processing should be done
1424
 * in fast forward mode (i.e. read WAL without emitting summaries) or not.
1425
 */
1426
static bool
1427
SummarizeXlogRecord(XLogReaderState *xlogreader, bool *new_fast_forward)
1428
0
{
1429
0
  uint8   info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK;
1430
0
  int     record_wal_level;
1431
1432
0
  if (info == XLOG_CHECKPOINT_REDO)
1433
0
  {
1434
    /* Payload is wal_level at the time record was written. */
1435
0
    memcpy(&record_wal_level, XLogRecGetData(xlogreader), sizeof(int));
1436
0
  }
1437
0
  else if (info == XLOG_CHECKPOINT_SHUTDOWN)
1438
0
  {
1439
0
    CheckPoint  rec_ckpt;
1440
1441
    /* Extract wal_level at time record was written from payload. */
1442
0
    memcpy(&rec_ckpt, XLogRecGetData(xlogreader), sizeof(CheckPoint));
1443
0
    record_wal_level = rec_ckpt.wal_level;
1444
0
  }
1445
0
  else if (info == XLOG_PARAMETER_CHANGE)
1446
0
  {
1447
0
    xl_parameter_change xlrec;
1448
1449
    /* Extract wal_level at time record was written from payload. */
1450
0
    memcpy(&xlrec, XLogRecGetData(xlogreader),
1451
0
         sizeof(xl_parameter_change));
1452
0
    record_wal_level = xlrec.wal_level;
1453
0
  }
1454
0
  else if (info == XLOG_END_OF_RECOVERY)
1455
0
  {
1456
0
    xl_end_of_recovery xlrec;
1457
1458
    /* Extract wal_level at time record was written from payload. */
1459
0
    memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
1460
0
    record_wal_level = xlrec.wal_level;
1461
0
  }
1462
0
  else
1463
0
  {
1464
    /* No special handling required. Return false. */
1465
0
    return false;
1466
0
  }
1467
1468
  /*
1469
   * Redo can only begin at an XLOG_CHECKPOINT_REDO or
1470
   * XLOG_CHECKPOINT_SHUTDOWN record, so we want WAL summarization to begin
1471
   * at those points. Hence, when those records are encountered, return
1472
   * true, so that we stop just before summarizing either of those records.
1473
   *
1474
   * We also reach here if we just saw XLOG_END_OF_RECOVERY or
1475
   * XLOG_PARAMETER_CHANGE. These are not places where recovery can start,
1476
   * but they're still relevant here. A new timeline can begin with
1477
   * XLOG_END_OF_RECOVERY, so we need to confirm the WAL level at that
1478
   * point; and a restart can provoke XLOG_PARAMETER_CHANGE after an
1479
   * intervening change to postgresql.conf, which might force us to stop
1480
   * summarizing.
1481
   */
1482
0
  *new_fast_forward = (record_wal_level == WAL_LEVEL_MINIMAL);
1483
0
  return true;
1484
0
}
1485
1486
/*
1487
 * Similar to read_local_xlog_page, but limited to read from one particular
1488
 * timeline. If the end of WAL is reached, it will wait for more if reading
1489
 * from the current timeline, or give up if reading from a historic timeline.
1490
 * In the latter case, it will also set private_data->end_of_wal = true.
1491
 *
1492
 * Caller must set private_data->tli to the TLI of interest,
1493
 * private_data->read_upto to the lowest LSN that is not known to be safe
1494
 * to read on that timeline, and private_data->historic to true if and only
1495
 * if the timeline is not the current timeline. This function will update
1496
 * private_data->read_upto and private_data->historic if more WAL appears
1497
 * on the current timeline or if the current timeline becomes historic.
1498
 */
1499
static int
1500
summarizer_read_local_xlog_page(XLogReaderState *state,
1501
                XLogRecPtr targetPagePtr, int reqLen,
1502
                XLogRecPtr targetRecPtr, char *cur_page)
1503
0
{
1504
0
  int     count;
1505
0
  WALReadError errinfo;
1506
0
  SummarizerReadLocalXLogPrivate *private_data;
1507
1508
0
  ProcessWalSummarizerInterrupts();
1509
1510
0
  private_data = (SummarizerReadLocalXLogPrivate *)
1511
0
    state->private_data;
1512
1513
0
  while (1)
1514
0
  {
1515
0
    if (targetPagePtr + XLOG_BLCKSZ <= private_data->read_upto)
1516
0
    {
1517
      /*
1518
       * more than one block available; read only that block, have
1519
       * caller come back if they need more.
1520
       */
1521
0
      count = XLOG_BLCKSZ;
1522
0
      break;
1523
0
    }
1524
0
    else if (targetPagePtr + reqLen > private_data->read_upto)
1525
0
    {
1526
      /* We don't seem to have enough data. */
1527
0
      if (private_data->historic)
1528
0
      {
1529
        /*
1530
         * This is a historic timeline, so there will never be any
1531
         * more data than we have currently.
1532
         */
1533
0
        private_data->end_of_wal = true;
1534
0
        return -1;
1535
0
      }
1536
0
      else
1537
0
      {
1538
0
        XLogRecPtr  latest_lsn;
1539
0
        TimeLineID  latest_tli;
1540
1541
        /*
1542
         * This is - or at least was up until very recently - the
1543
         * current timeline, so more data might show up.  Delay here
1544
         * so we don't tight-loop.
1545
         */
1546
0
        ProcessWalSummarizerInterrupts();
1547
0
        summarizer_wait_for_wal();
1548
1549
        /* Recheck end-of-WAL. */
1550
0
        latest_lsn = GetLatestLSN(&latest_tli);
1551
0
        if (private_data->tli == latest_tli)
1552
0
        {
1553
          /* Still the current timeline, update max LSN. */
1554
0
          Assert(latest_lsn >= private_data->read_upto);
1555
0
          private_data->read_upto = latest_lsn;
1556
0
        }
1557
0
        else
1558
0
        {
1559
0
          List     *tles = readTimeLineHistory(latest_tli);
1560
0
          XLogRecPtr  switchpoint;
1561
1562
          /*
1563
           * The timeline we're scanning is no longer the latest
1564
           * one. Figure out when it ended.
1565
           */
1566
0
          private_data->historic = true;
1567
0
          switchpoint = tliSwitchPoint(private_data->tli, tles,
1568
0
                         NULL);
1569
1570
          /*
1571
           * Allow reads up to exactly the switch point.
1572
           *
1573
           * It's possible that this will cause read_upto to move
1574
           * backwards, because we might have been promoted before
1575
           * reaching the end of the previous timeline. In that
1576
           * case, the next loop iteration will likely conclude that
1577
           * we've reached end of WAL.
1578
           */
1579
0
          private_data->read_upto = switchpoint;
1580
1581
          /* Debugging output. */
1582
0
          ereport(DEBUG1,
1583
0
              errmsg_internal("timeline %u became historic, can read up to %X/%X",
1584
0
                      private_data->tli, LSN_FORMAT_ARGS(private_data->read_upto)));
1585
0
        }
1586
1587
        /* Go around and try again. */
1588
0
      }
1589
0
    }
1590
0
    else
1591
0
    {
1592
      /* enough bytes available to satisfy the request */
1593
0
      count = private_data->read_upto - targetPagePtr;
1594
0
      break;
1595
0
    }
1596
0
  }
1597
1598
0
  if (!WALRead(state, cur_page, targetPagePtr, count,
1599
0
         private_data->tli, &errinfo))
1600
0
    WALReadRaiseError(&errinfo);
1601
1602
  /* Track that we read a page, for sleep time calculation. */
1603
0
  ++pages_read_since_last_sleep;
1604
1605
  /* number of valid bytes in the buffer */
1606
0
  return count;
1607
0
}
1608
1609
/*
1610
 * Sleep for long enough that we believe it's likely that more WAL will
1611
 * be available afterwards.
1612
 */
1613
static void
1614
summarizer_wait_for_wal(void)
1615
0
{
1616
0
  if (pages_read_since_last_sleep == 0)
1617
0
  {
1618
    /*
1619
     * No pages were read since the last sleep, so double the sleep time,
1620
     * but not beyond the maximum allowable value.
1621
     */
1622
0
    sleep_quanta = Min(sleep_quanta * 2, MAX_SLEEP_QUANTA);
1623
0
  }
1624
0
  else if (pages_read_since_last_sleep > 1)
1625
0
  {
1626
    /*
1627
     * Multiple pages were read since the last sleep, so reduce the sleep
1628
     * time.
1629
     *
1630
     * A large burst of activity should be able to quickly reduce the
1631
     * sleep time to the minimum, but we don't want a handful of extra WAL
1632
     * records to provoke a strong reaction. We choose to reduce the sleep
1633
     * time by 1 quantum for each page read beyond the first, which is a
1634
     * fairly arbitrary way of trying to be reactive without overreacting.
1635
     */
1636
0
    if (pages_read_since_last_sleep > sleep_quanta - 1)
1637
0
      sleep_quanta = 1;
1638
0
    else
1639
0
      sleep_quanta -= pages_read_since_last_sleep;
1640
0
  }
1641
1642
  /* Report pending statistics to the cumulative stats system. */
1643
0
  pgstat_report_wal(false);
1644
1645
  /* OK, now sleep. */
1646
0
  (void) WaitLatch(MyLatch,
1647
0
           WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
1648
0
           sleep_quanta * MS_PER_SLEEP_QUANTUM,
1649
0
           WAIT_EVENT_WAL_SUMMARIZER_WAL);
1650
0
  ResetLatch(MyLatch);
1651
1652
  /* Reset count of pages read. */
1653
0
  pages_read_since_last_sleep = 0;
1654
0
}
1655
1656
/*
1657
 * Remove WAL summaries whose mtimes are older than wal_summary_keep_time.
1658
 */
1659
static void
1660
MaybeRemoveOldWalSummaries(void)
1661
0
{
1662
0
  XLogRecPtr  redo_pointer = GetRedoRecPtr();
1663
0
  List     *wslist;
1664
0
  time_t    cutoff_time;
1665
1666
  /* If WAL summary removal is disabled, don't do anything. */
1667
0
  if (wal_summary_keep_time == 0)
1668
0
    return;
1669
1670
  /*
1671
   * If the redo pointer has not advanced, don't do anything.
1672
   *
1673
   * This has the effect that we only try to remove old WAL summary files
1674
   * once per checkpoint cycle.
1675
   */
1676
0
  if (redo_pointer == redo_pointer_at_last_summary_removal)
1677
0
    return;
1678
0
  redo_pointer_at_last_summary_removal = redo_pointer;
1679
1680
  /*
1681
   * Files should only be removed if the last modification time precedes the
1682
   * cutoff time we compute here.
1683
   */
1684
0
  cutoff_time = time(NULL) - wal_summary_keep_time * SECS_PER_MINUTE;
1685
1686
  /* Get all the summaries that currently exist. */
1687
0
  wslist = GetWalSummaries(0, InvalidXLogRecPtr, InvalidXLogRecPtr);
1688
1689
  /* Loop until all summaries have been considered for removal. */
1690
0
  while (wslist != NIL)
1691
0
  {
1692
0
    ListCell   *lc;
1693
0
    XLogSegNo oldest_segno;
1694
0
    XLogRecPtr  oldest_lsn = InvalidXLogRecPtr;
1695
0
    TimeLineID  selected_tli;
1696
1697
0
    ProcessWalSummarizerInterrupts();
1698
1699
    /*
1700
     * Pick a timeline for which some summary files still exist on disk,
1701
     * and find the oldest LSN that still exists on disk for that
1702
     * timeline.
1703
     */
1704
0
    selected_tli = ((WalSummaryFile *) linitial(wslist))->tli;
1705
0
    oldest_segno = XLogGetOldestSegno(selected_tli);
1706
0
    if (oldest_segno != 0)
1707
0
      XLogSegNoOffsetToRecPtr(oldest_segno, 0, wal_segment_size,
1708
0
                  oldest_lsn);
1709
1710
1711
    /* Consider each WAL file on the selected timeline in turn. */
1712
0
    foreach(lc, wslist)
1713
0
    {
1714
0
      WalSummaryFile *ws = lfirst(lc);
1715
1716
0
      ProcessWalSummarizerInterrupts();
1717
1718
      /* If it's not on this timeline, it's not time to consider it. */
1719
0
      if (selected_tli != ws->tli)
1720
0
        continue;
1721
1722
      /*
1723
       * If the WAL doesn't exist any more, we can remove it if the file
1724
       * modification time is old enough.
1725
       */
1726
0
      if (XLogRecPtrIsInvalid(oldest_lsn) || ws->end_lsn <= oldest_lsn)
1727
0
        RemoveWalSummaryIfOlderThan(ws, cutoff_time);
1728
1729
      /*
1730
       * Whether we removed the file or not, we need not consider it
1731
       * again.
1732
       */
1733
0
      wslist = foreach_delete_current(wslist, lc);
1734
0
      pfree(ws);
1735
0
    }
1736
0
  }
1737
0
}