/src/postgres/src/backend/postmaster/walsummarizer.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * walsummarizer.c |
4 | | * |
5 | | * Background process to perform WAL summarization, if it is enabled. |
6 | | * It continuously scans the write-ahead log and periodically emits a |
7 | | * summary file which indicates which blocks in which relation forks |
8 | | * were modified by WAL records in the LSN range covered by the summary |
9 | | * file. See walsummary.c and blkreftable.c for more details on the |
10 | | * naming and contents of WAL summary files. |
11 | | * |
12 | | * If configured to do so, this background process will also remove WAL |
13 | | * summary files when the file timestamp is older than a configurable |
14 | | * threshold (but only if the WAL has been removed first). |
15 | | * |
16 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
17 | | * |
18 | | * IDENTIFICATION |
19 | | * src/backend/postmaster/walsummarizer.c |
20 | | * |
21 | | *------------------------------------------------------------------------- |
22 | | */ |
23 | | #include "postgres.h" |
24 | | |
25 | | #include "access/timeline.h" |
26 | | #include "access/xlog.h" |
27 | | #include "access/xlog_internal.h" |
28 | | #include "access/xlogrecovery.h" |
29 | | #include "access/xlogutils.h" |
30 | | #include "backup/walsummary.h" |
31 | | #include "catalog/storage_xlog.h" |
32 | | #include "commands/dbcommands_xlog.h" |
33 | | #include "common/blkreftable.h" |
34 | | #include "libpq/pqsignal.h" |
35 | | #include "miscadmin.h" |
36 | | #include "pgstat.h" |
37 | | #include "postmaster/auxprocess.h" |
38 | | #include "postmaster/interrupt.h" |
39 | | #include "postmaster/walsummarizer.h" |
40 | | #include "replication/walreceiver.h" |
41 | | #include "storage/aio_subsys.h" |
42 | | #include "storage/fd.h" |
43 | | #include "storage/ipc.h" |
44 | | #include "storage/latch.h" |
45 | | #include "storage/lwlock.h" |
46 | | #include "storage/proc.h" |
47 | | #include "storage/procsignal.h" |
48 | | #include "storage/shmem.h" |
49 | | #include "utils/guc.h" |
50 | | #include "utils/memutils.h" |
51 | | #include "utils/wait_event.h" |
52 | | |
53 | | /* |
54 | | * Data in shared memory related to WAL summarization. The single instance is set up by WalSummarizerShmemInit() and reached through WalSummarizerCtl. |
55 | | */ |
56 | | typedef struct |
57 | | { |
58 | | /* |
59 | | * These fields are protected by WALSummarizerLock. |
60 | | * |
61 | | * Until we've discovered what summary files already exist on disk and |
62 | | * stored that information in shared memory, initialized is false and the |
63 | | * other fields here contain no meaningful information. After that has |
64 | | * been done, initialized is true. |
65 | | * |
66 | | * summarized_tli and summarized_lsn indicate the TLI and LSN at |
67 | | * which the next summary file will start. Normally, these are the LSN and |
68 | | * TLI at which the last file ended; in such case, lsn_is_exact is true. |
69 | | * If, however, the LSN is just an approximation, then lsn_is_exact is |
70 | | * false. This can happen if, for example, there are no existing WAL |
71 | | * summary files at startup. In that case, we have to derive the position |
72 | | * at which to start summarizing from the WAL files that exist on disk, |
73 | | * and so the LSN might point to the start of the next file even though |
74 | | * that might happen to be in the middle of a WAL record. |
75 | | * |
76 | | * summarizer_pgprocno is the proc number of the summarizer process, if |
77 | | * one is running, or else INVALID_PROC_NUMBER. |
78 | | * |
79 | | * pending_lsn is used by the summarizer to advertise the ending LSN of a |
80 | | * record it has recently read. It shouldn't ever be less than |
81 | | * summarized_lsn, but might be greater, because the summarizer buffers |
82 | | * data for a range of LSNs in memory before writing out a new file. |
83 | | */ |
84 | | bool initialized; |
85 | | TimeLineID summarized_tli; |
86 | | XLogRecPtr summarized_lsn; |
87 | | bool lsn_is_exact; |
88 | | ProcNumber summarizer_pgprocno; |
89 | | XLogRecPtr pending_lsn; |
90 | | |
91 | | /* |
92 | | * This field handles its own synchronization. The summarizer broadcasts on it after advancing summarized_lsn, and WaitForWalSummarization() sleeps on it. |
93 | | */ |
94 | | ConditionVariable summary_file_cv; |
95 | | } WalSummarizerData; |
96 | | |
97 | | /* |
98 | | * Private data for our xlogreader's page read callback. NOTE(review): presumably summarizer_read_local_xlog_page(), declared below -- confirm against the xlogreader setup code. |
99 | | */ |
100 | | typedef struct |
101 | | { |
102 | | TimeLineID tli; |
103 | | bool historic; |
104 | | XLogRecPtr read_upto; |
105 | | bool end_of_wal; |
106 | | } SummarizerReadLocalXLogPrivate; |
107 | | |
108 | | /* Pointer to shared memory state; assigned by WalSummarizerShmemInit(). */ |
109 | | static WalSummarizerData *WalSummarizerCtl; |
110 | | |
111 | | /* |
112 | | * When we reach end of WAL and need to read more, we sleep for a number of |
113 | | * milliseconds that is an integer multiple of MS_PER_SLEEP_QUANTUM. This is |
114 | | * the multiplier. It should vary between 1 and MAX_SLEEP_QUANTA, depending |
115 | | * on system activity. See summarizer_wait_for_wal() for how we adjust this. |
116 | | */ |
117 | | static long sleep_quanta = 1; |
118 | | |
119 | | /* |
120 | | * The sleep time will always be a multiple of 200ms and will not exceed |
121 | | * thirty seconds (150 * 200 = 30 * 1000). Note that the timeout here needs |
122 | | * to be substantially less than the maximum amount of time for which an |
123 | | * incremental backup will wait for this process to catch up. Otherwise, an |
124 | | * incremental backup might time out on an idle system just because we sleep |
125 | | * for too long. |
126 | | */ |
127 | | #define MAX_SLEEP_QUANTA 150 |
128 | 0 | #define MS_PER_SLEEP_QUANTUM 200 |
129 | | |
130 | | /* |
131 | | * This is a count of the number of pages of WAL that we've read since the |
132 | | * last time we waited for more WAL to appear. |
133 | | */ |
134 | | static long pages_read_since_last_sleep = 0; |
135 | | |
136 | | /* |
137 | | * Most recent RedoRecPtr value observed by MaybeRemoveOldWalSummaries(); starts out as InvalidXLogRecPtr. |
138 | | */ |
139 | | static XLogRecPtr redo_pointer_at_last_summary_removal = InvalidXLogRecPtr; |
140 | | |
141 | | /* |
142 | | * GUC parameters |
143 | | */ |
144 | | bool summarize_wal = false; /* WAL summarization is off by default */ |
145 | | int wal_summary_keep_time = 10 * HOURS_PER_DAY * MINS_PER_HOUR; /* default is ten days, expressed in minutes */ |
146 | | |
147 | | static void WalSummarizerShutdown(int code, Datum arg); /* on_shmem_exit callback */ |
148 | | static XLogRecPtr GetLatestLSN(TimeLineID *tli); |
149 | | static void ProcessWalSummarizerInterrupts(void); |
150 | | static XLogRecPtr SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, |
151 | | bool exact, XLogRecPtr switch_lsn, |
152 | | XLogRecPtr maximum_lsn); |
153 | | static void SummarizeDbaseRecord(XLogReaderState *xlogreader, |
154 | | BlockRefTable *brtab); |
155 | | static void SummarizeSmgrRecord(XLogReaderState *xlogreader, |
156 | | BlockRefTable *brtab); |
157 | | static void SummarizeXactRecord(XLogReaderState *xlogreader, |
158 | | BlockRefTable *brtab); |
159 | | static bool SummarizeXlogRecord(XLogReaderState *xlogreader, |
160 | | bool *new_fast_forward); |
161 | | static int summarizer_read_local_xlog_page(XLogReaderState *state, |
162 | | XLogRecPtr targetPagePtr, |
163 | | int reqLen, |
164 | | XLogRecPtr targetRecPtr, |
165 | | char *cur_page); |
166 | | static void summarizer_wait_for_wal(void); |
167 | | static void MaybeRemoveOldWalSummaries(void); |
168 | | |
169 | | /* |
170 | | * Amount of shared memory required for this module: exactly one WalSummarizerData struct. |
171 | | */ |
172 | | Size |
173 | | WalSummarizerShmemSize(void) |
174 | 0 | { |
175 | 0 | return sizeof(WalSummarizerData); |
176 | 0 | } |
177 | | |
178 | | /* |
179 | | * Create or attach to shared memory segment for this module. Sets WalSummarizerCtl; the fields are only filled in when ShmemInitStruct() reports that the struct did not already exist. |
180 | | */ |
181 | | void |
182 | | WalSummarizerShmemInit(void) |
183 | 0 | { |
184 | 0 | bool found; |
185 |

186 | 0 | WalSummarizerCtl = (WalSummarizerData *) |
187 | 0 | ShmemInitStruct("Wal Summarizer Ctl", WalSummarizerShmemSize(), |
188 | 0 | &found); |
189 |

190 | 0 | if (!found) |
191 | 0 | { |
192 | | /* |
193 | | * First time through, so initialize. |
194 | | * |
195 | | * We're just filling in dummy values here -- the real initialization |
196 | | * will happen when GetOldestUnsummarizedLSN() is called for the first |
197 | | * time, which is also what sets initialized to true. |
198 | | */ |
199 | 0 | WalSummarizerCtl->initialized = false; |
200 | 0 | WalSummarizerCtl->summarized_tli = 0; |
201 | 0 | WalSummarizerCtl->summarized_lsn = InvalidXLogRecPtr; |
202 | 0 | WalSummarizerCtl->lsn_is_exact = false; |
203 | 0 | WalSummarizerCtl->summarizer_pgprocno = INVALID_PROC_NUMBER; |
204 | 0 | WalSummarizerCtl->pending_lsn = InvalidXLogRecPtr; |
205 | 0 | ConditionVariableInit(&WalSummarizerCtl->summary_file_cv); |
206 | 0 | } |
207 | 0 | } |
208 | | |
209 | | /* |
210 | | * Entry point for walsummarizer process. startup_data is unused and startup_data_len must be zero. The main loop below has no normal exit; the function calls proc_exit(0) up front if GetOldestUnsummarizedLSN() returns an invalid LSN, which means summarization is disabled. |
211 | | */ |
212 | | void |
213 | | WalSummarizerMain(const void *startup_data, size_t startup_data_len) |
214 | 0 | { |
215 | 0 | sigjmp_buf local_sigjmp_buf; |
216 | 0 | MemoryContext context; |
217 | | |
218 | | /* |
219 | | * Within this function, 'current_lsn' and 'current_tli' refer to the |
220 | | * point from which the next WAL summary file should start. 'exact' is |
221 | | * true if 'current_lsn' is known to be the start of a WAL record or WAL |
222 | | * segment, and false if it might be in the middle of a record someplace. |
223 | | * |
224 | | * 'switch_lsn' and 'switch_tli', if set, are the LSN at which we need to |
225 | | * switch to a new timeline and the timeline to which we need to switch. |
226 | | * If not set, we either haven't figured out the answers yet or we're |
227 | | * already on the latest timeline. |
228 | | */ |
229 | 0 | XLogRecPtr current_lsn; |
230 | 0 | TimeLineID current_tli; |
231 | 0 | bool exact; |
232 | 0 | XLogRecPtr switch_lsn = InvalidXLogRecPtr; |
233 | 0 | TimeLineID switch_tli = 0; |
234 |

235 | 0 | Assert(startup_data_len == 0); |
236 |

237 | 0 | MyBackendType = B_WAL_SUMMARIZER; |
238 | 0 | AuxiliaryProcessMainCommon(); |
239 |

240 | 0 | ereport(DEBUG1, |
241 | 0 | (errmsg_internal("WAL summarizer started"))); |
242 | | |
243 | | /* |
244 | | * Properly accept or ignore signals the postmaster might send us |
245 | | * |
246 | | * We have no particular use for SIGINT at the moment, but seems |
247 | | * reasonable to treat like SIGTERM. |
248 | | */ |
249 | 0 | pqsignal(SIGHUP, SignalHandlerForConfigReload); |
250 | 0 | pqsignal(SIGINT, SignalHandlerForShutdownRequest); |
251 | 0 | pqsignal(SIGTERM, SignalHandlerForShutdownRequest); |
252 | | /* SIGQUIT handler was already set up by InitPostmasterChild */ |
253 | 0 | pqsignal(SIGALRM, SIG_IGN); |
254 | 0 | pqsignal(SIGPIPE, SIG_IGN); |
255 | 0 | pqsignal(SIGUSR1, procsignal_sigusr1_handler); |
256 | 0 | pqsignal(SIGUSR2, SIG_IGN); /* not used */ |
257 | | |
258 | | /* Advertise ourselves; WalSummarizerShutdown() clears this again on exit. */ |
259 | 0 | on_shmem_exit(WalSummarizerShutdown, (Datum) 0); |
260 | 0 | LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); |
261 | 0 | WalSummarizerCtl->summarizer_pgprocno = MyProcNumber; |
262 | 0 | LWLockRelease(WALSummarizerLock); |
263 | | |
264 | | /* Create and switch to a memory context that we can reset on error. */ |
265 | 0 | context = AllocSetContextCreate(TopMemoryContext, |
266 | 0 | "Wal Summarizer", |
267 | 0 | ALLOCSET_DEFAULT_SIZES); |
268 | 0 | MemoryContextSwitchTo(context); |
269 | | |
270 | | /* |
271 | | * Reset some signals that are accepted by postmaster but not here |
272 | | */ |
273 | 0 | pqsignal(SIGCHLD, SIG_DFL); |
274 | | |
275 | | /* |
276 | | * If an exception is encountered, processing resumes here. |
277 | | */ |
278 | 0 | if (sigsetjmp(local_sigjmp_buf, 1) != 0) |
279 | 0 | { |
280 | | /* Since not using PG_TRY, must reset error stack by hand */ |
281 | 0 | error_context_stack = NULL; |
282 | | |
283 | | /* Prevent interrupts while cleaning up */ |
284 | 0 | HOLD_INTERRUPTS(); |
285 | | |
286 | | /* Report the error to the server log */ |
287 | 0 | EmitErrorReport(); |
288 | | |
289 | | /* Release resources we might have acquired. */ |
290 | 0 | LWLockReleaseAll(); |
291 | 0 | ConditionVariableCancelSleep(); |
292 | 0 | pgstat_report_wait_end(); |
293 | 0 | pgaio_error_cleanup(); |
294 | 0 | ReleaseAuxProcessResources(false); |
295 | 0 | AtEOXact_Files(false); |
296 | 0 | AtEOXact_HashTables(false); |
297 | | |
298 | | /* |
299 | | * Now return to normal top-level context and clear ErrorContext for |
300 | | * next time. |
301 | | */ |
302 | 0 | MemoryContextSwitchTo(context); |
303 | 0 | FlushErrorState(); |
304 | | |
305 | | /* Flush any leaked data in the top-level context */ |
306 | 0 | MemoryContextReset(context); |
307 | | |
308 | | /* Now we can allow interrupts again */ |
309 | 0 | RESUME_INTERRUPTS(); |
310 | | |
311 | | /* |
312 | | * Sleep for 10 seconds before attempting to resume operations in |
313 | | * order to avoid excessive logging. |
314 | | * |
315 | | * Many of the likely error conditions are things that will repeat |
316 | | * every time. For example, if the WAL can't be read or the summary |
317 | | * can't be written, only administrator action will cure the problem. |
318 | | * So a really fast retry time doesn't seem to be especially |
319 | | * beneficial, and it will clutter the logs. |
320 | | */ |
321 | 0 | (void) WaitLatch(NULL, |
322 | 0 | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, |
323 | 0 | 10000, |
324 | 0 | WAIT_EVENT_WAL_SUMMARIZER_ERROR); |
325 | 0 | } |
326 | | |
327 | | /* We can now handle ereport(ERROR) */ |
328 | 0 | PG_exception_stack = &local_sigjmp_buf; |
329 | | |
330 | | /* |
331 | | * Unblock signals (they were blocked when the postmaster forked us) |
332 | | */ |
333 | 0 | sigprocmask(SIG_SETMASK, &UnBlockSig, NULL); |
334 | | |
335 | | /* |
336 | | * Fetch information about previous progress from shared memory, and ask |
337 | | * GetOldestUnsummarizedLSN to reset pending_lsn to summarized_lsn. We |
338 | | * might be recovering from an error, and if so, pending_lsn might have |
339 | | * advanced past summarized_lsn, but any WAL we read previously has been |
340 | | * lost and will need to be reread. |
341 | | * |
342 | | * If we discover that WAL summarization is not enabled, just exit. |
343 | | */ |
344 | 0 | current_lsn = GetOldestUnsummarizedLSN(&current_tli, &exact); |
345 | 0 | if (XLogRecPtrIsInvalid(current_lsn)) |
346 | 0 | proc_exit(0); |
347 | | |
348 | | /* |
349 | | * Loop forever |
350 | | */ |
351 | 0 | for (;;) |
352 | 0 | { |
353 | 0 | XLogRecPtr latest_lsn; |
354 | 0 | TimeLineID latest_tli; |
355 | 0 | XLogRecPtr end_of_summary_lsn; |
356 | | |
357 | | /* Flush any leaked data in the top-level context */ |
358 | 0 | MemoryContextReset(context); |
359 | | |
360 | | /* Process any signals received recently. */ |
361 | 0 | ProcessWalSummarizerInterrupts(); |
362 | | |
363 | | /* If it's time to remove any old WAL summaries, do that now. */ |
364 | 0 | MaybeRemoveOldWalSummaries(); |
365 | | |
366 | | /* Find the LSN and TLI up to which we can safely summarize. */ |
367 | 0 | latest_lsn = GetLatestLSN(&latest_tli); |
368 | | |
369 | | /* |
370 | | * If we're summarizing a historic timeline and we haven't yet |
371 | | * computed the point at which to switch to the next timeline, do that |
372 | | * now. |
373 | | * |
374 | | * Note that if this is a standby, what was previously the current |
375 | | * timeline could become historic at any time. |
376 | | * |
377 | | * We could try to make this more efficient by caching the results of |
378 | | * readTimeLineHistory when latest_tli has not changed, but since we |
379 | | * only have to do this once per timeline switch, we probably wouldn't |
380 | | * save any significant amount of work in practice. |
381 | | */ |
382 | 0 | if (current_tli != latest_tli && XLogRecPtrIsInvalid(switch_lsn)) |
383 | 0 | { |
384 | 0 | List *tles = readTimeLineHistory(latest_tli); |
385 |

386 | 0 | switch_lsn = tliSwitchPoint(current_tli, tles, &switch_tli); |
387 | 0 | ereport(DEBUG1, |
388 | 0 | errmsg_internal("switch point from TLI %u to TLI %u is at %X/%X", |
389 | 0 | current_tli, switch_tli, LSN_FORMAT_ARGS(switch_lsn))); |
390 | 0 | } |
391 | | |
392 | | /* |
393 | | * If we've reached the switch LSN, we can't summarize anything else |
394 | | * on this timeline. Switch to the next timeline and go around again, |
395 | | * backing up to the exact switch point if we passed it. |
396 | | */ |
397 | 0 | if (!XLogRecPtrIsInvalid(switch_lsn) && current_lsn >= switch_lsn) |
398 | 0 | { |
399 | | /* Restart summarization from switch point. */ |
400 | 0 | current_tli = switch_tli; |
401 | 0 | current_lsn = switch_lsn; |
402 | | |
403 | | /* Next timeline and switch point, if any, not yet known. */ |
404 | 0 | switch_lsn = InvalidXLogRecPtr; |
405 | 0 | switch_tli = 0; |
406 | | |
407 | | /* Update (really, rewind, if needed) state in shared memory. */ |
408 | 0 | LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); |
409 | 0 | WalSummarizerCtl->summarized_lsn = current_lsn; |
410 | 0 | WalSummarizerCtl->summarized_tli = current_tli; |
411 | 0 | WalSummarizerCtl->lsn_is_exact = true; |
412 | 0 | WalSummarizerCtl->pending_lsn = current_lsn; |
413 | 0 | LWLockRelease(WALSummarizerLock); |
414 |

415 | 0 | continue; |
416 | 0 | } |
417 | | |
418 | | /* Summarize WAL. */ |
419 | 0 | end_of_summary_lsn = SummarizeWAL(current_tli, |
420 | 0 | current_lsn, exact, |
421 | 0 | switch_lsn, latest_lsn); |
422 | 0 | Assert(!XLogRecPtrIsInvalid(end_of_summary_lsn)); |
423 | 0 | Assert(end_of_summary_lsn >= current_lsn); |
424 | | |
425 | | /* |
426 | | * Update state for next loop iteration. |
427 | | * |
428 | | * Next summary file should start from exactly where this one ended. |
429 | | */ |
430 | 0 | current_lsn = end_of_summary_lsn; |
431 | 0 | exact = true; |
432 | | |
433 | | /* Update state in shared memory. */ |
434 | 0 | LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); |
435 | 0 | WalSummarizerCtl->summarized_lsn = end_of_summary_lsn; |
436 | 0 | WalSummarizerCtl->summarized_tli = current_tli; |
437 | 0 | WalSummarizerCtl->lsn_is_exact = true; |
438 | 0 | WalSummarizerCtl->pending_lsn = end_of_summary_lsn; |
439 | 0 | LWLockRelease(WALSummarizerLock); |
440 | | |
441 | | /* Wake up anyone waiting for more summary files to be written. */ |
442 | 0 | ConditionVariableBroadcast(&WalSummarizerCtl->summary_file_cv); |
443 | 0 | } |
444 | 0 | } |
445 | | |
446 | | /* |
447 | | * Get information about the state of the WAL summarizer. All four output arguments are written unconditionally, so none may be NULL. *summarizer_pid is set to -1 if shared state is uninitialized or no summarizer is advertised. |
448 | | */ |
449 | | void |
450 | | GetWalSummarizerState(TimeLineID *summarized_tli, XLogRecPtr *summarized_lsn, |
451 | | XLogRecPtr *pending_lsn, int *summarizer_pid) |
452 | 0 | { |
453 | 0 | LWLockAcquire(WALSummarizerLock, LW_SHARED); |
454 | 0 | if (!WalSummarizerCtl->initialized) |
455 | 0 | { |
456 | | /* |
457 | | * If initialized is false, the rest of the structure contents are |
458 | | * undefined, so report fixed placeholder values instead. |
459 | | */ |
460 | 0 | *summarized_tli = 0; |
461 | 0 | *summarized_lsn = InvalidXLogRecPtr; |
462 | 0 | *pending_lsn = InvalidXLogRecPtr; |
463 | 0 | *summarizer_pid = -1; |
464 | 0 | } |
465 | 0 | else |
466 | 0 | { |
467 | 0 | int summarizer_pgprocno = WalSummarizerCtl->summarizer_pgprocno; |
468 |

469 | 0 | *summarized_tli = WalSummarizerCtl->summarized_tli; |
470 | 0 | *summarized_lsn = WalSummarizerCtl->summarized_lsn; |
471 | 0 | if (summarizer_pgprocno == INVALID_PROC_NUMBER) |
472 | 0 | { |
473 | | /* |
474 | | * If the summarizer has exited, the fact that it had processed |
475 | | * beyond summarized_lsn is irrelevant now. |
476 | | */ |
477 | 0 | *pending_lsn = WalSummarizerCtl->summarized_lsn; |
478 | 0 | *summarizer_pid = -1; |
479 | 0 | } |
480 | 0 | else |
481 | 0 | { |
482 | 0 | *pending_lsn = WalSummarizerCtl->pending_lsn; |
483 | | |
484 | | /* |
485 | | * We're not fussed about inexact answers here, since they could |
486 | | * become stale instantly, so we don't bother taking the lock, but |
487 | | * make sure that invalid PID values are normalized to -1. |
488 | | */ |
489 | 0 | *summarizer_pid = GetPGProcByNumber(summarizer_pgprocno)->pid; |
490 | 0 | if (*summarizer_pid <= 0) |
491 | 0 | *summarizer_pid = -1; |
492 | 0 | } |
493 | 0 | } |
494 | 0 | LWLockRelease(WALSummarizerLock); |
495 | 0 | } |
496 | | |
497 | | /* |
498 | | * Get the oldest LSN in this server's timeline history that has not yet been |
499 | | * summarized, and update shared memory state as appropriate. |
500 | | * |
501 | | * If tli != NULL, *tli will be set to the TLI for the LSN that is returned. |
502 | | * |
503 | | * If lsn_is_exact != NULL, *lsn_is_exact will be set to true if the returned LSN is |
504 | | * necessarily the start of a WAL record and false if it's just the beginning |
505 | | * of a WAL segment. If summarize_wal is off, returns InvalidXLogRecPtr and leaves both output arguments untouched. |
506 | | */ |
507 | | XLogRecPtr |
508 | | GetOldestUnsummarizedLSN(TimeLineID *tli, bool *lsn_is_exact) |
509 | 0 | { |
510 | 0 | TimeLineID latest_tli; |
511 | 0 | int n; |
512 | 0 | List *tles; |
513 | 0 | XLogRecPtr unsummarized_lsn = InvalidXLogRecPtr; |
514 | 0 | TimeLineID unsummarized_tli = 0; |
515 | 0 | bool should_make_exact = false; |
516 | 0 | List *existing_summaries; |
517 | 0 | ListCell *lc; |
518 | 0 | bool am_wal_summarizer = AmWalSummarizerProcess(); |
519 | | |
520 | | /* If not summarizing WAL, do nothing. */ |
521 | 0 | if (!summarize_wal) |
522 | 0 | return InvalidXLogRecPtr; |
523 | | |
524 | | /* |
525 | | * If we are not the WAL summarizer process, then we normally just want to |
526 | | * read the values from shared memory. However, as an exception, if shared |
527 | | * memory hasn't been initialized yet, then we need to do that so that we |
528 | | * can read legal values and not remove any WAL too early. |
529 | | */ |
530 | 0 | if (!am_wal_summarizer) |
531 | 0 | { |
532 | 0 | LWLockAcquire(WALSummarizerLock, LW_SHARED); |
533 |

534 | 0 | if (WalSummarizerCtl->initialized) |
535 | 0 | { |
536 | 0 | unsummarized_lsn = WalSummarizerCtl->summarized_lsn; |
537 | 0 | if (tli != NULL) |
538 | 0 | *tli = WalSummarizerCtl->summarized_tli; |
539 | 0 | if (lsn_is_exact != NULL) |
540 | 0 | *lsn_is_exact = WalSummarizerCtl->lsn_is_exact; |
541 | 0 | LWLockRelease(WALSummarizerLock); |
542 | 0 | return unsummarized_lsn; |
543 | 0 | } |
544 | | |
545 | 0 | LWLockRelease(WALSummarizerLock); |
546 | 0 | } |
547 | | |
548 | | /* |
549 | | * Find the oldest timeline on which WAL still exists, and the earliest |
550 | | * segment for which it exists. |
551 | | * |
552 | | * Note that we do this every time the WAL summarizer process restarts or |
553 | | * recovers from an error, in case the contents of pg_wal have changed |
554 | | * under us e.g. if some files were removed, either manually - which |
555 | | * shouldn't really happen, but might - or by postgres itself, if |
556 | | * summarize_wal was turned off and then back on again. |
557 | | */ |
558 | 0 | (void) GetLatestLSN(&latest_tli); |
559 | 0 | tles = readTimeLineHistory(latest_tli); |
560 | 0 | for (n = list_length(tles) - 1; n >= 0; --n) |
561 | 0 | { |
562 | 0 | TimeLineHistoryEntry *tle = list_nth(tles, n); |
563 | 0 | XLogSegNo oldest_segno; |
564 |

565 | 0 | oldest_segno = XLogGetOldestSegno(tle->tli); |
566 | 0 | if (oldest_segno != 0) |
567 | 0 | { |
568 | | /* Compute oldest LSN that still exists on disk. */ |
569 | 0 | XLogSegNoOffsetToRecPtr(oldest_segno, 0, wal_segment_size, |
570 | 0 | unsummarized_lsn); |
571 |

572 | 0 | unsummarized_tli = tle->tli; |
573 | 0 | break; |
574 | 0 | } |
575 | 0 | } |
576 | | |
577 | | /* |
578 | | * Don't try to summarize anything older than the end LSN of the newest |
579 | | * summary file that exists for this timeline. |
580 | | */ |
581 | 0 | existing_summaries = |
582 | 0 | GetWalSummaries(unsummarized_tli, |
583 | 0 | InvalidXLogRecPtr, InvalidXLogRecPtr); |
584 | 0 | foreach(lc, existing_summaries) |
585 | 0 | { |
586 | 0 | WalSummaryFile *ws = lfirst(lc); |
587 |

588 | 0 | if (ws->end_lsn > unsummarized_lsn) |
589 | 0 | { |
590 | 0 | unsummarized_lsn = ws->end_lsn; |
591 | 0 | should_make_exact = true; |
592 | 0 | } |
593 | 0 | } |
594 | | |
595 | | /* It really should not be possible for us to find no WAL. */ |
596 | 0 | if (unsummarized_tli == 0) |
597 | 0 | ereport(ERROR, |
598 | 0 | errcode(ERRCODE_INTERNAL_ERROR), |
599 | 0 | errmsg_internal("no WAL found on timeline %u", latest_tli)); |
600 | | |
601 | | /* |
602 | | * If we're the WAL summarizer, we always want to store the values we just |
603 | | * computed into shared memory, because those are the values we're going |
604 | | * to use to drive our operation, and so they are the authoritative |
605 | | * values. Otherwise, we only store values into shared memory if shared |
606 | | * memory is uninitialized. Our values are not canonical in such a case, |
607 | | * but it's better to have something than nothing, to guide WAL retention. |
608 | | */ |
609 | 0 | LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); |
610 | 0 | if (am_wal_summarizer || !WalSummarizerCtl->initialized) |
611 | 0 | { |
612 | 0 | WalSummarizerCtl->initialized = true; |
613 | 0 | WalSummarizerCtl->summarized_lsn = unsummarized_lsn; |
614 | 0 | WalSummarizerCtl->summarized_tli = unsummarized_tli; |
615 | 0 | WalSummarizerCtl->lsn_is_exact = should_make_exact; |
616 | 0 | WalSummarizerCtl->pending_lsn = unsummarized_lsn; |
617 | 0 | } |
618 | 0 | else |
619 | 0 | unsummarized_lsn = WalSummarizerCtl->summarized_lsn; |
620 | | |
621 | | /* Also return the TLI and exactness flag to the caller, as required. */ |
622 | 0 | if (tli != NULL) |
623 | 0 | *tli = WalSummarizerCtl->summarized_tli; |
624 | 0 | if (lsn_is_exact != NULL) |
625 | 0 | *lsn_is_exact = WalSummarizerCtl->lsn_is_exact; |
626 | 0 | LWLockRelease(WALSummarizerLock); |
627 |

628 | 0 | return unsummarized_lsn; |
629 | 0 | } |
630 | | |
631 | | /* |
632 | | * Wake up the WAL summarizer process by setting the latch of the advertised summarizer_pgprocno. Does nothing if WalSummarizerCtl is NULL or no summarizer is advertised. |
633 | | * |
634 | | * This might not work, because there's no guarantee that the WAL summarizer |
635 | | * process was successfully started, and it also might have started but |
636 | | * subsequently terminated. So, under normal circumstances, this will get the |
637 | | * latch set, but there's no guarantee. |
638 | | */ |
639 | | void |
640 | | WakeupWalSummarizer(void) |
641 | 0 | { |
642 | 0 | ProcNumber pgprocno; |
643 |

644 | 0 | if (WalSummarizerCtl == NULL) |
645 | 0 | return; |
646 | | |
647 | 0 | LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); |
648 | 0 | pgprocno = WalSummarizerCtl->summarizer_pgprocno; |
649 | 0 | LWLockRelease(WALSummarizerLock); |
650 |

651 | 0 | if (pgprocno != INVALID_PROC_NUMBER) |
652 | 0 | SetLatch(&ProcGlobal->allProcs[pgprocno].procLatch); |
653 | 0 | } |
654 | | |
655 | | /* |
656 | | * Wait until WAL summarization reaches the given LSN, but time out with an |
657 | | * error if the summarizer seems to be stuck, i.e. pending_lsn has not advanced for six consecutive 10-second wait cycles. |
658 | | * |
659 | | * Returns immediately if summarize_wal is turned off while we wait. Caller |
660 | | * is expected to handle this case, if necessary. |
661 | | */ |
662 | | void |
663 | | WaitForWalSummarization(XLogRecPtr lsn) |
664 | 0 | { |
665 | 0 | TimestampTz initial_time, |
666 | 0 | cycle_time, |
667 | 0 | current_time; |
668 | 0 | XLogRecPtr prior_pending_lsn = InvalidXLogRecPtr; |
669 | 0 | int deadcycles = 0; |
670 |

671 | 0 | initial_time = cycle_time = GetCurrentTimestamp(); |
672 |

673 | 0 | while (1) |
674 | 0 | { |
675 | 0 | long timeout_in_ms = 10000; |
676 | 0 | XLogRecPtr summarized_lsn; |
677 | 0 | XLogRecPtr pending_lsn; |
678 |

679 | 0 | CHECK_FOR_INTERRUPTS(); |
680 | | |
681 | | /* If WAL summarization is disabled while we're waiting, give up. */ |
682 | 0 | if (!summarize_wal) |
683 | 0 | return; |
684 | | |
685 | | /* |
686 | | * Read the on-disk (summarized_lsn) and in-memory (pending_lsn) progress from shared memory. |
687 | | */ |
688 | 0 | LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); |
689 | 0 | summarized_lsn = WalSummarizerCtl->summarized_lsn; |
690 | 0 | pending_lsn = WalSummarizerCtl->pending_lsn; |
691 | 0 | LWLockRelease(WALSummarizerLock); |
692 | | |
693 | | /* If WAL summarization has progressed sufficiently, stop waiting. */ |
694 | 0 | if (summarized_lsn >= lsn) |
695 | 0 | break; |
696 | | |
697 | | /* Recheck current time. */ |
698 | 0 | current_time = GetCurrentTimestamp(); |
699 | | |
700 | | /* Have we finished the current cycle of waiting? */ |
701 | 0 | if (TimestampDifferenceMilliseconds(cycle_time, |
702 | 0 | current_time) >= timeout_in_ms) |
703 | 0 | { |
704 | 0 | long elapsed_seconds; |
705 | | |
706 | | /* Begin new wait cycle. */ |
707 | 0 | cycle_time = TimestampTzPlusMilliseconds(cycle_time, |
708 | 0 | timeout_in_ms); |
709 | | |
710 | | /* |
711 | | * Keep track of the number of cycles during which there has been |
712 | | * no progression of pending_lsn. If pending_lsn is not advancing, |
713 | | * that means that not only are no new files appearing on disk, |
714 | | * but we're not even incorporating new records into the in-memory |
715 | | * state. |
716 | | */ |
717 | 0 | if (pending_lsn > prior_pending_lsn) |
718 | 0 | { |
719 | 0 | prior_pending_lsn = pending_lsn; |
720 | 0 | deadcycles = 0; |
721 | 0 | } |
722 | 0 | else |
723 | 0 | ++deadcycles; |
724 | | |
725 | | /* |
726 | | * If we've managed to wait for an entire minute without the WAL |
727 | | * summarizer absorbing a single WAL record, error out; probably |
728 | | * something is wrong. |
729 | | * |
730 | | * We could consider also erroring out if the summarizer is taking |
731 | | * too long to catch up, but it's not clear what rate of progress |
732 | | * would be acceptable and what would be too slow. So instead, we |
733 | | * just try to error out in the case where there's no progress at |
734 | | * all. That seems likely to catch a reasonable number of the |
735 | | * things that can go wrong in practice (e.g. the summarizer |
736 | | * process is completely hung, say because somebody hooked up a |
737 | | * debugger to it or something) without giving up too quickly when |
738 | | * the system is just slow. |
739 | | */ |
740 | 0 | if (deadcycles >= 6) |
741 | 0 | ereport(ERROR, |
742 | 0 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
743 | 0 | errmsg("WAL summarization is not progressing"), |
744 | 0 | errdetail("Summarization is needed through %X/%X, but is stuck at %X/%X on disk and %X/%X in memory.", |
745 | 0 | LSN_FORMAT_ARGS(lsn), |
746 | 0 | LSN_FORMAT_ARGS(summarized_lsn), |
747 | 0 | LSN_FORMAT_ARGS(pending_lsn)))); |
748 | | |
749 | | |
750 | | /* |
751 | | * Otherwise, just let the user know what's happening. |
752 | | */ |
753 | 0 | elapsed_seconds = |
754 | 0 | TimestampDifferenceMilliseconds(initial_time, |
755 | 0 | current_time) / 1000; |
756 | 0 | ereport(WARNING, |
757 | 0 | (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), |
758 | 0 | errmsg_plural("still waiting for WAL summarization through %X/%X after %ld second", |
759 | 0 | "still waiting for WAL summarization through %X/%X after %ld seconds", |
760 | 0 | elapsed_seconds, |
761 | 0 | LSN_FORMAT_ARGS(lsn), |
762 | 0 | elapsed_seconds), |
763 | 0 | errdetail("Summarization has reached %X/%X on disk and %X/%X in memory.", |
764 | 0 | LSN_FORMAT_ARGS(summarized_lsn), |
765 | 0 | LSN_FORMAT_ARGS(pending_lsn)))); |
766 | 0 | } |
767 | | |
768 | | /* |
769 | | * Align the wait time to prevent drift. This doesn't really matter, |
770 | | * but we'd like the warnings about how long we've been waiting to say |
771 | | * 10 seconds, 20 seconds, 30 seconds, 40 seconds ... without ever |
772 | | * drifting to something that is not a multiple of ten. |
773 | | */ |
774 | 0 | timeout_in_ms -= |
775 | 0 | TimestampDifferenceMilliseconds(cycle_time, current_time); |
776 | | |
777 | | /* Wait and see. */ |
778 | 0 | ConditionVariableTimedSleep(&WalSummarizerCtl->summary_file_cv, |
779 | 0 | timeout_in_ms, |
780 | 0 | WAIT_EVENT_WAL_SUMMARY_READY); |
781 | 0 | } |
782 | | |
783 | 0 | ConditionVariableCancelSleep(); |
784 | 0 | } |
785 | | |
786 | | /* |
787 | | * On exit, update shared memory to make it clear that we're no longer |
788 | | * running. Registered through on_shmem_exit() by WalSummarizerMain(); neither 'code' nor 'arg' is used. |
789 | | */ |
790 | | static void |
791 | | WalSummarizerShutdown(int code, Datum arg) |
792 | 0 | { |
793 | 0 | LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); |
794 | 0 | WalSummarizerCtl->summarizer_pgprocno = INVALID_PROC_NUMBER; |
795 | 0 | LWLockRelease(WALSummarizerLock); |
796 | 0 | } |
797 | | |
798 | | /* |
799 | | * Get the latest LSN that is eligible to be summarized, and set *tli to the |
800 | | * corresponding timeline. |
801 | | */ |
802 | | static XLogRecPtr |
803 | | GetLatestLSN(TimeLineID *tli) |
804 | 0 | { |
805 | 0 | if (!RecoveryInProgress()) |
806 | 0 | { |
807 | | /* Don't summarize WAL before it's flushed. */ |
808 | 0 | return GetFlushRecPtr(tli); |
809 | 0 | } |
810 | 0 | else |
811 | 0 | { |
812 | 0 | XLogRecPtr flush_lsn; |
813 | 0 | TimeLineID flush_tli; |
814 | 0 | XLogRecPtr replay_lsn; |
815 | 0 | TimeLineID replay_tli; |
816 | 0 | TimeLineID insert_tli; |
817 | | |
818 | | /* |
819 | | * After the insert TLI has been set and before the control file has |
820 | | * been updated to show the DB in production, RecoveryInProgress() |
821 | | * will return true, because it's not yet safe for all backends to |
822 | | * begin writing WAL. However, replay has already ceased, so from our |
823 | | * point of view, recovery is already over. We should summarize up to |
824 | | * where replay stopped and then prepare to resume at the start of the |
825 | | * insert timeline. |
826 | | */ |
827 | 0 | if ((insert_tli = GetWALInsertionTimeLineIfSet()) != 0) |
828 | 0 | { |
829 | 0 | *tli = insert_tli; |
830 | 0 | return GetXLogReplayRecPtr(NULL); |
831 | 0 | } |
832 | | |
833 | | /* |
834 | | * What we really want to know is how much WAL has been flushed to |
835 | | * disk, but the only flush position available is the one provided by |
836 | | * the walreceiver, which may not be running, because this could be |
837 | | * crash recovery or recovery via restore_command. So use either the |
838 | | * WAL receiver's flush position or the replay position, whichever is |
839 | | * further ahead, on the theory that if the WAL has been replayed then |
840 | | * it must also have been flushed to disk. |
841 | | */ |
842 | 0 | flush_lsn = GetWalRcvFlushRecPtr(NULL, &flush_tli); |
843 | 0 | replay_lsn = GetXLogReplayRecPtr(&replay_tli); |
844 | 0 | if (flush_lsn > replay_lsn) |
845 | 0 | { |
846 | 0 | *tli = flush_tli; |
847 | 0 | return flush_lsn; |
848 | 0 | } |
849 | 0 | else |
850 | 0 | { |
851 | 0 | *tli = replay_tli; |
852 | 0 | return replay_lsn; |
853 | 0 | } |
854 | 0 | } |
855 | 0 | } |
856 | | |
857 | | /* |
858 | | * Interrupt handler for main loop of WAL summarizer process. |
859 | | */ |
860 | | static void |
861 | | ProcessWalSummarizerInterrupts(void) |
862 | 0 | { |
863 | 0 | if (ProcSignalBarrierPending) |
864 | 0 | ProcessProcSignalBarrier(); |
865 | |
|
866 | 0 | if (ConfigReloadPending) |
867 | 0 | { |
868 | 0 | ConfigReloadPending = false; |
869 | 0 | ProcessConfigFile(PGC_SIGHUP); |
870 | 0 | } |
871 | |
|
872 | 0 | if (ShutdownRequestPending || !summarize_wal) |
873 | 0 | { |
874 | 0 | ereport(DEBUG1, |
875 | 0 | errmsg_internal("WAL summarizer shutting down")); |
876 | 0 | proc_exit(0); |
877 | 0 | } |
878 | | |
879 | | /* Perform logging of memory contexts of this process */ |
880 | 0 | if (LogMemoryContextPending) |
881 | 0 | ProcessLogMemoryContextInterrupt(); |
882 | 0 | } |
883 | | |
884 | | /* |
885 | | * Summarize a range of WAL records on a single timeline. |
886 | | * |
887 | | * 'tli' is the timeline to be summarized. |
888 | | * |
889 | | * 'start_lsn' is the point at which we should start summarizing. If this |
890 | | * value comes from the end LSN of the previous record as returned by the |
891 | | * xlogreader machinery, 'exact' should be true; otherwise, 'exact' should |
892 | | * be false, and this function will search forward for the start of a valid |
893 | | * WAL record. |
894 | | * |
895 | | * 'switch_lsn' is the point at which we should switch to a later timeline, |
896 | | * if we're summarizing a historic timeline. |
897 | | * |
898 | | * 'maximum_lsn' identifies the point beyond which we can't count on being |
899 | | * able to read any more WAL. It should be the switch point when reading a |
900 | | * historic timeline, or the most-recently-measured end of WAL when reading |
901 | | * the current timeline. |
902 | | * |
903 | | * The return value is the LSN at which the WAL summary actually ends. Most |
904 | | * often, a summary file ends because we notice that a checkpoint has |
905 | | * occurred and reach the redo pointer of that checkpoint, but sometimes |
906 | | * we stop for other reasons, such as a timeline switch. |
907 | | */ |
908 | | static XLogRecPtr |
909 | | SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact, |
910 | | XLogRecPtr switch_lsn, XLogRecPtr maximum_lsn) |
911 | 0 | { |
912 | 0 | SummarizerReadLocalXLogPrivate *private_data; |
913 | 0 | XLogReaderState *xlogreader; |
914 | 0 | XLogRecPtr summary_start_lsn; |
915 | 0 | XLogRecPtr summary_end_lsn = switch_lsn; |
916 | 0 | char temp_path[MAXPGPATH]; |
917 | 0 | char final_path[MAXPGPATH]; |
918 | 0 | WalSummaryIO io; |
919 | 0 | BlockRefTable *brtab = CreateEmptyBlockRefTable(); |
920 | 0 | bool fast_forward = true; |
921 | | |
922 | | /* Initialize private data for xlogreader. */ |
923 | 0 | private_data = (SummarizerReadLocalXLogPrivate *) |
924 | 0 | palloc0(sizeof(SummarizerReadLocalXLogPrivate)); |
925 | 0 | private_data->tli = tli; |
926 | 0 | private_data->historic = !XLogRecPtrIsInvalid(switch_lsn); |
927 | 0 | private_data->read_upto = maximum_lsn; |
928 | | |
929 | | /* Create xlogreader. */ |
930 | 0 | xlogreader = XLogReaderAllocate(wal_segment_size, NULL, |
931 | 0 | XL_ROUTINE(.page_read = &summarizer_read_local_xlog_page, |
932 | 0 | .segment_open = &wal_segment_open, |
933 | 0 | .segment_close = &wal_segment_close), |
934 | 0 | private_data); |
935 | 0 | if (xlogreader == NULL) |
936 | 0 | ereport(ERROR, |
937 | 0 | (errcode(ERRCODE_OUT_OF_MEMORY), |
938 | 0 | errmsg("out of memory"), |
939 | 0 | errdetail("Failed while allocating a WAL reading processor."))); |
940 | | |
941 | | /* |
942 | | * When exact = false, we're starting from an arbitrary point in the WAL |
943 | | * and must search forward for the start of the next record. |
944 | | * |
945 | | * When exact = true, start_lsn should be either the LSN where a record |
946 | | * begins, or the LSN of a page where the page header is immediately |
947 | | * followed by the start of a new record. XLogBeginRead should tolerate |
948 | | * either case. |
949 | | * |
950 | | * We need to allow for both cases because the behavior of xlogreader |
951 | | * varies. When a record spans two or more xlog pages, the ending LSN |
952 | | * reported by xlogreader will be the starting LSN of the following |
953 | | * record, but when an xlog page boundary falls between two records, the |
954 | | * end LSN for the first will be reported as the first byte of the |
955 | | * following page. We can't know until we read that page how large the |
956 | | * header will be, but we'll have to skip over it to find the next record. |
957 | | */ |
958 | 0 | if (exact) |
959 | 0 | { |
960 | | /* |
961 | | * Even if start_lsn is the beginning of a page rather than the |
962 | | * beginning of the first record on that page, we should still use it |
963 | | * as the start LSN for the summary file. That's because we detect |
964 | | * missing summary files by looking for cases where the end LSN of one |
965 | | * file is less than the start LSN of the next file. When only a page |
966 | | * header is skipped, nothing has been missed. |
967 | | */ |
968 | 0 | XLogBeginRead(xlogreader, start_lsn); |
969 | 0 | summary_start_lsn = start_lsn; |
970 | 0 | } |
971 | 0 | else |
972 | 0 | { |
973 | 0 | summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn); |
974 | 0 | if (XLogRecPtrIsInvalid(summary_start_lsn)) |
975 | 0 | { |
976 | | /* |
977 | | * If we hit end-of-WAL while trying to find the next valid |
978 | | * record, we must be on a historic timeline that has no valid |
979 | | * records that begin after start_lsn and before end of WAL. |
980 | | */ |
981 | 0 | if (private_data->end_of_wal) |
982 | 0 | { |
983 | 0 | ereport(DEBUG1, |
984 | 0 | errmsg_internal("could not read WAL from timeline %u at %X/%X: end of WAL at %X/%X", |
985 | 0 | tli, |
986 | 0 | LSN_FORMAT_ARGS(start_lsn), |
987 | 0 | LSN_FORMAT_ARGS(private_data->read_upto))); |
988 | | |
989 | | /* |
990 | | * The timeline ends at or after start_lsn, without containing |
991 | | * any records. Thus, we must make sure the main loop does not |
992 | | * iterate. If start_lsn is the end of the timeline, then we |
993 | | * won't actually emit an empty summary file, but otherwise, |
994 | | * we must, to capture the fact that the LSN range in question |
995 | | * contains no interesting WAL records. |
996 | | */ |
997 | 0 | summary_start_lsn = start_lsn; |
998 | 0 | summary_end_lsn = private_data->read_upto; |
999 | 0 | switch_lsn = xlogreader->EndRecPtr; |
1000 | 0 | } |
1001 | 0 | else |
1002 | 0 | ereport(ERROR, |
1003 | 0 | (errmsg("could not find a valid record after %X/%X", |
1004 | 0 | LSN_FORMAT_ARGS(start_lsn)))); |
1005 | 0 | } |
1006 | | |
1007 | | /* We shouldn't go backward. */ |
1008 | 0 | Assert(summary_start_lsn >= start_lsn); |
1009 | 0 | } |
1010 | | |
1011 | | /* |
1012 | | * Main loop: read xlog records one by one. |
1013 | | */ |
1014 | 0 | while (1) |
1015 | 0 | { |
1016 | 0 | int block_id; |
1017 | 0 | char *errormsg; |
1018 | 0 | XLogRecord *record; |
1019 | 0 | uint8 rmid; |
1020 | |
|
1021 | 0 | ProcessWalSummarizerInterrupts(); |
1022 | | |
1023 | | /* We shouldn't go backward. */ |
1024 | 0 | Assert(summary_start_lsn <= xlogreader->EndRecPtr); |
1025 | | |
1026 | | /* Now read the next record. */ |
1027 | 0 | record = XLogReadRecord(xlogreader, &errormsg); |
1028 | 0 | if (record == NULL) |
1029 | 0 | { |
1030 | 0 | if (private_data->end_of_wal) |
1031 | 0 | { |
1032 | | /* |
1033 | | * This timeline must be historic and must end before we were |
1034 | | * able to read a complete record. |
1035 | | */ |
1036 | 0 | ereport(DEBUG1, |
1037 | 0 | errmsg_internal("could not read WAL from timeline %u at %X/%X: end of WAL at %X/%X", |
1038 | 0 | tli, |
1039 | 0 | LSN_FORMAT_ARGS(xlogreader->EndRecPtr), |
1040 | 0 | LSN_FORMAT_ARGS(private_data->read_upto))); |
1041 | | /* Summary ends at end of WAL. */ |
1042 | 0 | summary_end_lsn = private_data->read_upto; |
1043 | 0 | break; |
1044 | 0 | } |
1045 | 0 | if (errormsg) |
1046 | 0 | ereport(ERROR, |
1047 | 0 | (errcode_for_file_access(), |
1048 | 0 | errmsg("could not read WAL from timeline %u at %X/%X: %s", |
1049 | 0 | tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr), |
1050 | 0 | errormsg))); |
1051 | 0 | else |
1052 | 0 | ereport(ERROR, |
1053 | 0 | (errcode_for_file_access(), |
1054 | 0 | errmsg("could not read WAL from timeline %u at %X/%X", |
1055 | 0 | tli, LSN_FORMAT_ARGS(xlogreader->EndRecPtr)))); |
1056 | 0 | } |
1057 | | |
1058 | | /* We shouldn't go backward. */ |
1059 | 0 | Assert(summary_start_lsn <= xlogreader->EndRecPtr); |
1060 | |
|
1061 | 0 | if (!XLogRecPtrIsInvalid(switch_lsn) && |
1062 | 0 | xlogreader->ReadRecPtr >= switch_lsn) |
1063 | 0 | { |
1064 | | /* |
1065 | | * Whoops! We've read a record that *starts* after the switch LSN, |
1066 | | * contrary to our goal of reading only until we hit the first |
1067 | | * record that ends at or after the switch LSN. Pretend we didn't |
1068 | | * read it after all by bailing out of this loop right here, |
1069 | | * before we do anything with this record. |
1070 | | * |
1071 | | * This can happen because the last record before the switch LSN |
1072 | | * might be continued across multiple pages, and then we might |
1073 | | * come to a page with XLP_FIRST_IS_OVERWRITE_CONTRECORD set. In |
1074 | | * that case, the record that was continued across multiple pages |
1075 | | * is incomplete and will be disregarded, and the read will |
1076 | | * restart from the beginning of the page that is flagged |
1077 | | * XLP_FIRST_IS_OVERWRITE_CONTRECORD. |
1078 | | * |
1079 | | * If this case occurs, we can fairly say that the current summary |
1080 | | * file ends at the switch LSN exactly. The first record on the |
1081 | | * page marked XLP_FIRST_IS_OVERWRITE_CONTRECORD will be |
1082 | | * discovered when generating the next summary file. |
1083 | | */ |
1084 | 0 | summary_end_lsn = switch_lsn; |
1085 | 0 | break; |
1086 | 0 | } |
1087 | | |
1088 | | /* |
1089 | | * Certain types of records require special handling. Redo points and |
1090 | | * shutdown checkpoints trigger creation of new summary files and can |
1091 | | * also cause us to enter or exit "fast forward" mode. Other types of |
1092 | | * records can require special updates to the block reference table. |
1093 | | */ |
1094 | 0 | rmid = XLogRecGetRmid(xlogreader); |
1095 | 0 | if (rmid == RM_XLOG_ID) |
1096 | 0 | { |
1097 | 0 | bool new_fast_forward; |
1098 | | |
1099 | | /* |
1100 | | * If we've already processed some WAL records when we hit a redo |
1101 | | * point or shutdown checkpoint, then we stop summarization before |
1102 | | * including this record in the current file, so that it will be |
1103 | | * the first record in the next file. |
1104 | | * |
1105 | | * When we hit one of those record types as the first record in a |
1106 | | * file, we adjust our notion of whether we're fast-forwarding. |
1107 | | * Any WAL generated with wal_level=minimal must be skipped |
1108 | | * without actually generating any summary file, because an |
1109 | | * incremental backup that crosses such WAL would be unsafe. |
1110 | | */ |
1111 | 0 | if (SummarizeXlogRecord(xlogreader, &new_fast_forward)) |
1112 | 0 | { |
1113 | 0 | if (xlogreader->ReadRecPtr > summary_start_lsn) |
1114 | 0 | { |
1115 | 0 | summary_end_lsn = xlogreader->ReadRecPtr; |
1116 | 0 | break; |
1117 | 0 | } |
1118 | 0 | else |
1119 | 0 | fast_forward = new_fast_forward; |
1120 | 0 | } |
1121 | 0 | } |
1122 | 0 | else if (!fast_forward) |
1123 | 0 | { |
1124 | | /* |
1125 | | * This switch handles record types that require extra updates to |
1126 | | * the contents of the block reference table. |
1127 | | */ |
1128 | 0 | switch (rmid) |
1129 | 0 | { |
1130 | 0 | case RM_DBASE_ID: |
1131 | 0 | SummarizeDbaseRecord(xlogreader, brtab); |
1132 | 0 | break; |
1133 | 0 | case RM_SMGR_ID: |
1134 | 0 | SummarizeSmgrRecord(xlogreader, brtab); |
1135 | 0 | break; |
1136 | 0 | case RM_XACT_ID: |
1137 | 0 | SummarizeXactRecord(xlogreader, brtab); |
1138 | 0 | break; |
1139 | 0 | } |
1140 | 0 | } |
1141 | | |
1142 | | /* |
1143 | | * If we're in fast-forward mode, we don't really need to do anything. |
1144 | | * Otherwise, feed block references from xlog record to block |
1145 | | * reference table. |
1146 | | */ |
1147 | 0 | if (!fast_forward) |
1148 | 0 | { |
1149 | 0 | for (block_id = 0; block_id <= XLogRecMaxBlockId(xlogreader); |
1150 | 0 | block_id++) |
1151 | 0 | { |
1152 | 0 | RelFileLocator rlocator; |
1153 | 0 | ForkNumber forknum; |
1154 | 0 | BlockNumber blocknum; |
1155 | |
|
1156 | 0 | if (!XLogRecGetBlockTagExtended(xlogreader, block_id, &rlocator, |
1157 | 0 | &forknum, &blocknum, NULL)) |
1158 | 0 | continue; |
1159 | | |
1160 | | /* |
1161 | | * As we do elsewhere, ignore the FSM fork, because it's not |
1162 | | * fully WAL-logged. |
1163 | | */ |
1164 | 0 | if (forknum != FSM_FORKNUM) |
1165 | 0 | BlockRefTableMarkBlockModified(brtab, &rlocator, forknum, |
1166 | 0 | blocknum); |
1167 | 0 | } |
1168 | 0 | } |
1169 | | |
1170 | | /* Update our notion of where this summary file ends. */ |
1171 | 0 | summary_end_lsn = xlogreader->EndRecPtr; |
1172 | | |
1173 | | /* Also update shared memory. */ |
1174 | 0 | LWLockAcquire(WALSummarizerLock, LW_EXCLUSIVE); |
1175 | 0 | Assert(summary_end_lsn >= WalSummarizerCtl->summarized_lsn); |
1176 | 0 | WalSummarizerCtl->pending_lsn = summary_end_lsn; |
1177 | 0 | LWLockRelease(WALSummarizerLock); |
1178 | | |
1179 | | /* |
1180 | | * If we have a switch LSN and have reached it, stop before reading |
1181 | | * the next record. |
1182 | | */ |
1183 | 0 | if (!XLogRecPtrIsInvalid(switch_lsn) && |
1184 | 0 | xlogreader->EndRecPtr >= switch_lsn) |
1185 | 0 | break; |
1186 | 0 | } |
1187 | | |
1188 | | /* Destroy xlogreader. */ |
1189 | 0 | pfree(xlogreader->private_data); |
1190 | 0 | XLogReaderFree(xlogreader); |
1191 | | |
1192 | | /* |
1193 | | * If a timeline switch occurs, we may fail to make any progress at all |
1194 | | * before exiting the loop above. If that happens, we don't write a WAL |
1195 | | * summary file at all. We can also skip writing a file if we're in |
1196 | | * fast-forward mode. |
1197 | | */ |
1198 | 0 | if (summary_end_lsn > summary_start_lsn && !fast_forward) |
1199 | 0 | { |
1200 | | /* Generate temporary and final path name. */ |
1201 | 0 | snprintf(temp_path, MAXPGPATH, |
1202 | 0 | XLOGDIR "/summaries/temp.summary"); |
1203 | 0 | snprintf(final_path, MAXPGPATH, |
1204 | 0 | XLOGDIR "/summaries/%08X%08X%08X%08X%08X.summary", |
1205 | 0 | tli, |
1206 | 0 | LSN_FORMAT_ARGS(summary_start_lsn), |
1207 | 0 | LSN_FORMAT_ARGS(summary_end_lsn)); |
1208 | | |
1209 | | /* Open the temporary file for writing. */ |
1210 | 0 | io.filepos = 0; |
1211 | 0 | io.file = PathNameOpenFile(temp_path, O_WRONLY | O_CREAT | O_TRUNC); |
1212 | 0 | if (io.file < 0) |
1213 | 0 | ereport(ERROR, |
1214 | 0 | (errcode_for_file_access(), |
1215 | 0 | errmsg("could not create file \"%s\": %m", temp_path))); |
1216 | | |
1217 | | /* Write the data. */ |
1218 | 0 | WriteBlockRefTable(brtab, WriteWalSummary, &io); |
1219 | | |
1220 | | /* Close temporary file and shut down xlogreader. */ |
1221 | 0 | FileClose(io.file); |
1222 | | |
1223 | | /* Tell the user what we did. */ |
1224 | 0 | ereport(DEBUG1, |
1225 | 0 | errmsg_internal("summarized WAL on TLI %u from %X/%X to %X/%X", |
1226 | 0 | tli, |
1227 | 0 | LSN_FORMAT_ARGS(summary_start_lsn), |
1228 | 0 | LSN_FORMAT_ARGS(summary_end_lsn))); |
1229 | | |
1230 | | /* Durably rename the new summary into place. */ |
1231 | 0 | durable_rename(temp_path, final_path, ERROR); |
1232 | 0 | } |
1233 | | |
1234 | | /* If we skipped a non-zero amount of WAL, log a debug message. */ |
1235 | 0 | if (summary_end_lsn > summary_start_lsn && fast_forward) |
1236 | 0 | ereport(DEBUG1, |
1237 | 0 | errmsg_internal("skipped summarizing WAL on TLI %u from %X/%X to %X/%X", |
1238 | 0 | tli, |
1239 | 0 | LSN_FORMAT_ARGS(summary_start_lsn), |
1240 | 0 | LSN_FORMAT_ARGS(summary_end_lsn))); |
1241 | | |
1242 | 0 | return summary_end_lsn; |
1243 | 0 | } |
1244 | | |
1245 | | /* |
1246 | | * Special handling for WAL records with RM_DBASE_ID. |
1247 | | */ |
1248 | | static void |
1249 | | SummarizeDbaseRecord(XLogReaderState *xlogreader, BlockRefTable *brtab) |
1250 | 0 | { |
1251 | 0 | uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK; |
1252 | | |
1253 | | /* |
1254 | | * We use relfilenode zero for a given database OID and tablespace OID to |
1255 | | * indicate that all relations with that pair of IDs have been recreated |
1256 | | * if they exist at all. Effectively, we're setting a limit block of 0 for |
1257 | | * all such relfilenodes. |
1258 | | * |
1259 | | * Technically, this special handling is only needed in the case of |
1260 | | * XLOG_DBASE_CREATE_FILE_COPY, because that can create a whole bunch of |
1261 | | * relation files in a directory without logging anything specific to each |
1262 | | * one. If we didn't mark the whole DB OID/TS OID combination in some way, |
1263 | | * then a tablespace that was dropped after the reference backup and |
1264 | | * recreated using the FILE_COPY method prior to the incremental backup |
1265 | | * would look just like one that was never touched at all, which would be |
1266 | | * catastrophic. |
1267 | | * |
1268 | | * But it seems best to adopt this treatment for all records that drop or |
1269 | | * create a DB OID/TS OID combination. That's similar to how we treat the |
1270 | | * limit block for individual relations, and it's an extra layer of safety |
1271 | | * here. We can never lose data by marking more stuff as needing to be |
1272 | | * backed up in full. |
1273 | | */ |
1274 | 0 | if (info == XLOG_DBASE_CREATE_FILE_COPY) |
1275 | 0 | { |
1276 | 0 | xl_dbase_create_file_copy_rec *xlrec; |
1277 | 0 | RelFileLocator rlocator; |
1278 | |
|
1279 | 0 | xlrec = |
1280 | 0 | (xl_dbase_create_file_copy_rec *) XLogRecGetData(xlogreader); |
1281 | 0 | rlocator.spcOid = xlrec->tablespace_id; |
1282 | 0 | rlocator.dbOid = xlrec->db_id; |
1283 | 0 | rlocator.relNumber = 0; |
1284 | 0 | BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0); |
1285 | 0 | } |
1286 | 0 | else if (info == XLOG_DBASE_CREATE_WAL_LOG) |
1287 | 0 | { |
1288 | 0 | xl_dbase_create_wal_log_rec *xlrec; |
1289 | 0 | RelFileLocator rlocator; |
1290 | |
|
1291 | 0 | xlrec = (xl_dbase_create_wal_log_rec *) XLogRecGetData(xlogreader); |
1292 | 0 | rlocator.spcOid = xlrec->tablespace_id; |
1293 | 0 | rlocator.dbOid = xlrec->db_id; |
1294 | 0 | rlocator.relNumber = 0; |
1295 | 0 | BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0); |
1296 | 0 | } |
1297 | 0 | else if (info == XLOG_DBASE_DROP) |
1298 | 0 | { |
1299 | 0 | xl_dbase_drop_rec *xlrec; |
1300 | 0 | RelFileLocator rlocator; |
1301 | 0 | int i; |
1302 | |
|
1303 | 0 | xlrec = (xl_dbase_drop_rec *) XLogRecGetData(xlogreader); |
1304 | 0 | rlocator.dbOid = xlrec->db_id; |
1305 | 0 | rlocator.relNumber = 0; |
1306 | 0 | for (i = 0; i < xlrec->ntablespaces; ++i) |
1307 | 0 | { |
1308 | 0 | rlocator.spcOid = xlrec->tablespace_ids[i]; |
1309 | 0 | BlockRefTableSetLimitBlock(brtab, &rlocator, MAIN_FORKNUM, 0); |
1310 | 0 | } |
1311 | 0 | } |
1312 | 0 | } |
1313 | | |
1314 | | /* |
1315 | | * Special handling for WAL records with RM_SMGR_ID. |
1316 | | */ |
1317 | | static void |
1318 | | SummarizeSmgrRecord(XLogReaderState *xlogreader, BlockRefTable *brtab) |
1319 | 0 | { |
1320 | 0 | uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK; |
1321 | |
|
1322 | 0 | if (info == XLOG_SMGR_CREATE) |
1323 | 0 | { |
1324 | 0 | xl_smgr_create *xlrec; |
1325 | | |
1326 | | /* |
1327 | | * If a new relation fork is created on disk, there is no point |
1328 | | * tracking anything about which blocks have been modified, because |
1329 | | * the whole thing will be new. Hence, set the limit block for this |
1330 | | * fork to 0. |
1331 | | * |
1332 | | * Ignore the FSM fork, which is not fully WAL-logged. |
1333 | | */ |
1334 | 0 | xlrec = (xl_smgr_create *) XLogRecGetData(xlogreader); |
1335 | |
|
1336 | 0 | if (xlrec->forkNum != FSM_FORKNUM) |
1337 | 0 | BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator, |
1338 | 0 | xlrec->forkNum, 0); |
1339 | 0 | } |
1340 | 0 | else if (info == XLOG_SMGR_TRUNCATE) |
1341 | 0 | { |
1342 | 0 | xl_smgr_truncate *xlrec; |
1343 | |
|
1344 | 0 | xlrec = (xl_smgr_truncate *) XLogRecGetData(xlogreader); |
1345 | | |
1346 | | /* |
1347 | | * If a relation fork is truncated on disk, there is no point in |
1348 | | * tracking anything about block modifications beyond the truncation |
1349 | | * point. |
1350 | | * |
1351 | | * We ignore SMGR_TRUNCATE_FSM here because the FSM isn't fully |
1352 | | * WAL-logged and thus we can't track modified blocks for it anyway. |
1353 | | */ |
1354 | 0 | if ((xlrec->flags & SMGR_TRUNCATE_HEAP) != 0) |
1355 | 0 | BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator, |
1356 | 0 | MAIN_FORKNUM, xlrec->blkno); |
1357 | 0 | if ((xlrec->flags & SMGR_TRUNCATE_VM) != 0) |
1358 | 0 | BlockRefTableSetLimitBlock(brtab, &xlrec->rlocator, |
1359 | 0 | VISIBILITYMAP_FORKNUM, xlrec->blkno); |
1360 | 0 | } |
1361 | 0 | } |
1362 | | |
1363 | | /* |
1364 | | * Special handling for WAL records with RM_XACT_ID. |
1365 | | */ |
1366 | | static void |
1367 | | SummarizeXactRecord(XLogReaderState *xlogreader, BlockRefTable *brtab) |
1368 | 0 | { |
1369 | 0 | uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK; |
1370 | 0 | uint8 xact_info = info & XLOG_XACT_OPMASK; |
1371 | |
|
1372 | 0 | if (xact_info == XLOG_XACT_COMMIT || |
1373 | 0 | xact_info == XLOG_XACT_COMMIT_PREPARED) |
1374 | 0 | { |
1375 | 0 | xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(xlogreader); |
1376 | 0 | xl_xact_parsed_commit parsed; |
1377 | 0 | int i; |
1378 | | |
1379 | | /* |
1380 | | * Don't track modified blocks for any relations that were removed on |
1381 | | * commit. |
1382 | | */ |
1383 | 0 | ParseCommitRecord(XLogRecGetInfo(xlogreader), xlrec, &parsed); |
1384 | 0 | for (i = 0; i < parsed.nrels; ++i) |
1385 | 0 | { |
1386 | 0 | ForkNumber forknum; |
1387 | |
|
1388 | 0 | for (forknum = 0; forknum <= MAX_FORKNUM; ++forknum) |
1389 | 0 | if (forknum != FSM_FORKNUM) |
1390 | 0 | BlockRefTableSetLimitBlock(brtab, &parsed.xlocators[i], |
1391 | 0 | forknum, 0); |
1392 | 0 | } |
1393 | 0 | } |
1394 | 0 | else if (xact_info == XLOG_XACT_ABORT || |
1395 | 0 | xact_info == XLOG_XACT_ABORT_PREPARED) |
1396 | 0 | { |
1397 | 0 | xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(xlogreader); |
1398 | 0 | xl_xact_parsed_abort parsed; |
1399 | 0 | int i; |
1400 | | |
1401 | | /* |
1402 | | * Don't track modified blocks for any relations that were removed on |
1403 | | * abort. |
1404 | | */ |
1405 | 0 | ParseAbortRecord(XLogRecGetInfo(xlogreader), xlrec, &parsed); |
1406 | 0 | for (i = 0; i < parsed.nrels; ++i) |
1407 | 0 | { |
1408 | 0 | ForkNumber forknum; |
1409 | |
|
1410 | 0 | for (forknum = 0; forknum <= MAX_FORKNUM; ++forknum) |
1411 | 0 | if (forknum != FSM_FORKNUM) |
1412 | 0 | BlockRefTableSetLimitBlock(brtab, &parsed.xlocators[i], |
1413 | 0 | forknum, 0); |
1414 | 0 | } |
1415 | 0 | } |
1416 | 0 | } |
1417 | | |
1418 | | /* |
1419 | | * Special handling for WAL records with RM_XLOG_ID. |
1420 | | * |
1421 | | * The return value is true if WAL summarization should stop before this |
1422 | | * record and false otherwise. When the return value is true, |
1423 | | * *new_fast_forward indicates whether future processing should be done |
1424 | | * in fast forward mode (i.e. read WAL without emitting summaries) or not. |
1425 | | */ |
1426 | | static bool |
1427 | | SummarizeXlogRecord(XLogReaderState *xlogreader, bool *new_fast_forward) |
1428 | 0 | { |
1429 | 0 | uint8 info = XLogRecGetInfo(xlogreader) & ~XLR_INFO_MASK; |
1430 | 0 | int record_wal_level; |
1431 | |
|
1432 | 0 | if (info == XLOG_CHECKPOINT_REDO) |
1433 | 0 | { |
1434 | | /* Payload is wal_level at the time record was written. */ |
1435 | 0 | memcpy(&record_wal_level, XLogRecGetData(xlogreader), sizeof(int)); |
1436 | 0 | } |
1437 | 0 | else if (info == XLOG_CHECKPOINT_SHUTDOWN) |
1438 | 0 | { |
1439 | 0 | CheckPoint rec_ckpt; |
1440 | | |
1441 | | /* Extract wal_level at time record was written from payload. */ |
1442 | 0 | memcpy(&rec_ckpt, XLogRecGetData(xlogreader), sizeof(CheckPoint)); |
1443 | 0 | record_wal_level = rec_ckpt.wal_level; |
1444 | 0 | } |
1445 | 0 | else if (info == XLOG_PARAMETER_CHANGE) |
1446 | 0 | { |
1447 | 0 | xl_parameter_change xlrec; |
1448 | | |
1449 | | /* Extract wal_level at time record was written from payload. */ |
1450 | 0 | memcpy(&xlrec, XLogRecGetData(xlogreader), |
1451 | 0 | sizeof(xl_parameter_change)); |
1452 | 0 | record_wal_level = xlrec.wal_level; |
1453 | 0 | } |
1454 | 0 | else if (info == XLOG_END_OF_RECOVERY) |
1455 | 0 | { |
1456 | 0 | xl_end_of_recovery xlrec; |
1457 | | |
1458 | | /* Extract wal_level at time record was written from payload. */ |
1459 | 0 | memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery)); |
1460 | 0 | record_wal_level = xlrec.wal_level; |
1461 | 0 | } |
1462 | 0 | else |
1463 | 0 | { |
1464 | | /* No special handling required. Return false. */ |
1465 | 0 | return false; |
1466 | 0 | } |
1467 | | |
1468 | | /* |
1469 | | * Redo can only begin at an XLOG_CHECKPOINT_REDO or |
1470 | | * XLOG_CHECKPOINT_SHUTDOWN record, so we want WAL summarization to begin |
1471 | | * at those points. Hence, when those records are encountered, return |
1472 | | * true, so that we stop just before summarizing either of those records. |
1473 | | * |
1474 | | * We also reach here if we just saw XLOG_END_OF_RECOVERY or |
1475 | | * XLOG_PARAMETER_CHANGE. These are not places where recovery can start, |
1476 | | * but they're still relevant here. A new timeline can begin with |
1477 | | * XLOG_END_OF_RECOVERY, so we need to confirm the WAL level at that |
1478 | | * point; and a restart can provoke XLOG_PARAMETER_CHANGE after an |
1479 | | * intervening change to postgresql.conf, which might force us to stop |
1480 | | * summarizing. |
1481 | | */ |
1482 | 0 | *new_fast_forward = (record_wal_level == WAL_LEVEL_MINIMAL); |
1483 | 0 | return true; |
1484 | 0 | } |
1485 | | |
1486 | | /* |
1487 | | * Similar to read_local_xlog_page, but limited to read from one particular |
1488 | | * timeline. If the end of WAL is reached, it will wait for more if reading |
1489 | | * from the current timeline, or give up if reading from a historic timeline. |
1490 | | * In the latter case, it will also set private_data->end_of_wal = true. |
1491 | | * |
1492 | | * Caller must set private_data->tli to the TLI of interest, |
1493 | | * private_data->read_upto to the lowest LSN that is not known to be safe |
1494 | | * to read on that timeline, and private_data->historic to true if and only |
1495 | | * if the timeline is not the current timeline. This function will update |
1496 | | * private_data->read_upto and private_data->historic if more WAL appears |
1497 | | * on the current timeline or if the current timeline becomes historic. |
1498 | | */ |
1499 | | static int |
1500 | | summarizer_read_local_xlog_page(XLogReaderState *state, |
1501 | | XLogRecPtr targetPagePtr, int reqLen, |
1502 | | XLogRecPtr targetRecPtr, char *cur_page) |
1503 | 0 | { |
1504 | 0 | int count; |
1505 | 0 | WALReadError errinfo; |
1506 | 0 | SummarizerReadLocalXLogPrivate *private_data; |
1507 | |
|
1508 | 0 | ProcessWalSummarizerInterrupts(); |
1509 | |
|
1510 | 0 | private_data = (SummarizerReadLocalXLogPrivate *) |
1511 | 0 | state->private_data; |
1512 | |
|
1513 | 0 | while (1) |
1514 | 0 | { |
1515 | 0 | if (targetPagePtr + XLOG_BLCKSZ <= private_data->read_upto) |
1516 | 0 | { |
1517 | | /* |
1518 | | * more than one block available; read only that block, have |
1519 | | * caller come back if they need more. |
1520 | | */ |
1521 | 0 | count = XLOG_BLCKSZ; |
1522 | 0 | break; |
1523 | 0 | } |
1524 | 0 | else if (targetPagePtr + reqLen > private_data->read_upto) |
1525 | 0 | { |
1526 | | /* We don't seem to have enough data. */ |
1527 | 0 | if (private_data->historic) |
1528 | 0 | { |
1529 | | /* |
1530 | | * This is a historic timeline, so there will never be any |
1531 | | * more data than we have currently. |
1532 | | */ |
1533 | 0 | private_data->end_of_wal = true; |
1534 | 0 | return -1; |
1535 | 0 | } |
1536 | 0 | else |
1537 | 0 | { |
1538 | 0 | XLogRecPtr latest_lsn; |
1539 | 0 | TimeLineID latest_tli; |
1540 | | |
1541 | | /* |
1542 | | * This is - or at least was up until very recently - the |
1543 | | * current timeline, so more data might show up. Delay here |
1544 | | * so we don't tight-loop. |
1545 | | */ |
1546 | 0 | ProcessWalSummarizerInterrupts(); |
1547 | 0 | summarizer_wait_for_wal(); |
1548 | | |
1549 | | /* Recheck end-of-WAL. */ |
1550 | 0 | latest_lsn = GetLatestLSN(&latest_tli); |
1551 | 0 | if (private_data->tli == latest_tli) |
1552 | 0 | { |
1553 | | /* Still the current timeline, update max LSN. */ |
1554 | 0 | Assert(latest_lsn >= private_data->read_upto); |
1555 | 0 | private_data->read_upto = latest_lsn; |
1556 | 0 | } |
1557 | 0 | else |
1558 | 0 | { |
1559 | 0 | List *tles = readTimeLineHistory(latest_tli); |
1560 | 0 | XLogRecPtr switchpoint; |
1561 | | |
1562 | | /* |
1563 | | * The timeline we're scanning is no longer the latest |
1564 | | * one. Figure out when it ended. |
1565 | | */ |
1566 | 0 | private_data->historic = true; |
1567 | 0 | switchpoint = tliSwitchPoint(private_data->tli, tles, |
1568 | 0 | NULL); |
1569 | | |
1570 | | /* |
1571 | | * Allow reads up to exactly the switch point. |
1572 | | * |
1573 | | * It's possible that this will cause read_upto to move |
1574 | | * backwards, because we might have been promoted before |
1575 | | * reaching the end of the previous timeline. In that |
1576 | | * case, the next loop iteration will likely conclude that |
1577 | | * we've reached end of WAL. |
1578 | | */ |
1579 | 0 | private_data->read_upto = switchpoint; |
1580 | | |
1581 | | /* Debugging output. */ |
1582 | 0 | ereport(DEBUG1, |
1583 | 0 | errmsg_internal("timeline %u became historic, can read up to %X/%X", |
1584 | 0 | private_data->tli, LSN_FORMAT_ARGS(private_data->read_upto))); |
1585 | 0 | } |
1586 | | |
1587 | | /* Go around and try again. */ |
1588 | 0 | } |
1589 | 0 | } |
1590 | 0 | else |
1591 | 0 | { |
1592 | | /* enough bytes available to satisfy the request */ |
1593 | 0 | count = private_data->read_upto - targetPagePtr; |
1594 | 0 | break; |
1595 | 0 | } |
1596 | 0 | } |
1597 | | |
1598 | 0 | if (!WALRead(state, cur_page, targetPagePtr, count, |
1599 | 0 | private_data->tli, &errinfo)) |
1600 | 0 | WALReadRaiseError(&errinfo); |
1601 | | |
1602 | | /* Track that we read a page, for sleep time calculation. */ |
1603 | 0 | ++pages_read_since_last_sleep; |
1604 | | |
1605 | | /* number of valid bytes in the buffer */ |
1606 | 0 | return count; |
1607 | 0 | } |
1608 | | |
1609 | | /* |
1610 | | * Sleep for long enough that we believe it's likely that more WAL will |
1611 | | * be available afterwards. |
1612 | | */ |
1613 | | static void |
1614 | | summarizer_wait_for_wal(void) |
1615 | 0 | { |
1616 | 0 | if (pages_read_since_last_sleep == 0) |
1617 | 0 | { |
1618 | | /* |
1619 | | * No pages were read since the last sleep, so double the sleep time, |
1620 | | * but not beyond the maximum allowable value. |
1621 | | */ |
1622 | 0 | sleep_quanta = Min(sleep_quanta * 2, MAX_SLEEP_QUANTA); |
1623 | 0 | } |
1624 | 0 | else if (pages_read_since_last_sleep > 1) |
1625 | 0 | { |
1626 | | /* |
1627 | | * Multiple pages were read since the last sleep, so reduce the sleep |
1628 | | * time. |
1629 | | * |
1630 | | * A large burst of activity should be able to quickly reduce the |
1631 | | * sleep time to the minimum, but we don't want a handful of extra WAL |
1632 | | * records to provoke a strong reaction. We choose to reduce the sleep |
1633 | | * time by 1 quantum for each page read beyond the first, which is a |
1634 | | * fairly arbitrary way of trying to be reactive without overreacting. |
1635 | | */ |
1636 | 0 | if (pages_read_since_last_sleep > sleep_quanta - 1) |
1637 | 0 | sleep_quanta = 1; |
1638 | 0 | else |
1639 | 0 | sleep_quanta -= pages_read_since_last_sleep; |
1640 | 0 | } |
1641 | | |
1642 | | /* Report pending statistics to the cumulative stats system. */ |
1643 | 0 | pgstat_report_wal(false); |
1644 | | |
1645 | | /* OK, now sleep. */ |
1646 | 0 | (void) WaitLatch(MyLatch, |
1647 | 0 | WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, |
1648 | 0 | sleep_quanta * MS_PER_SLEEP_QUANTUM, |
1649 | 0 | WAIT_EVENT_WAL_SUMMARIZER_WAL); |
1650 | 0 | ResetLatch(MyLatch); |
1651 | | |
1652 | | /* Reset count of pages read. */ |
1653 | 0 | pages_read_since_last_sleep = 0; |
1654 | 0 | } |
1655 | | |
1656 | | /* |
1657 | | * Remove WAL summaries whose mtimes are older than wal_summary_keep_time. |
1658 | | */ |
1659 | | static void |
1660 | | MaybeRemoveOldWalSummaries(void) |
1661 | 0 | { |
1662 | 0 | XLogRecPtr redo_pointer = GetRedoRecPtr(); |
1663 | 0 | List *wslist; |
1664 | 0 | time_t cutoff_time; |
1665 | | |
1666 | | /* If WAL summary removal is disabled, don't do anything. */ |
1667 | 0 | if (wal_summary_keep_time == 0) |
1668 | 0 | return; |
1669 | | |
1670 | | /* |
1671 | | * If the redo pointer has not advanced, don't do anything. |
1672 | | * |
1673 | | * This has the effect that we only try to remove old WAL summary files |
1674 | | * once per checkpoint cycle. |
1675 | | */ |
1676 | 0 | if (redo_pointer == redo_pointer_at_last_summary_removal) |
1677 | 0 | return; |
1678 | 0 | redo_pointer_at_last_summary_removal = redo_pointer; |
1679 | | |
1680 | | /* |
1681 | | * Files should only be removed if the last modification time precedes the |
1682 | | * cutoff time we compute here. |
1683 | | */ |
1684 | 0 | cutoff_time = time(NULL) - wal_summary_keep_time * SECS_PER_MINUTE; |
1685 | | |
1686 | | /* Get all the summaries that currently exist. */ |
1687 | 0 | wslist = GetWalSummaries(0, InvalidXLogRecPtr, InvalidXLogRecPtr); |
1688 | | |
1689 | | /* Loop until all summaries have been considered for removal. */ |
1690 | 0 | while (wslist != NIL) |
1691 | 0 | { |
1692 | 0 | ListCell *lc; |
1693 | 0 | XLogSegNo oldest_segno; |
1694 | 0 | XLogRecPtr oldest_lsn = InvalidXLogRecPtr; |
1695 | 0 | TimeLineID selected_tli; |
1696 | |
|
1697 | 0 | ProcessWalSummarizerInterrupts(); |
1698 | | |
1699 | | /* |
1700 | | * Pick a timeline for which some summary files still exist on disk, |
1701 | | * and find the oldest LSN that still exists on disk for that |
1702 | | * timeline. |
1703 | | */ |
1704 | 0 | selected_tli = ((WalSummaryFile *) linitial(wslist))->tli; |
1705 | 0 | oldest_segno = XLogGetOldestSegno(selected_tli); |
1706 | 0 | if (oldest_segno != 0) |
1707 | 0 | XLogSegNoOffsetToRecPtr(oldest_segno, 0, wal_segment_size, |
1708 | 0 | oldest_lsn); |
1709 | | |
1710 | | |
1711 | | /* Consider each WAL file on the selected timeline in turn. */ |
1712 | 0 | foreach(lc, wslist) |
1713 | 0 | { |
1714 | 0 | WalSummaryFile *ws = lfirst(lc); |
1715 | |
|
1716 | 0 | ProcessWalSummarizerInterrupts(); |
1717 | | |
1718 | | /* If it's not on this timeline, it's not time to consider it. */ |
1719 | 0 | if (selected_tli != ws->tli) |
1720 | 0 | continue; |
1721 | | |
1722 | | /* |
1723 | | * If the WAL doesn't exist any more, we can remove it if the file |
1724 | | * modification time is old enough. |
1725 | | */ |
1726 | 0 | if (XLogRecPtrIsInvalid(oldest_lsn) || ws->end_lsn <= oldest_lsn) |
1727 | 0 | RemoveWalSummaryIfOlderThan(ws, cutoff_time); |
1728 | | |
1729 | | /* |
1730 | | * Whether we removed the file or not, we need not consider it |
1731 | | * again. |
1732 | | */ |
1733 | 0 | wslist = foreach_delete_current(wslist, lc); |
1734 | 0 | pfree(ws); |
1735 | 0 | } |
1736 | 0 | } |
1737 | 0 | } |