/src/postgres/src/backend/postmaster/checkpointer.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * checkpointer.c |
4 | | * |
5 | | * The checkpointer is new as of Postgres 9.2. It handles all checkpoints. |
6 | | * Checkpoints are automatically dispatched after a certain amount of time has |
7 | | * elapsed since the last one, and it can be signaled to perform requested |
8 | | * checkpoints as well. (The GUC parameter that mandates a checkpoint every |
9 | | * so many WAL segments is implemented by having backends signal when they |
10 | | * fill WAL segments; the checkpointer itself doesn't watch for the |
11 | | * condition.) |
12 | | * |
13 | | * The normal termination sequence is that checkpointer is instructed to |
14 | | * execute the shutdown checkpoint by SIGINT. After that checkpointer waits |
15 | | * to be terminated via SIGUSR2, which instructs the checkpointer to exit(0). |
16 | | * All backends must be stopped before SIGINT or SIGUSR2 is issued! |
17 | | * |
18 | | * Emergency termination is by SIGQUIT; like any backend, the checkpointer |
19 | | * will simply abort and exit on SIGQUIT. |
20 | | * |
21 | | * If the checkpointer exits unexpectedly, the postmaster treats that the same |
22 | | * as a backend crash: shared memory may be corrupted, so remaining backends |
23 | | * should be killed by SIGQUIT and then a recovery cycle started. (Even if |
24 | | * shared memory isn't corrupted, we have lost information about which |
25 | | * files need to be fsync'd for the next checkpoint, and so a system |
26 | | * restart needs to be forced.) |
27 | | * |
28 | | * |
29 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
30 | | * |
31 | | * |
32 | | * IDENTIFICATION |
33 | | * src/backend/postmaster/checkpointer.c |
34 | | * |
35 | | *------------------------------------------------------------------------- |
36 | | */ |
37 | | #include "postgres.h" |
38 | | |
39 | | #include <sys/time.h> |
40 | | #include <time.h> |
41 | | |
42 | | #include "access/xlog.h" |
43 | | #include "access/xlog_internal.h" |
44 | | #include "access/xlogrecovery.h" |
45 | | #include "catalog/pg_authid.h" |
46 | | #include "commands/defrem.h" |
47 | | #include "libpq/pqsignal.h" |
48 | | #include "miscadmin.h" |
49 | | #include "pgstat.h" |
50 | | #include "postmaster/auxprocess.h" |
51 | | #include "postmaster/bgwriter.h" |
52 | | #include "postmaster/interrupt.h" |
53 | | #include "replication/syncrep.h" |
54 | | #include "storage/aio_subsys.h" |
55 | | #include "storage/bufmgr.h" |
56 | | #include "storage/condition_variable.h" |
57 | | #include "storage/fd.h" |
58 | | #include "storage/ipc.h" |
59 | | #include "storage/lwlock.h" |
60 | | #include "storage/pmsignal.h" |
61 | | #include "storage/proc.h" |
62 | | #include "storage/procsignal.h" |
63 | | #include "storage/shmem.h" |
64 | | #include "storage/smgr.h" |
65 | | #include "storage/spin.h" |
66 | | #include "utils/acl.h" |
67 | | #include "utils/guc.h" |
68 | | #include "utils/memutils.h" |
69 | | #include "utils/resowner.h" |
70 | | |
71 | | |
72 | | /*---------- |
73 | | * Shared memory area for communication between checkpointer and backends |
74 | | * |
75 | | * The ckpt counters allow backends to watch for completion of a checkpoint |
76 | | * request they send. Here's how it works: |
77 | | * * At start of a checkpoint, checkpointer reads (and clears) the request |
78 | | * flags and increments ckpt_started, while holding ckpt_lck. |
79 | | * * On completion of a checkpoint, checkpointer sets ckpt_done to |
80 | | * equal ckpt_started. |
81 | | * * On failure of a checkpoint, checkpointer increments ckpt_failed |
82 | | * and sets ckpt_done to equal ckpt_started. |
83 | | * |
84 | | * The algorithm for backends is: |
85 | | * 1. Record current values of ckpt_failed and ckpt_started, and |
86 | | * set request flags, while holding ckpt_lck. |
87 | | * 2. Send signal to request checkpoint. |
88 | | * 3. Sleep until ckpt_started changes. Now you know a checkpoint has |
89 | | * begun since you started this algorithm (although *not* that it was |
90 | | * specifically initiated by your signal), and that it is using your flags. |
91 | | * 4. Record new value of ckpt_started. |
92 | | * 5. Sleep until ckpt_done >= saved value of ckpt_started. (Use modulo |
93 | | * arithmetic here in case counters wrap around.) Now you know a |
94 | | * checkpoint has started and completed, but not whether it was |
95 | | * successful. |
96 | | * 6. If ckpt_failed is different from the originally saved value, |
97 | | * assume request failed; otherwise it was definitely successful. |
98 | | * |
99 | | * ckpt_flags holds the OR of the checkpoint request flags sent by all |
100 | | * requesting backends since the last checkpoint start. The flags are |
101 | | * chosen so that OR'ing is the correct way to combine multiple requests. |
102 | | * |
103 | | * The requests array holds fsync requests sent by backends and not yet |
104 | | * absorbed by the checkpointer. |
105 | | * |
106 | | * Unlike the checkpoint fields, the request-related fields are protected by |
107 | | * CheckpointerCommLock. |
108 | | *---------- |
109 | | */ |
110 | | typedef struct |
111 | | { |
112 | | SyncRequestType type; /* request type */ |
113 | | FileTag ftag; /* file identifier */ |
114 | | } CheckpointerRequest; |
115 | | |
116 | | typedef struct |
117 | | { |
118 | | pid_t checkpointer_pid; /* PID (0 if not started) */ |
119 | | |
120 | | slock_t ckpt_lck; /* protects all the ckpt_* fields */ |
121 | | |
122 | | int ckpt_started; /* advances when checkpoint starts */ |
123 | | int ckpt_done; /* advances when checkpoint done */ |
124 | | int ckpt_failed; /* advances when checkpoint fails */ |
125 | | |
126 | | int ckpt_flags; /* checkpoint flags, as defined in xlog.h */ |
127 | | |
128 | | ConditionVariable start_cv; /* signaled when ckpt_started advances */ |
129 | | ConditionVariable done_cv; /* signaled when ckpt_done advances */ |
130 | | |
131 | | int num_requests; /* current # of requests */ |
132 | | int max_requests; /* allocated array size */ |
133 | | |
134 | | int head; /* Index of the first request in the ring |
135 | | * buffer */ |
136 | | int tail; /* Index of the last request in the ring |
137 | | * buffer */ |
138 | | |
139 | | /* The ring buffer of pending checkpointer requests */ |
140 | | CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER]; |
141 | | } CheckpointerShmemStruct; |
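The struct above, together with the six-step recipe in the preceding comment, defines the whole backend/checkpointer handshake. The following is a condensed, illustrative sketch of the backend side only (the authoritative implementation is RequestCheckpoint() later in this file, which sleeps on the condition variables instead of polling; the function name and the polling loops here are purely for exposition):

```c
/* Sketch of the backend-side protocol; see RequestCheckpoint() for the real code. */
static void
request_and_wait_sketch(int my_flags)
{
	int			old_failed, old_started;
	int			new_started, new_done, new_failed;

	/* Step 1: snapshot counters and OR in our request flags, under ckpt_lck */
	SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
	old_failed = CheckpointerShmem->ckpt_failed;
	old_started = CheckpointerShmem->ckpt_started;
	CheckpointerShmem->ckpt_flags |= my_flags;
	SpinLockRelease(&CheckpointerShmem->ckpt_lck);

	/* Step 2: wake the checkpointer via its latch; omitted in this sketch */

	/* Steps 3-4: wait until a checkpoint starts, then remember its number */
	do
	{
		SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
		new_started = CheckpointerShmem->ckpt_started;
		SpinLockRelease(&CheckpointerShmem->ckpt_lck);
	} while (new_started == old_started);

	/* Step 5: wait for ckpt_done to catch up; the subtraction is wraparound-safe */
	do
	{
		SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
		new_done = CheckpointerShmem->ckpt_done;
		new_failed = CheckpointerShmem->ckpt_failed;
		SpinLockRelease(&CheckpointerShmem->ckpt_lck);
	} while (new_done - new_started < 0);

	/* Step 6: if the failure counter moved, assume our request failed */
	if (new_failed != old_failed)
		elog(ERROR, "checkpoint request failed");
}
```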
142 | | |
143 | | static CheckpointerShmemStruct *CheckpointerShmem; |
144 | | |
145 | | /* interval for calling AbsorbSyncRequests in CheckpointWriteDelay */ |
146 | 0 | #define WRITES_PER_ABSORB 1000 |
147 | | |
148 | | /* Maximum number of checkpointer requests to process in one batch */ |
149 | | #define CKPT_REQ_BATCH_SIZE 10000 |
150 | | |
151 | | /* Max number of requests the checkpointer request queue can hold */ |
152 | | #define MAX_CHECKPOINT_REQUESTS 10000000 |
153 | | |
154 | | /* |
155 | | * GUC parameters |
156 | | */ |
157 | | int CheckPointTimeout = 300; |
158 | | int CheckPointWarning = 30; |
159 | | double CheckPointCompletionTarget = 0.9; |
160 | | |
161 | | /* |
162 | | * Private state |
163 | | */ |
164 | | static bool ckpt_active = false; |
165 | | static volatile sig_atomic_t ShutdownXLOGPending = false; |
166 | | |
167 | | /* these values are valid when ckpt_active is true: */ |
168 | | static pg_time_t ckpt_start_time; |
169 | | static XLogRecPtr ckpt_start_recptr; |
170 | | static double ckpt_cached_elapsed; |
171 | | |
172 | | static pg_time_t last_checkpoint_time; |
173 | | static pg_time_t last_xlog_switch_time; |
174 | | |
175 | | /* Prototypes for private functions */ |
176 | | |
177 | | static void ProcessCheckpointerInterrupts(void); |
178 | | static void CheckArchiveTimeout(void); |
179 | | static bool IsCheckpointOnSchedule(double progress); |
180 | | static bool FastCheckpointRequested(void); |
181 | | static bool CompactCheckpointerRequestQueue(void); |
182 | | static void UpdateSharedMemoryConfig(void); |
183 | | |
184 | | /* Signal handlers */ |
185 | | static void ReqShutdownXLOG(SIGNAL_ARGS); |
186 | | |
187 | | |
188 | | /* |
189 | | * Main entry point for checkpointer process |
190 | | * |
191 | | * This is invoked from AuxiliaryProcessMain, which has already created the |
192 | | * basic execution environment, but not enabled signals yet. |
193 | | */ |
194 | | void |
195 | | CheckpointerMain(const void *startup_data, size_t startup_data_len) |
196 | 0 | { |
197 | 0 | sigjmp_buf local_sigjmp_buf; |
198 | 0 | MemoryContext checkpointer_context; |
199 | |
|
200 | 0 | Assert(startup_data_len == 0); |
201 | |
|
202 | 0 | MyBackendType = B_CHECKPOINTER; |
203 | 0 | AuxiliaryProcessMainCommon(); |
204 | |
|
205 | 0 | CheckpointerShmem->checkpointer_pid = MyProcPid; |
206 | | |
207 | | /* |
208 | | * Properly accept or ignore signals the postmaster might send us |
209 | | * |
210 | | * Note: we deliberately ignore SIGTERM, because during a standard Unix |
211 | | * system shutdown cycle, init will SIGTERM all processes at once. We |
212 | | * want to wait for the backends to exit, whereupon the postmaster will |
213 | | * tell us it's okay to shut down (via SIGUSR2). |
214 | | */ |
215 | 0 | pqsignal(SIGHUP, SignalHandlerForConfigReload); |
216 | 0 | pqsignal(SIGINT, ReqShutdownXLOG); |
217 | 0 | pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */ |
218 | | /* SIGQUIT handler was already set up by InitPostmasterChild */ |
219 | 0 | pqsignal(SIGALRM, SIG_IGN); |
220 | 0 | pqsignal(SIGPIPE, SIG_IGN); |
221 | 0 | pqsignal(SIGUSR1, procsignal_sigusr1_handler); |
222 | 0 | pqsignal(SIGUSR2, SignalHandlerForShutdownRequest); |
223 | | |
224 | | /* |
225 | | * Reset some signals that are accepted by postmaster but not here |
226 | | */ |
227 | 0 | pqsignal(SIGCHLD, SIG_DFL); |
228 | | |
229 | | /* |
230 | | * Initialize so that first time-driven event happens at the correct time. |
231 | | */ |
232 | 0 | last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL); |
233 | | |
234 | | /* |
235 | | * Write out stats after shutdown. This needs to be called by exactly one |
236 | | * process during a normal shutdown, and since checkpointer is shut down |
237 | | * very late... |
238 | | * |
239 | | * While e.g. walsenders are active after the shutdown checkpoint has been |
240 | | * written (and thus could produce more stats), checkpointer stays around |
241 | | * after the shutdown checkpoint has been written. postmaster will only |
242 | | * signal checkpointer to exit after all processes that could emit stats |
243 | | * have been shut down. |
244 | | */ |
245 | 0 | before_shmem_exit(pgstat_before_server_shutdown, 0); |
246 | | |
247 | | /* |
248 | | * Create a memory context that we will do all our work in. We do this so |
249 | | * that we can reset the context during error recovery and thereby avoid |
250 | | * possible memory leaks. Formerly this code just ran in |
251 | | * TopMemoryContext, but resetting that would be a really bad idea. |
252 | | */ |
253 | 0 | checkpointer_context = AllocSetContextCreate(TopMemoryContext, |
254 | 0 | "Checkpointer", |
255 | 0 | ALLOCSET_DEFAULT_SIZES); |
256 | 0 | MemoryContextSwitchTo(checkpointer_context); |
257 | | |
258 | | /* |
259 | | * If an exception is encountered, processing resumes here. |
260 | | * |
261 | | * You might wonder why this isn't coded as an infinite loop around a |
262 | | * PG_TRY construct. The reason is that this is the bottom of the |
263 | | * exception stack, and so with PG_TRY there would be no exception handler |
264 | | * in force at all during the CATCH part. By leaving the outermost setjmp |
265 | | * always active, we have at least some chance of recovering from an error |
266 | | * during error recovery. (If we get into an infinite loop thereby, it |
267 | | * will soon be stopped by overflow of elog.c's internal state stack.) |
268 | | * |
269 | | * Note that we use sigsetjmp(..., 1), so that the prevailing signal mask |
270 | | * (to wit, BlockSig) will be restored when longjmp'ing to here. Thus, |
271 | | * signals other than SIGQUIT will be blocked until we complete error |
272 | | * recovery. It might seem that this policy makes the HOLD_INTERRUPTS() |
273 | | * call redundant, but it is not since InterruptPending might be set |
274 | | * already. |
275 | | */ |
276 | 0 | if (sigsetjmp(local_sigjmp_buf, 1) != 0) |
277 | 0 | { |
278 | | /* Since not using PG_TRY, must reset error stack by hand */ |
279 | 0 | error_context_stack = NULL; |
280 | | |
281 | | /* Prevent interrupts while cleaning up */ |
282 | 0 | HOLD_INTERRUPTS(); |
283 | | |
284 | | /* Report the error to the server log */ |
285 | 0 | EmitErrorReport(); |
286 | | |
287 | | /* |
288 | | * These operations are really just a minimal subset of |
289 | | * AbortTransaction(). We don't have very many resources to worry |
290 | | * about in checkpointer, but we do have LWLocks, buffers, and temp |
291 | | * files. |
292 | | */ |
293 | 0 | LWLockReleaseAll(); |
294 | 0 | ConditionVariableCancelSleep(); |
295 | 0 | pgstat_report_wait_end(); |
296 | 0 | pgaio_error_cleanup(); |
297 | 0 | UnlockBuffers(); |
298 | 0 | ReleaseAuxProcessResources(false); |
299 | 0 | AtEOXact_Buffers(false); |
300 | 0 | AtEOXact_SMgr(); |
301 | 0 | AtEOXact_Files(false); |
302 | 0 | AtEOXact_HashTables(false); |
303 | | |
304 | | /* Warn any waiting backends that the checkpoint failed. */ |
305 | 0 | if (ckpt_active) |
306 | 0 | { |
307 | 0 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
308 | 0 | CheckpointerShmem->ckpt_failed++; |
309 | 0 | CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started; |
310 | 0 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
311 | |
|
312 | 0 | ConditionVariableBroadcast(&CheckpointerShmem->done_cv); |
313 | |
|
314 | 0 | ckpt_active = false; |
315 | 0 | } |
316 | | |
317 | | /* |
318 | | * Now return to normal top-level context and clear ErrorContext for |
319 | | * next time. |
320 | | */ |
321 | 0 | MemoryContextSwitchTo(checkpointer_context); |
322 | 0 | FlushErrorState(); |
323 | | |
324 | | /* Flush any leaked data in the top-level context */ |
325 | 0 | MemoryContextReset(checkpointer_context); |
326 | | |
327 | | /* Now we can allow interrupts again */ |
328 | 0 | RESUME_INTERRUPTS(); |
329 | | |
330 | | /* |
331 | | * Sleep at least 1 second after any error. A write error is likely |
332 | | * to be repeated, and we don't want to be filling the error logs as |
333 | | * fast as we can. |
334 | | */ |
335 | 0 | pg_usleep(1000000L); |
336 | 0 | } |
337 | | |
338 | | /* We can now handle ereport(ERROR) */ |
339 | 0 | PG_exception_stack = &local_sigjmp_buf; |
340 | | |
341 | | /* |
342 | | * Unblock signals (they were blocked when the postmaster forked us) |
343 | | */ |
344 | 0 | sigprocmask(SIG_SETMASK, &UnBlockSig, NULL); |
345 | | |
346 | | /* |
347 | | * Ensure all shared memory values are set correctly for the config. Doing |
348 | | * this here ensures no race conditions from other concurrent updaters. |
349 | | */ |
350 | 0 | UpdateSharedMemoryConfig(); |
351 | | |
352 | | /* |
353 | | * Advertise our proc number that backends can use to wake us up while |
354 | | * we're sleeping. |
355 | | */ |
356 | 0 | ProcGlobal->checkpointerProc = MyProcNumber; |
357 | | |
358 | | /* |
359 | | * Loop until we've been asked to write the shutdown checkpoint or |
360 | | * terminate. |
361 | | */ |
362 | 0 | for (;;) |
363 | 0 | { |
364 | 0 | bool do_checkpoint = false; |
365 | 0 | int flags = 0; |
366 | 0 | pg_time_t now; |
367 | 0 | int elapsed_secs; |
368 | 0 | int cur_timeout; |
369 | 0 | bool chkpt_or_rstpt_requested = false; |
370 | 0 | bool chkpt_or_rstpt_timed = false; |
371 | | |
372 | | /* Clear any already-pending wakeups */ |
373 | 0 | ResetLatch(MyLatch); |
374 | | |
375 | | /* |
376 | | * Process any requests or signals received recently. |
377 | | */ |
378 | 0 | AbsorbSyncRequests(); |
379 | |
|
380 | 0 | ProcessCheckpointerInterrupts(); |
381 | 0 | if (ShutdownXLOGPending || ShutdownRequestPending) |
382 | 0 | break; |
383 | | |
384 | | /* |
385 | | * Detect a pending checkpoint request by checking whether the flags |
386 | | * word in shared memory is nonzero. We shouldn't need to acquire the |
387 | | * ckpt_lck for this. |
388 | | */ |
389 | 0 | if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags) |
390 | 0 | { |
391 | 0 | do_checkpoint = true; |
392 | 0 | chkpt_or_rstpt_requested = true; |
393 | 0 | } |
394 | | |
395 | | /* |
396 | | * Force a checkpoint if too much time has elapsed since the last one. |
397 | | * Note that we count a timed checkpoint in stats only when this |
398 | | * occurs without an external request, but we set the CAUSE_TIME flag |
399 | | * bit even if there is also an external request. |
400 | | */ |
401 | 0 | now = (pg_time_t) time(NULL); |
402 | 0 | elapsed_secs = now - last_checkpoint_time; |
403 | 0 | if (elapsed_secs >= CheckPointTimeout) |
404 | 0 | { |
405 | 0 | if (!do_checkpoint) |
406 | 0 | chkpt_or_rstpt_timed = true; |
407 | 0 | do_checkpoint = true; |
408 | 0 | flags |= CHECKPOINT_CAUSE_TIME; |
409 | 0 | } |
410 | | |
411 | | /* |
412 | | * Do a checkpoint if requested. |
413 | | */ |
414 | 0 | if (do_checkpoint) |
415 | 0 | { |
416 | 0 | bool ckpt_performed = false; |
417 | 0 | bool do_restartpoint; |
418 | | |
419 | | /* Check if we should perform a checkpoint or a restartpoint. */ |
420 | 0 | do_restartpoint = RecoveryInProgress(); |
421 | | |
422 | | /* |
423 | | * Atomically fetch the request flags to figure out what kind of a |
424 | | * checkpoint we should perform, and increase the started-counter |
425 | | * to acknowledge that we've started a new checkpoint. |
426 | | */ |
427 | 0 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
428 | 0 | flags |= CheckpointerShmem->ckpt_flags; |
429 | 0 | CheckpointerShmem->ckpt_flags = 0; |
430 | 0 | CheckpointerShmem->ckpt_started++; |
431 | 0 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
432 | |
|
433 | 0 | ConditionVariableBroadcast(&CheckpointerShmem->start_cv); |
434 | | |
435 | | /* |
436 | | * The end-of-recovery checkpoint is a real checkpoint that's |
437 | | * performed while we're still in recovery. |
438 | | */ |
439 | 0 | if (flags & CHECKPOINT_END_OF_RECOVERY) |
440 | 0 | do_restartpoint = false; |
441 | |
|
442 | 0 | if (chkpt_or_rstpt_timed) |
443 | 0 | { |
444 | 0 | chkpt_or_rstpt_timed = false; |
445 | 0 | if (do_restartpoint) |
446 | 0 | PendingCheckpointerStats.restartpoints_timed++; |
447 | 0 | else |
448 | 0 | PendingCheckpointerStats.num_timed++; |
449 | 0 | } |
450 | |
|
451 | 0 | if (chkpt_or_rstpt_requested) |
452 | 0 | { |
453 | 0 | chkpt_or_rstpt_requested = false; |
454 | 0 | if (do_restartpoint) |
455 | 0 | PendingCheckpointerStats.restartpoints_requested++; |
456 | 0 | else |
457 | 0 | PendingCheckpointerStats.num_requested++; |
458 | 0 | } |
459 | | |
460 | | /* |
461 | | * We will warn if (a) too soon since last checkpoint (whatever |
462 | | * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag |
463 | | * since the last checkpoint start. Note in particular that this |
464 | | * implementation will not generate warnings caused by |
465 | | * CheckPointTimeout < CheckPointWarning. |
466 | | */ |
467 | 0 | if (!do_restartpoint && |
468 | 0 | (flags & CHECKPOINT_CAUSE_XLOG) && |
469 | 0 | elapsed_secs < CheckPointWarning) |
470 | 0 | ereport(LOG, |
471 | 0 | (errmsg_plural("checkpoints are occurring too frequently (%d second apart)", |
472 | 0 | "checkpoints are occurring too frequently (%d seconds apart)", |
473 | 0 | elapsed_secs, |
474 | 0 | elapsed_secs), |
475 | 0 | errhint("Consider increasing the configuration parameter \"%s\".", "max_wal_size"))); |
476 | | |
477 | | /* |
478 | | * Initialize checkpointer-private variables used during |
479 | | * checkpoint. |
480 | | */ |
481 | 0 | ckpt_active = true; |
482 | 0 | if (do_restartpoint) |
483 | 0 | ckpt_start_recptr = GetXLogReplayRecPtr(NULL); |
484 | 0 | else |
485 | 0 | ckpt_start_recptr = GetInsertRecPtr(); |
486 | 0 | ckpt_start_time = now; |
487 | 0 | ckpt_cached_elapsed = 0; |
488 | | |
489 | | /* |
490 | | * Do the checkpoint. |
491 | | */ |
492 | 0 | if (!do_restartpoint) |
493 | 0 | ckpt_performed = CreateCheckPoint(flags); |
494 | 0 | else |
495 | 0 | ckpt_performed = CreateRestartPoint(flags); |
496 | | |
497 | | /* |
498 | | * After any checkpoint, free all smgr objects. Otherwise we |
499 | | * would never do so for dropped relations, as the checkpointer |
500 | | * does not process shared invalidation messages or call |
501 | | * AtEOXact_SMgr(). |
502 | | */ |
503 | 0 | smgrdestroyall(); |
504 | | |
505 | | /* |
506 | | * Indicate checkpoint completion to any waiting backends. |
507 | | */ |
508 | 0 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
509 | 0 | CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started; |
510 | 0 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
511 | |
|
512 | 0 | ConditionVariableBroadcast(&CheckpointerShmem->done_cv); |
513 | |
|
514 | 0 | if (!do_restartpoint) |
515 | 0 | { |
516 | | /* |
517 | | * Note we record the checkpoint start time not end time as |
518 | | * last_checkpoint_time. This is so that time-driven |
519 | | * checkpoints happen at a predictable spacing. |
520 | | */ |
521 | 0 | last_checkpoint_time = now; |
522 | |
|
523 | 0 | if (ckpt_performed) |
524 | 0 | PendingCheckpointerStats.num_performed++; |
525 | 0 | } |
526 | 0 | else |
527 | 0 | { |
528 | 0 | if (ckpt_performed) |
529 | 0 | { |
530 | | /* |
531 | | * The same as for checkpoint. Please see the |
532 | | * corresponding comment. |
533 | | */ |
534 | 0 | last_checkpoint_time = now; |
535 | |
|
536 | 0 | PendingCheckpointerStats.restartpoints_performed++; |
537 | 0 | } |
538 | 0 | else |
539 | 0 | { |
540 | | /* |
541 | | * We were not able to perform the restartpoint |
542 | | * (checkpoints throw an ERROR in case of error). Most |
543 | | * likely because we have not received any new checkpoint |
544 | | * WAL records since the last restartpoint. Try again in |
545 | | * 15 s. |
546 | | */ |
547 | 0 | last_checkpoint_time = now - CheckPointTimeout + 15; |
548 | 0 | } |
549 | 0 | } |
550 | |
|
551 | 0 | ckpt_active = false; |
552 | | |
553 | | /* |
554 | | * We may have received an interrupt during the checkpoint and the |
555 | | * latch might have been reset (e.g. in CheckpointWriteDelay). |
556 | | */ |
557 | 0 | ProcessCheckpointerInterrupts(); |
558 | 0 | if (ShutdownXLOGPending || ShutdownRequestPending) |
559 | 0 | break; |
560 | 0 | } |
561 | | |
562 | | /* Check for archive_timeout and switch xlog files if necessary. */ |
563 | 0 | CheckArchiveTimeout(); |
564 | | |
565 | | /* Report pending statistics to the cumulative stats system */ |
566 | 0 | pgstat_report_checkpointer(); |
567 | 0 | pgstat_report_wal(true); |
568 | | |
569 | | /* |
570 | | * If any checkpoint flags have been set, redo the loop to handle the |
571 | | * checkpoint without sleeping. |
572 | | */ |
573 | 0 | if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags) |
574 | 0 | continue; |
575 | | |
576 | | /* |
577 | | * Sleep until we are signaled or it's time for another checkpoint or |
578 | | * xlog file switch. |
579 | | */ |
580 | 0 | now = (pg_time_t) time(NULL); |
581 | 0 | elapsed_secs = now - last_checkpoint_time; |
582 | 0 | if (elapsed_secs >= CheckPointTimeout) |
583 | 0 | continue; /* no sleep for us ... */ |
584 | 0 | cur_timeout = CheckPointTimeout - elapsed_secs; |
585 | 0 | if (XLogArchiveTimeout > 0 && !RecoveryInProgress()) |
586 | 0 | { |
587 | 0 | elapsed_secs = now - last_xlog_switch_time; |
588 | 0 | if (elapsed_secs >= XLogArchiveTimeout) |
589 | 0 | continue; /* no sleep for us ... */ |
590 | 0 | cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs); |
591 | 0 | } |
592 | | |
593 | 0 | (void) WaitLatch(MyLatch, |
594 | 0 | WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, |
595 | 0 | cur_timeout * 1000L /* convert to ms */ , |
596 | 0 | WAIT_EVENT_CHECKPOINTER_MAIN); |
597 | 0 | } |
598 | | |
599 | | /* |
600 | | * From here on, elog(ERROR) should end with exit(1), not send control |
601 | | * back to the sigsetjmp block above. |
602 | | */ |
603 | 0 | ExitOnAnyError = true; |
604 | |
|
605 | 0 | if (ShutdownXLOGPending) |
606 | 0 | { |
607 | | /* |
608 | | * Close down the database. |
609 | | * |
610 | | * Since ShutdownXLOG() creates a restartpoint or checkpoint, and |
611 | | * updates the statistics, increment the checkpoint request counter |
612 | | * and flush out pending statistics. |
613 | | */ |
614 | 0 | PendingCheckpointerStats.num_requested++; |
615 | 0 | ShutdownXLOG(0, 0); |
616 | 0 | pgstat_report_checkpointer(); |
617 | 0 | pgstat_report_wal(true); |
618 | | |
619 | | /* |
620 | | * Tell postmaster that we're done. |
621 | | */ |
622 | 0 | SendPostmasterSignal(PMSIGNAL_XLOG_IS_SHUTDOWN); |
623 | 0 | ShutdownXLOGPending = false; |
624 | 0 | } |
625 | | |
626 | | /* |
627 | | * Wait until we're asked to shut down. By separating the writing of the |
628 | | * shutdown checkpoint from checkpointer exiting, checkpointer can perform |
629 | | * some should-be-as-late-as-possible work like writing out stats. |
630 | | */ |
631 | 0 | for (;;) |
632 | 0 | { |
633 | | /* Clear any already-pending wakeups */ |
634 | 0 | ResetLatch(MyLatch); |
635 | |
|
636 | 0 | ProcessCheckpointerInterrupts(); |
637 | |
|
638 | 0 | if (ShutdownRequestPending) |
639 | 0 | break; |
640 | | |
641 | 0 | (void) WaitLatch(MyLatch, |
642 | 0 | WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, |
643 | 0 | 0, |
644 | 0 | WAIT_EVENT_CHECKPOINTER_SHUTDOWN); |
645 | 0 | } |
646 | | |
647 | | /* Normal exit from the checkpointer is here */ |
648 | 0 | proc_exit(0); /* done */ |
649 | 0 | } |
650 | | |
651 | | /* |
652 | | * Process any new interrupts. |
653 | | */ |
654 | | static void |
655 | | ProcessCheckpointerInterrupts(void) |
656 | 0 | { |
657 | 0 | if (ProcSignalBarrierPending) |
658 | 0 | ProcessProcSignalBarrier(); |
659 | |
|
660 | 0 | if (ConfigReloadPending) |
661 | 0 | { |
662 | 0 | ConfigReloadPending = false; |
663 | 0 | ProcessConfigFile(PGC_SIGHUP); |
664 | | |
665 | | /* |
666 | | * Checkpointer is the last process to shut down, so we ask it to hold |
667 | | * the keys for a range of other required tasks, most of which have |
668 | | * nothing to do with checkpointing at all. |
669 | | * |
670 | | * For various reasons, some config values can change dynamically so |
671 | | * the primary copy of them is held in shared memory to make sure all |
672 | | * backends see the same value. We make Checkpointer responsible for |
673 | | * updating the shared memory copy if the parameter setting changes |
674 | | * because of SIGHUP. |
675 | | */ |
676 | 0 | UpdateSharedMemoryConfig(); |
677 | 0 | } |
678 | | |
679 | | /* Perform logging of memory contexts of this process */ |
680 | 0 | if (LogMemoryContextPending) |
681 | 0 | ProcessLogMemoryContextInterrupt(); |
682 | 0 | } |
683 | | |
684 | | /* |
685 | | * CheckArchiveTimeout -- check for archive_timeout and switch xlog files |
686 | | * |
687 | | * This will switch to a new WAL file and force an archive file write if |
688 | | * meaningful activity is recorded in the current WAL file. This includes most |
689 | | * writes, including just a single checkpoint record, but excludes WAL records |
690 | | * that were inserted with the XLOG_MARK_UNIMPORTANT flag set (like |
691 | | * snapshots of running transactions). Such records, depending on |
692 | | * configuration, occur at regular intervals and don't contain important |
693 | | * information. This avoids generating archives with a few unimportant |
694 | | * records. |
695 | | */ |
696 | | static void |
697 | | CheckArchiveTimeout(void) |
698 | | { |
699 | | pg_time_t now; |
700 | | pg_time_t last_time; |
701 | | XLogRecPtr last_switch_lsn; |
702 | | |
703 | | if (XLogArchiveTimeout <= 0 || RecoveryInProgress()) |
704 | | return; |
705 | | |
706 | | now = (pg_time_t) time(NULL); |
707 | | |
708 | | /* First we do a quick check using possibly-stale local state. */ |
709 | | if ((int) (now - last_xlog_switch_time) < XLogArchiveTimeout) |
710 | | return; |
711 | | |
712 | | /* |
713 | | * Update local state ... note that last_xlog_switch_time is the last time |
714 | | * a switch was performed *or requested*. |
715 | | */ |
716 | | last_time = GetLastSegSwitchData(&last_switch_lsn); |
717 | | |
718 | | last_xlog_switch_time = Max(last_xlog_switch_time, last_time); |
719 | | |
720 | | /* Now we can do the real checks */ |
721 | | if ((int) (now - last_xlog_switch_time) >= XLogArchiveTimeout) |
722 | | { |
723 | | /* |
724 | | * Switch segment only when "important" WAL has been logged since the |
725 | | * last segment switch (last_switch_lsn points to the end of the segment |
726 | | * the switch occurred in). |
727 | | */ |
728 | | if (GetLastImportantRecPtr() > last_switch_lsn) |
729 | | { |
730 | | XLogRecPtr switchpoint; |
731 | | |
732 | | /* mark switch as unimportant, avoids triggering checkpoints */ |
733 | | switchpoint = RequestXLogSwitch(true); |
734 | | |
735 | | /* |
736 | | * If the returned pointer points exactly to a segment boundary, |
737 | | * assume nothing happened. |
738 | | */ |
739 | | if (XLogSegmentOffset(switchpoint, wal_segment_size) != 0) |
740 | | elog(DEBUG1, "write-ahead log switch forced (\"archive_timeout\"=%d)", |
741 | | XLogArchiveTimeout); |
742 | | } |
743 | | |
744 | | /* |
745 | | * Update state in any case, so we don't retry constantly when the |
746 | | * system is idle. |
747 | | */ |
748 | | last_xlog_switch_time = now; |
749 | | } |
750 | | } |
751 | | |
752 | | /* |
753 | | * Returns true if a fast checkpoint request is pending. (Note that this does |
754 | | * not check the *current* checkpoint's FAST flag, but whether there is one |
755 | | * pending behind it.) |
756 | | */ |
757 | | static bool |
758 | | FastCheckpointRequested(void) |
759 | 0 | { |
760 | 0 | volatile CheckpointerShmemStruct *cps = CheckpointerShmem; |
761 | | |
762 | | /* |
763 | | * We don't need to acquire the ckpt_lck in this case because we're only |
764 | | * looking at a single flag bit. |
765 | | */ |
766 | 0 | if (cps->ckpt_flags & CHECKPOINT_FAST) |
767 | 0 | return true; |
768 | 0 | return false; |
769 | 0 | } |
770 | | |
771 | | /* |
772 | | * CheckpointWriteDelay -- control rate of checkpoint |
773 | | * |
774 | | * This function is called after each page write performed by BufferSync(). |
775 | | * It is responsible for throttling BufferSync()'s write rate to hit |
776 | | * checkpoint_completion_target. |
777 | | * |
778 | | * The checkpoint request flags should be passed in; currently the only one |
779 | | * examined is CHECKPOINT_FAST, which disables delays between writes. |
780 | | * |
781 | | * 'progress' is an estimate of how much of the work has been done, as a |
782 | | * fraction between 0.0 meaning none, and 1.0 meaning all done. |
783 | | */ |
784 | | void |
785 | | CheckpointWriteDelay(int flags, double progress) |
786 | 0 | { |
787 | 0 | static int absorb_counter = WRITES_PER_ABSORB; |
788 | | |
789 | | /* Do nothing if checkpoint is being executed by non-checkpointer process */ |
790 | 0 | if (!AmCheckpointerProcess()) |
791 | 0 | return; |
792 | | |
793 | | /* |
794 | | * Perform the usual duties and take a nap, unless we're behind schedule, |
795 | | * in which case we just try to catch up as quickly as possible. |
796 | | */ |
797 | 0 | if (!(flags & CHECKPOINT_FAST) && |
798 | 0 | !ShutdownXLOGPending && |
799 | 0 | !ShutdownRequestPending && |
800 | 0 | !FastCheckpointRequested() && |
801 | 0 | IsCheckpointOnSchedule(progress)) |
802 | 0 | { |
803 | 0 | if (ConfigReloadPending) |
804 | 0 | { |
805 | 0 | ConfigReloadPending = false; |
806 | 0 | ProcessConfigFile(PGC_SIGHUP); |
807 | | /* update shmem copies of config variables */ |
808 | 0 | UpdateSharedMemoryConfig(); |
809 | 0 | } |
810 | |
|
811 | 0 | AbsorbSyncRequests(); |
812 | 0 | absorb_counter = WRITES_PER_ABSORB; |
813 | |
|
814 | 0 | CheckArchiveTimeout(); |
815 | | |
816 | | /* Report interim statistics to the cumulative stats system */ |
817 | 0 | pgstat_report_checkpointer(); |
818 | | |
819 | | /* |
820 | | * This sleep used to be connected to bgwriter_delay, typically 200ms. |
821 | | * That resulted in more frequent wakeups even when there was little work to do. |
822 | | * Checkpointer and bgwriter are no longer related so take the Big |
823 | | * Sleep. |
824 | | */ |
825 | 0 | WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, |
826 | 0 | 100, |
827 | 0 | WAIT_EVENT_CHECKPOINT_WRITE_DELAY); |
828 | 0 | ResetLatch(MyLatch); |
829 | 0 | } |
830 | 0 | else if (--absorb_counter <= 0) |
831 | 0 | { |
832 | | /* |
833 | | * Absorb pending fsync requests after each WRITES_PER_ABSORB write |
834 | | * operations even when we don't sleep, to prevent overflow of the |
835 | | * fsync request queue. |
836 | | */ |
837 | 0 | AbsorbSyncRequests(); |
838 | 0 | absorb_counter = WRITES_PER_ABSORB; |
839 | 0 | } |
840 | | |
841 | | /* Check for barrier events. */ |
842 | 0 | if (ProcSignalBarrierPending) |
843 | 0 | ProcessProcSignalBarrier(); |
844 | 0 | } |
845 | | |
846 | | /* |
847 | | * IsCheckpointOnSchedule -- are we on schedule to finish this checkpoint |
848 | | * (or restartpoint) in time? |
849 | | * |
850 | | * Compares the current progress against the time/segments elapsed since last |
851 | | * checkpoint, and returns true if the progress we've made this far is greater |
852 | | * than the elapsed time/segments. |
853 | | */ |
854 | | static bool |
855 | | IsCheckpointOnSchedule(double progress) |
856 | 0 | { |
857 | 0 | XLogRecPtr recptr; |
858 | 0 | struct timeval now; |
859 | 0 | double elapsed_xlogs, |
860 | 0 | elapsed_time; |
861 | |
|
862 | 0 | Assert(ckpt_active); |
863 | | |
864 | | /* Scale progress according to checkpoint_completion_target. */ |
865 | 0 | progress *= CheckPointCompletionTarget; |
866 | | |
867 | | /* |
868 | | * Check against the cached value first. Only do the more expensive |
869 | | * calculations once we reach the target previously calculated. Since |
870 | | * neither time nor the WAL insert pointer moves backwards, a freshly |
871 | | * calculated value can only be greater than or equal to the cached value. |
872 | | */ |
873 | 0 | if (progress < ckpt_cached_elapsed) |
874 | 0 | return false; |
875 | | |
876 | | /* |
877 | | * Check progress against WAL segments written and CheckPointSegments. |
878 | | * |
879 | | * We compare the current WAL insert location against the location |
880 | | * computed before calling CreateCheckPoint. The code in XLogInsert that |
881 | | * actually triggers a checkpoint when CheckPointSegments is exceeded |
882 | | * compares against RedoRecPtr, so this is not completely accurate. |
883 | | * However, it's good enough for our purposes, we're only calculating an |
884 | | * estimate anyway. |
885 | | * |
886 | | * During recovery, we compare last replayed WAL record's location with |
887 | | * the location computed before calling CreateRestartPoint. That maintains |
888 | | * the same pacing as we have during checkpoints in normal operation, but |
889 | | * we might exceed max_wal_size by a fair amount. That's because there can |
890 | | * be a large gap between a checkpoint's redo-pointer and the checkpoint |
891 | | * record itself, and we only start the restartpoint after we've seen the |
892 | | * checkpoint record. (The gap is typically up to CheckPointSegments * |
893 | | * checkpoint_completion_target where checkpoint_completion_target is the |
894 | | * value that was in effect when the WAL was generated). |
895 | | */ |
896 | 0 | if (RecoveryInProgress()) |
897 | 0 | recptr = GetXLogReplayRecPtr(NULL); |
898 | 0 | else |
899 | 0 | recptr = GetInsertRecPtr(); |
900 | 0 | elapsed_xlogs = (((double) (recptr - ckpt_start_recptr)) / |
901 | 0 | wal_segment_size) / CheckPointSegments; |
902 | |
|
903 | 0 | if (progress < elapsed_xlogs) |
904 | 0 | { |
905 | 0 | ckpt_cached_elapsed = elapsed_xlogs; |
906 | 0 | return false; |
907 | 0 | } |
908 | | |
909 | | /* |
910 | | * Check progress against time elapsed and checkpoint_timeout. |
911 | | */ |
912 | 0 | gettimeofday(&now, NULL); |
913 | 0 | elapsed_time = ((double) ((pg_time_t) now.tv_sec - ckpt_start_time) + |
914 | 0 | now.tv_usec / 1000000.0) / CheckPointTimeout; |
915 | |
|
916 | 0 | if (progress < elapsed_time) |
917 | 0 | { |
918 | 0 | ckpt_cached_elapsed = elapsed_time; |
919 | 0 | return false; |
920 | 0 | } |
921 | | |
922 | | /* It looks like we're on schedule. */ |
923 | 0 | return true; |
924 | 0 | } |
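To make the scheduling check concrete (all numbers below are purely illustrative): with checkpoint_timeout = 300 s, checkpoint_completion_target = 0.9 and CheckPointSegments = 32, a checkpoint that has written half of its buffers passes progress = 0.5, which is scaled down to 0.45. It is still "on schedule" only while elapsed_time < 0.45 (less than 135 s since ckpt_start_time) and elapsed_xlogs < 0.45 (fewer than about 14.4 WAL segments inserted since ckpt_start_recptr); as soon as either ratio overtakes the scaled progress, CheckpointWriteDelay() stops napping and lets BufferSync() catch up.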
925 | | |
926 | | |
927 | | /* -------------------------------- |
928 | | * signal handler routines |
929 | | * -------------------------------- |
930 | | */ |
931 | | |
932 | | /* SIGINT: set flag to trigger writing of shutdown checkpoint */ |
933 | | static void |
934 | | ReqShutdownXLOG(SIGNAL_ARGS) |
935 | 0 | { |
936 | 0 | ShutdownXLOGPending = true; |
937 | 0 | SetLatch(MyLatch); |
938 | 0 | } |
939 | | |
940 | | |
941 | | /* -------------------------------- |
942 | | * communication with backends |
943 | | * -------------------------------- |
944 | | */ |
945 | | |
946 | | /* |
947 | | * CheckpointerShmemSize |
948 | | * Compute space needed for checkpointer-related shared memory |
949 | | */ |
950 | | Size |
951 | | CheckpointerShmemSize(void) |
952 | 0 | { |
953 | 0 | Size size; |
954 | | |
955 | | /* |
956 | | * The size of the requests[] array is arbitrarily set equal to NBuffers. |
957 | | * But there is a cap of MAX_CHECKPOINT_REQUESTS to prevent accumulating |
958 | | * too many checkpoint requests in the ring buffer. |
959 | | */ |
960 | 0 | size = offsetof(CheckpointerShmemStruct, requests); |
961 | 0 | size = add_size(size, mul_size(Min(NBuffers, |
962 | 0 | MAX_CHECKPOINT_REQUESTS), |
963 | 0 | sizeof(CheckpointerRequest))); |
964 | |
|
965 | 0 | return size; |
966 | 0 | } |
967 | | |
968 | | /* |
969 | | * CheckpointerShmemInit |
970 | | * Allocate and initialize checkpointer-related shared memory |
971 | | */ |
972 | | void |
973 | | CheckpointerShmemInit(void) |
974 | 0 | { |
975 | 0 | Size size = CheckpointerShmemSize(); |
976 | 0 | bool found; |
977 | |
|
978 | 0 | CheckpointerShmem = (CheckpointerShmemStruct *) |
979 | 0 | ShmemInitStruct("Checkpointer Data", |
980 | 0 | size, |
981 | 0 | &found); |
982 | |
|
983 | 0 | if (!found) |
984 | 0 | { |
985 | | /* |
986 | | * First time through, so initialize. Note that we zero the whole |
987 | | * requests array; this is so that CompactCheckpointerRequestQueue can |
988 | | * assume that any pad bytes in the request structs are zeroes. |
989 | | */ |
990 | 0 | MemSet(CheckpointerShmem, 0, size); |
991 | 0 | SpinLockInit(&CheckpointerShmem->ckpt_lck); |
992 | 0 | CheckpointerShmem->max_requests = Min(NBuffers, MAX_CHECKPOINT_REQUESTS); |
993 | 0 | CheckpointerShmem->head = CheckpointerShmem->tail = 0; |
994 | 0 | ConditionVariableInit(&CheckpointerShmem->start_cv); |
995 | 0 | ConditionVariableInit(&CheckpointerShmem->done_cv); |
996 | 0 | } |
997 | 0 | } |
998 | | |
999 | | /* |
1000 | | * ExecCheckpoint |
1001 | | * Primary entry point for manual CHECKPOINT commands |
1002 | | * |
1003 | | * This is mainly a wrapper for RequestCheckpoint(). |
1004 | | */ |
1005 | | void |
1006 | | ExecCheckpoint(ParseState *pstate, CheckPointStmt *stmt) |
1007 | 0 | { |
1008 | 0 | bool fast = true; |
1009 | 0 | bool unlogged = false; |
1010 | |
|
1011 | 0 | foreach_ptr(DefElem, opt, stmt->options) |
1012 | 0 | { |
1013 | 0 | if (strcmp(opt->defname, "mode") == 0) |
1014 | 0 | { |
1015 | 0 | char *mode = defGetString(opt); |
1016 | |
|
1017 | 0 | if (strcmp(mode, "spread") == 0) |
1018 | 0 | fast = false; |
1019 | 0 | else if (strcmp(mode, "fast") != 0) |
1020 | 0 | ereport(ERROR, |
1021 | 0 | (errcode(ERRCODE_SYNTAX_ERROR), |
1022 | 0 | errmsg("unrecognized MODE option \"%s\"", mode), |
1023 | 0 | parser_errposition(pstate, opt->location))); |
1024 | 0 | } |
1025 | 0 | else if (strcmp(opt->defname, "flush_unlogged") == 0) |
1026 | 0 | unlogged = defGetBoolean(opt); |
1027 | 0 | else |
1028 | 0 | ereport(ERROR, |
1029 | 0 | (errcode(ERRCODE_SYNTAX_ERROR), |
1030 | 0 | errmsg("unrecognized CHECKPOINT option \"%s\"", opt->defname), |
1031 | 0 | parser_errposition(pstate, opt->location))); |
1032 | 0 | } |
1033 | | |
1034 | 0 | if (!has_privs_of_role(GetUserId(), ROLE_PG_CHECKPOINT)) |
1035 | 0 | ereport(ERROR, |
1036 | 0 | (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), |
1037 | | /* translator: %s is name of an SQL command (e.g., CHECKPOINT) */ |
1038 | 0 | errmsg("permission denied to execute %s command", |
1039 | 0 | "CHECKPOINT"), |
1040 | 0 | errdetail("Only roles with privileges of the \"%s\" role may execute this command.", |
1041 | 0 | "pg_checkpoint"))); |
1042 | | |
1043 | 0 | RequestCheckpoint(CHECKPOINT_WAIT | |
1044 | 0 | (fast ? CHECKPOINT_FAST : 0) | |
1045 | 0 | (unlogged ? CHECKPOINT_FLUSH_UNLOGGED : 0) | |
1046 | 0 | (RecoveryInProgress() ? 0 : CHECKPOINT_FORCE)); |
1047 | 0 | } |
1048 | | |
1049 | | /* |
1050 | | * RequestCheckpoint |
1051 | | * Called in backend processes to request a checkpoint |
1052 | | * |
1053 | | * flags is a bitwise OR of the following: |
1054 | | * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown. |
1055 | | * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery. |
1056 | | * CHECKPOINT_FAST: finish the checkpoint ASAP, |
1057 | | * ignoring checkpoint_completion_target parameter. |
1058 | | * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred |
1059 | | * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or |
1060 | | * CHECKPOINT_END_OF_RECOVERY, and the CHECKPOINT command). |
1061 | | * CHECKPOINT_WAIT: wait for completion before returning (otherwise, |
1062 | | * just signal checkpointer to do it, and return). |
1063 | | * CHECKPOINT_CAUSE_XLOG: checkpoint is requested due to xlog filling. |
1064 | | * (This affects logging, and in particular enables CheckPointWarning.) |
1065 | | */ |
1066 | | void |
1067 | | RequestCheckpoint(int flags) |
1068 | 0 | { |
1069 | 0 | int ntries; |
1070 | 0 | int old_failed, |
1071 | 0 | old_started; |
1072 | | |
1073 | | /* |
1074 | | * If in a standalone backend, just do it ourselves. |
1075 | | */ |
1076 | 0 | if (!IsPostmasterEnvironment) |
1077 | 0 | { |
1078 | | /* |
1079 | | * There's no point in doing slow checkpoints in a standalone backend, |
1080 | | * because there are no other backends the checkpoint could disrupt. |
1081 | | */ |
1082 | 0 | CreateCheckPoint(flags | CHECKPOINT_FAST); |
1083 | | |
1084 | | /* Free all smgr objects, as CheckpointerMain() normally would. */ |
1085 | 0 | smgrdestroyall(); |
1086 | |
|
1087 | 0 | return; |
1088 | 0 | } |
1089 | | |
1090 | | /* |
1091 | | * Atomically set the request flags, and take a snapshot of the counters. |
1092 | | * When we see ckpt_started > old_started, we know the flags we set here |
1093 | | * have been seen by checkpointer. |
1094 | | * |
1095 | | * Note that we OR the flags with any existing flags, to avoid overriding |
1096 | | * a "stronger" request by another backend. The flag senses must be |
1097 | | * chosen to make this work! |
1098 | | */ |
1099 | 0 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
1100 | |
|
1101 | 0 | old_failed = CheckpointerShmem->ckpt_failed; |
1102 | 0 | old_started = CheckpointerShmem->ckpt_started; |
1103 | 0 | CheckpointerShmem->ckpt_flags |= (flags | CHECKPOINT_REQUESTED); |
1104 | |
|
1105 | 0 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
1106 | | |
1107 | | /* |
1108 | | * Set checkpointer's latch to request checkpoint. It's possible that the |
1109 | | * checkpointer hasn't started yet, so we will retry a few times if |
1110 | | * needed. (Actually, more than a few times, since on slow or overloaded |
1111 | | * buildfarm machines, it's been observed that the checkpointer can take |
1112 | | * several seconds to start.) However, if not told to wait for the |
1113 | | * checkpoint to occur, we consider failure to set the latch to be |
1114 | | * nonfatal and merely LOG it. The checkpointer should see the request |
1115 | | * when it does start, with or without the SetLatch(). |
1116 | | */ |
1117 | 0 | #define MAX_SIGNAL_TRIES 600 /* max wait 60.0 sec */ |
1118 | 0 | for (ntries = 0;; ntries++) |
1119 | 0 | { |
1120 | 0 | volatile PROC_HDR *procglobal = ProcGlobal; |
1121 | 0 | ProcNumber checkpointerProc = procglobal->checkpointerProc; |
1122 | |
|
1123 | 0 | if (checkpointerProc == INVALID_PROC_NUMBER) |
1124 | 0 | { |
1125 | 0 | if (ntries >= MAX_SIGNAL_TRIES || !(flags & CHECKPOINT_WAIT)) |
1126 | 0 | { |
1127 | 0 | elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG, |
1128 | 0 | "could not notify checkpoint: checkpointer is not running"); |
1129 | 0 | break; |
1130 | 0 | } |
1131 | 0 | } |
1132 | 0 | else |
1133 | 0 | { |
1134 | 0 | SetLatch(&GetPGProcByNumber(checkpointerProc)->procLatch); |
1135 | | /* notified successfully */ |
1136 | 0 | break; |
1137 | 0 | } |
1138 | | |
1139 | 0 | CHECK_FOR_INTERRUPTS(); |
1140 | 0 | pg_usleep(100000L); /* wait 0.1 sec, then retry */ |
1141 | 0 | } |
1142 | | |
1143 | | /* |
1144 | | * If requested, wait for completion. We detect completion according to |
1145 | | * the algorithm given above. |
1146 | | */ |
1147 | 0 | if (flags & CHECKPOINT_WAIT) |
1148 | 0 | { |
1149 | 0 | int new_started, |
1150 | 0 | new_failed; |
1151 | | |
1152 | | /* Wait for a new checkpoint to start. */ |
1153 | 0 | ConditionVariablePrepareToSleep(&CheckpointerShmem->start_cv); |
1154 | 0 | for (;;) |
1155 | 0 | { |
1156 | 0 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
1157 | 0 | new_started = CheckpointerShmem->ckpt_started; |
1158 | 0 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
1159 | |
|
1160 | 0 | if (new_started != old_started) |
1161 | 0 | break; |
1162 | | |
1163 | 0 | ConditionVariableSleep(&CheckpointerShmem->start_cv, |
1164 | 0 | WAIT_EVENT_CHECKPOINT_START); |
1165 | 0 | } |
1166 | 0 | ConditionVariableCancelSleep(); |
1167 | | |
1168 | | /* |
1169 | | * We are waiting for ckpt_done >= new_started, in a modulo sense. |
1170 | | */ |
1171 | 0 | ConditionVariablePrepareToSleep(&CheckpointerShmem->done_cv); |
1172 | 0 | for (;;) |
1173 | 0 | { |
1174 | 0 | int new_done; |
1175 | |
|
1176 | 0 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
1177 | 0 | new_done = CheckpointerShmem->ckpt_done; |
1178 | 0 | new_failed = CheckpointerShmem->ckpt_failed; |
1179 | 0 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
1180 | |
|
1181 | 0 | if (new_done - new_started >= 0) |
1182 | 0 | break; |
1183 | | |
1184 | 0 | ConditionVariableSleep(&CheckpointerShmem->done_cv, |
1185 | 0 | WAIT_EVENT_CHECKPOINT_DONE); |
1186 | 0 | } |
1187 | 0 | ConditionVariableCancelSleep(); |
1188 | |
|
1189 | 0 | if (new_failed != old_failed) |
1190 | 0 | ereport(ERROR, |
1191 | 0 | (errmsg("checkpoint request failed"), |
1192 | 0 | errhint("Consult recent messages in the server log for details."))); |
1193 | 0 | } |
1194 | 0 | } |
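A hypothetical caller combining the flags documented above might force an immediate checkpoint and block until it finishes, roughly as in the sketch below (the wrapper function is invented for illustration; the flag values are the ones defined in xlog.h and described in the header comment):

```c
/* Illustrative only: request a fast, forced checkpoint and wait for it. */
static void
force_checkpoint_now(void)
{
	RequestCheckpoint(CHECKPOINT_FAST |		/* don't spread the writes */
					  CHECKPOINT_FORCE |	/* do it even without new WAL activity */
					  CHECKPOINT_WAIT);		/* block until completion */
}
```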
1195 | | |
1196 | | /* |
1197 | | * ForwardSyncRequest |
1198 | | * Forward a file-fsync request from a backend to the checkpointer |
1199 | | * |
1200 | | * Whenever a backend is compelled to write directly to a relation |
1201 | | * (which should be seldom, if the background writer is getting its job done), |
1202 | | * the backend calls this routine to pass over knowledge that the relation |
1203 | | * is dirty and must be fsync'd before next checkpoint. We also use this |
1204 | | * opportunity to count such writes for statistical purposes. |
1205 | | * |
1206 | | * To avoid holding the lock for longer than necessary, we normally write |
1207 | | * to the requests[] queue without checking for duplicates. The checkpointer |
1208 | | * will have to eliminate dups internally anyway. However, if we discover |
1209 | | * that the queue is full, we make a pass over the entire queue to compact |
1210 | | * it. This is somewhat expensive, but the alternative is for the backend |
1211 | | * to perform its own fsync, which is far more expensive in practice. It |
1212 | | * is theoretically possible a backend fsync might still be necessary, if |
1213 | | * the queue is full and contains no duplicate entries. In that case, we |
1214 | | * let the backend know by returning false. |
1215 | | */ |
1216 | | bool |
1217 | | ForwardSyncRequest(const FileTag *ftag, SyncRequestType type) |
1218 | 0 | { |
1219 | 0 | CheckpointerRequest *request; |
1220 | 0 | bool too_full; |
1221 | 0 | int insert_pos; |
1222 | |
|
1223 | 0 | if (!IsUnderPostmaster) |
1224 | 0 | return false; /* probably shouldn't even get here */ |
1225 | | |
1226 | 0 | if (AmCheckpointerProcess()) |
1227 | 0 | elog(ERROR, "ForwardSyncRequest must not be called in checkpointer"); |
1228 | | |
1229 | 0 | LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); |
1230 | | |
1231 | | /* |
1232 | | * If the checkpointer isn't running or the request queue is full, the |
1233 | | * backend will have to perform its own fsync request. But before forcing |
1234 | | * that to happen, we can try to compact the request queue. |
1235 | | */ |
1236 | 0 | if (CheckpointerShmem->checkpointer_pid == 0 || |
1237 | 0 | (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests && |
1238 | 0 | !CompactCheckpointerRequestQueue())) |
1239 | 0 | { |
1240 | 0 | LWLockRelease(CheckpointerCommLock); |
1241 | 0 | return false; |
1242 | 0 | } |
1243 | | |
1244 | | /* OK, insert request */ |
1245 | 0 | insert_pos = CheckpointerShmem->tail; |
1246 | 0 | request = &CheckpointerShmem->requests[insert_pos]; |
1247 | 0 | request->ftag = *ftag; |
1248 | 0 | request->type = type; |
1249 | |
|
1250 | 0 | CheckpointerShmem->tail = (CheckpointerShmem->tail + 1) % CheckpointerShmem->max_requests; |
1251 | 0 | CheckpointerShmem->num_requests++; |
1252 | | |
1253 | | /* If queue is more than half full, nudge the checkpointer to empty it */ |
1254 | 0 | too_full = (CheckpointerShmem->num_requests >= |
1255 | 0 | CheckpointerShmem->max_requests / 2); |
1256 | |
|
1257 | 0 | LWLockRelease(CheckpointerCommLock); |
1258 | | |
1259 | | /* ... but not till after we release the lock */ |
1260 | 0 | if (too_full) |
1261 | 0 | { |
1262 | 0 | volatile PROC_HDR *procglobal = ProcGlobal; |
1263 | 0 | ProcNumber checkpointerProc = procglobal->checkpointerProc; |
1264 | |
|
1265 | 0 | if (checkpointerProc != INVALID_PROC_NUMBER) |
1266 | 0 | SetLatch(&GetPGProcByNumber(checkpointerProc)->procLatch); |
1267 | 0 | } |
1268 | |
|
1269 | 0 | return true; |
1270 | 0 | } |
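When ForwardSyncRequest() returns false, the backend has to perform the fsync itself, as the header comment explains. A caller-side sketch might look like the following (the wrapper and the fallback helper are placeholders; the real logic lives in sync.c and md.c):

```c
/* Sketch of the caller-side fallback; names other than ForwardSyncRequest are illustrative. */
static void
remember_dirty_file(const FileTag *tag)
{
	if (!ForwardSyncRequest(tag, SYNC_REQUEST))
	{
		/*
		 * The queue was full (and could not be compacted) or the
		 * checkpointer is not running: fsync the file here instead of
		 * deferring the work to the next checkpoint.
		 */
		fsync_file_directly(tag);	/* placeholder for the real fallback */
	}
}
```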
1271 | | |
1272 | | /* |
1273 | | * CompactCheckpointerRequestQueue |
1274 | | * Remove duplicates from the request queue to avoid backend fsyncs. |
1275 | | * Returns "true" if any entries were removed. |
1276 | | * |
1277 | | * Although a full fsync request queue is not common, it can lead to severe |
1278 | | * performance problems when it does happen. So far, this situation has |
1279 | | * only been observed to occur when the system is under heavy write load, |
1280 | | * and especially during the "sync" phase of a checkpoint. Without this |
1281 | | * logic, each backend begins doing an fsync for every block written, which |
1282 | | * gets very expensive and can slow down the whole system. |
1283 | | * |
1284 | | * Trying to do this every time the queue is full could lose if there |
1285 | | * aren't any removable entries. But that should be vanishingly rare in |
1286 | | * practice: there's one queue entry per shared buffer. |
1287 | | */ |
1288 | | static bool |
1289 | | CompactCheckpointerRequestQueue(void) |
1290 | | { |
1291 | | struct CheckpointerSlotMapping |
1292 | | { |
1293 | | CheckpointerRequest request; |
1294 | | int ring_idx; |
1295 | | }; |
1296 | | |
1297 | | int n; |
1298 | | int num_skipped = 0; |
1299 | | int head; |
1300 | | int max_requests; |
1301 | | int num_requests; |
1302 | | int read_idx, |
1303 | | write_idx; |
1304 | | HASHCTL ctl; |
1305 | | HTAB *htab; |
1306 | | bool *skip_slot; |
1307 | | |
1308 | | /* must hold CheckpointerCommLock in exclusive mode */ |
1309 | | Assert(LWLockHeldByMe(CheckpointerCommLock)); |
1310 | | |
1311 | | /* Avoid memory allocations in a critical section. */ |
1312 | | if (CritSectionCount > 0) |
1313 | | return false; |
1314 | | |
1315 | | max_requests = CheckpointerShmem->max_requests; |
1316 | | num_requests = CheckpointerShmem->num_requests; |
1317 | | |
1318 | | /* Initialize skip_slot array */ |
1319 | | skip_slot = palloc0(sizeof(bool) * max_requests); |
1320 | | |
1321 | | head = CheckpointerShmem->head; |
1322 | | |
1323 | | /* Initialize temporary hash table */ |
1324 | | ctl.keysize = sizeof(CheckpointerRequest); |
1325 | | ctl.entrysize = sizeof(struct CheckpointerSlotMapping); |
1326 | | ctl.hcxt = CurrentMemoryContext; |
1327 | | |
1328 | | htab = hash_create("CompactCheckpointerRequestQueue", |
1329 | | CheckpointerShmem->num_requests, |
1330 | | &ctl, |
1331 | | HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
1332 | | |
1333 | | /* |
1334 | | * The basic idea here is that a request can be skipped if it's followed |
1335 | | * by a later, identical request. It might seem more sensible to work |
1336 | | * backwards from the end of the queue and check whether a request is |
1337 | | * *preceded* by an earlier, identical request, in the hopes of doing less |
1338 | | * copying. But that might change the semantics, if there's an |
1339 | | * intervening SYNC_FORGET_REQUEST or SYNC_FILTER_REQUEST, so we do it |
1340 | | * this way. It would be possible to be even smarter if we made the code |
1341 | | * below understand the specific semantics of such requests (it could blow |
1342 | | * away preceding entries that would end up being canceled anyhow), but |
1343 | | * it's not clear that the extra complexity would buy us anything. |
1344 | | */ |
1345 | | read_idx = head; |
1346 | | for (n = 0; n < num_requests; n++) |
1347 | | { |
1348 | | CheckpointerRequest *request; |
1349 | | struct CheckpointerSlotMapping *slotmap; |
1350 | | bool found; |
1351 | | |
1352 | | /* |
1353 | | * We use the request struct directly as a hashtable key. This |
1354 | | * assumes that any padding bytes in the structs are consistently the |
1355 | | * same, which should be okay because we zeroed them in |
1356 | | * CheckpointerShmemInit. Note also that RelFileLocator had better |
1357 | | * contain no pad bytes. |
1358 | | */ |
1359 | | request = &CheckpointerShmem->requests[read_idx]; |
1360 | | slotmap = hash_search(htab, request, HASH_ENTER, &found); |
1361 | | if (found) |
1362 | | { |
1363 | | /* Duplicate, so mark the previous occurrence as skippable */ |
1364 | | skip_slot[slotmap->ring_idx] = true; |
1365 | | num_skipped++; |
1366 | | } |
1367 | | /* Remember slot containing latest occurrence of this request value */ |
1368 | | slotmap->ring_idx = read_idx; |
1369 | | |
1370 | | /* Move to the next request in the ring buffer */ |
1371 | | read_idx = (read_idx + 1) % max_requests; |
1372 | | } |
1373 | | |
1374 | | /* Done with the hash table. */ |
1375 | | hash_destroy(htab); |
1376 | | |
1377 | | /* If no duplicates, we're out of luck. */ |
1378 | | if (!num_skipped) |
1379 | | { |
1380 | | pfree(skip_slot); |
1381 | | return false; |
1382 | | } |
1383 | | |
1384 | | /* We found some duplicates; remove them. */ |
1385 | | read_idx = write_idx = head; |
1386 | | for (n = 0; n < num_requests; n++) |
1387 | | { |
1388 | | /* If this slot is NOT skipped, keep it */ |
1389 | | if (!skip_slot[read_idx]) |
1390 | | { |
1391 | | /* If the read and write positions are different, copy the request */ |
1392 | | if (write_idx != read_idx) |
1393 | | CheckpointerShmem->requests[write_idx] = |
1394 | | CheckpointerShmem->requests[read_idx]; |
1395 | | |
1396 | | /* Advance the write position */ |
1397 | | write_idx = (write_idx + 1) % max_requests; |
1398 | | } |
1399 | | |
1400 | | read_idx = (read_idx + 1) % max_requests; |
1401 | | } |
1402 | | |
1403 | | /* |
1404 | | * Update ring buffer state: head remains the same, tail moves, count |
1405 | | * decreases |
1406 | | */ |
1407 | | CheckpointerShmem->tail = write_idx; |
1408 | | CheckpointerShmem->num_requests -= num_skipped; |
1409 | | |
1410 | | ereport(DEBUG1, |
1411 | | (errmsg_internal("compacted fsync request queue from %d entries to %d entries", |
1412 | | num_requests, CheckpointerShmem->num_requests))); |
1413 | | |
1414 | | /* Cleanup. */ |
1415 | | pfree(skip_slot); |
1416 | | return true; |
1417 | | } |
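As a small illustration of the dedup semantics (letters stand for identical request structs): if the ring buffer holds A, B, A, C in that order, the hash pass marks the first A as skippable because a later identical request exists, and the compaction pass rewrites the queue as B, A, C, so num_requests drops from 4 to 3 while the relative order of the surviving entries is preserved.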
1418 | | |
1419 | | /* |
1420 | | * AbsorbSyncRequests |
1421 | | * Retrieve queued sync requests and pass them to sync mechanism. |
1422 | | * |
1423 | | * This is exported because it must be called during CreateCheckPoint; |
1424 | | * we have to be sure we have accepted all pending requests just before |
1425 | | * we start fsync'ing. Since CreateCheckPoint sometimes runs in |
1426 | | * non-checkpointer processes, do nothing if not checkpointer. |
1427 | | */ |
1428 | | void |
1429 | | AbsorbSyncRequests(void) |
1430 | 0 | { |
1431 | 0 | CheckpointerRequest *requests = NULL; |
1432 | 0 | CheckpointerRequest *request; |
1433 | 0 | int n, |
1434 | 0 | i; |
1435 | 0 | bool loop; |
1436 | |
|
1437 | 0 | if (!AmCheckpointerProcess()) |
1438 | 0 | return; |
1439 | | |
1440 | 0 | do |
1441 | 0 | { |
1442 | 0 | LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE); |
1443 | | |
1444 | | /*--- |
1445 | | * We try to avoid holding the lock for a long time by: |
1446 | | * 1. Copying the request array and processing the requests after |
1447 | | * releasing the lock; |
1448 | | * 2. Processing not the whole queue, but only batches of |
1449 | | * CKPT_REQ_BATCH_SIZE at once. |
1450 | | * |
1451 | | * Once we have cleared the requests from shared memory, we must |
1452 | | * PANIC if we then fail to absorb them (e.g., because our hashtable |
1453 | | * runs out of memory). This is because the system cannot run safely |
1454 | | * if we are unable to fsync what we have been told to fsync. |
1455 | | * Fortunately, the hashtable is so small that the problem is quite |
1456 | | * unlikely to arise in practice. |
1457 | | * |
1458 | | * Note: The maximum possible size of a ring buffer is |
1459 | | * MAX_CHECKPOINT_REQUESTS entries, which fit into a maximum palloc |
1460 | | * allocation size of 1Gb. Our maximum batch size, |
1461 | | * CKPT_REQ_BATCH_SIZE, is even smaller. |
1462 | | */ |
1463 | 0 | n = Min(CheckpointerShmem->num_requests, CKPT_REQ_BATCH_SIZE); |
1464 | 0 | if (n > 0) |
1465 | 0 | { |
1466 | 0 | if (!requests) |
1467 | 0 | requests = (CheckpointerRequest *) palloc(n * sizeof(CheckpointerRequest)); |
1468 | |
|
1469 | 0 | for (i = 0; i < n; i++) |
1470 | 0 | { |
1471 | 0 | requests[i] = CheckpointerShmem->requests[CheckpointerShmem->head]; |
1472 | 0 | CheckpointerShmem->head = (CheckpointerShmem->head + 1) % CheckpointerShmem->max_requests; |
1473 | 0 | } |
1474 | |
|
1475 | 0 | CheckpointerShmem->num_requests -= n; |
1476 | |
|
1477 | 0 | } |
1478 | |
|
1479 | 0 | START_CRIT_SECTION(); |
1480 | | |
1481 | | /* Are there any requests in the queue? If so, keep going. */ |
1482 | 0 | loop = CheckpointerShmem->num_requests != 0; |
1483 | |
|
1484 | 0 | LWLockRelease(CheckpointerCommLock); |
1485 | |
|
1486 | 0 | for (request = requests; n > 0; request++, n--) |
1487 | 0 | RememberSyncRequest(&request->ftag, request->type); |
1488 | |
|
1489 | 0 | END_CRIT_SECTION(); |
1490 | 0 | } while (loop); |
1491 | |
|
1492 | 0 | if (requests) |
1493 | 0 | pfree(requests); |
1494 | 0 | } |
1495 | | |
1496 | | /* |
1497 | | * Update any shared memory configurations based on config parameters |
1498 | | */ |
1499 | | static void |
1500 | | UpdateSharedMemoryConfig(void) |
1501 | | { |
1502 | | /* update global shmem state for sync rep */ |
1503 | | SyncRepUpdateSyncStandbysDefined(); |
1504 | | |
1505 | | /* |
1506 | | * If full_page_writes has been changed by SIGHUP, we update it in shared |
1507 | | * memory and write an XLOG_FPW_CHANGE record. |
1508 | | */ |
1509 | | UpdateFullPageWrites(); |
1510 | | |
1511 | | elog(DEBUG2, "checkpointer updated shared memory configuration values"); |
1512 | | } |
1513 | | |
1514 | | /* |
1515 | | * FirstCallSinceLastCheckpoint allows a process to take an action once |
1516 | | * per checkpoint cycle by asynchronously checking for checkpoint completion. |
1517 | | */ |
1518 | | bool |
1519 | | FirstCallSinceLastCheckpoint(void) |
1520 | 0 | { |
1521 | 0 | static int ckpt_done = 0; |
1522 | 0 | int new_done; |
1523 | 0 | bool FirstCall = false; |
1524 | |
|
1525 | 0 | SpinLockAcquire(&CheckpointerShmem->ckpt_lck); |
1526 | 0 | new_done = CheckpointerShmem->ckpt_done; |
1527 | 0 | SpinLockRelease(&CheckpointerShmem->ckpt_lck); |
1528 | |
|
1529 | 0 | if (new_done != ckpt_done) |
1530 | 0 | FirstCall = true; |
1531 | |
|
1532 | 0 | ckpt_done = new_done; |
1533 | |
|
1534 | 0 | return FirstCall; |
1535 | 0 | } |
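A minimal sketch of how a long-lived process might use FirstCallSinceLastCheckpoint() to do some work exactly once per completed checkpoint cycle (the loop body and the housekeeping call are placeholders, not code from this tree):

```c
/* Illustrative main-loop fragment for some auxiliary process. */
for (;;)
{
	if (FirstCallSinceLastCheckpoint())
	{
		/* Runs once after each checkpoint completion is observed. */
		do_once_per_checkpoint_housekeeping();	/* placeholder */
	}

	/* ... the process's normal work and sleep would go here ... */
}
```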