/src/postgres/src/backend/storage/sync/sync.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * sync.c |
4 | | * File synchronization management code. |
5 | | * |
6 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/backend/storage/sync/sync.c |
12 | | * |
13 | | *------------------------------------------------------------------------- |
14 | | */ |
15 | | #include "postgres.h" |
16 | | |
17 | | #include <unistd.h> |
18 | | #include <fcntl.h> |
19 | | #include <sys/file.h> |
20 | | |
21 | | #include "access/clog.h" |
22 | | #include "access/commit_ts.h" |
23 | | #include "access/multixact.h" |
24 | | #include "access/xlog.h" |
25 | | #include "miscadmin.h" |
26 | | #include "pgstat.h" |
27 | | #include "portability/instr_time.h" |
28 | | #include "postmaster/bgwriter.h" |
29 | | #include "storage/fd.h" |
30 | | #include "storage/latch.h" |
31 | | #include "storage/md.h" |
32 | | #include "utils/hsearch.h" |
33 | | #include "utils/memutils.h" |
34 | | |
35 | | /* |
36 | | * In some contexts (currently, standalone backends and the checkpointer) |
37 | | * we keep track of pending fsync operations: we need to remember all relation |
38 | | * segments that have been written since the last checkpoint, so that we can |
39 | | * fsync them down to disk before completing the next checkpoint. This hash |
40 | | * table remembers the pending operations. We use a hash table mostly as |
41 | | * a convenient way of merging duplicate requests. |
42 | | * |
43 | | * We use a similar mechanism to remember no-longer-needed files that can |
44 | | * be deleted after the next checkpoint, but we use a linked list instead of |
45 | | * a hash table, because we don't expect there to be any duplicate requests. |
46 | | * |
47 | | * These mechanisms are only used for non-temp relations; we never fsync |
48 | | * temp rels, nor do we need to postpone their deletion (see comments in |
49 | | * mdunlink). |
50 | | * |
51 | | * (Regular backends do not track pending operations locally, but forward |
52 | | * them to the checkpointer.) |
53 | | */ |
54 | | typedef uint16 CycleCtr; /* can be any convenient integer size */ |
55 | | |
56 | | typedef struct |
57 | | { |
58 | | FileTag tag; /* identifies handler and file */ |
59 | | CycleCtr cycle_ctr; /* sync_cycle_ctr of oldest request */ |
60 | | bool canceled; /* true if request was canceled "recently" */ |
61 | | } PendingFsyncEntry; |
62 | | |
63 | | typedef struct |
64 | | { |
65 | | FileTag tag; /* identifies handler and file */ |
66 | | CycleCtr cycle_ctr; /* checkpoint_cycle_ctr when request was made */ |
67 | | bool canceled; /* true if request has been canceled */ |
68 | | } PendingUnlinkEntry; |
69 | | |
70 | | static HTAB *pendingOps = NULL; |
71 | | static List *pendingUnlinks = NIL; |
72 | | static MemoryContext pendingOpsCxt; /* context for the above */ |
73 | | |
74 | | static CycleCtr sync_cycle_ctr = 0; |
75 | | static CycleCtr checkpoint_cycle_ctr = 0; |
76 | | |
77 | | /* Intervals for calling AbsorbSyncRequests */ |
78 | 0 | #define FSYNCS_PER_ABSORB 10 |
79 | 0 | #define UNLINKS_PER_ABSORB 10 |
80 | | |
81 | | /* |
82 | | * Function pointers for handling sync and unlink requests. |
83 | | */ |
84 | | typedef struct SyncOps |
85 | | { |
86 | | int (*sync_syncfiletag) (const FileTag *ftag, char *path); |
87 | | int (*sync_unlinkfiletag) (const FileTag *ftag, char *path); |
88 | | bool (*sync_filetagmatches) (const FileTag *ftag, |
89 | | const FileTag *candidate); |
90 | | } SyncOps; |
91 | | |
92 | | /* |
93 | | * These indexes must correspond to the values of the SyncRequestHandler enum. |
94 | | */ |
95 | | static const SyncOps syncsw[] = { |
96 | | /* magnetic disk */ |
97 | | [SYNC_HANDLER_MD] = { |
98 | | .sync_syncfiletag = mdsyncfiletag, |
99 | | .sync_unlinkfiletag = mdunlinkfiletag, |
100 | | .sync_filetagmatches = mdfiletagmatches |
101 | | }, |
102 | | /* pg_xact */ |
103 | | [SYNC_HANDLER_CLOG] = { |
104 | | .sync_syncfiletag = clogsyncfiletag |
105 | | }, |
106 | | /* pg_commit_ts */ |
107 | | [SYNC_HANDLER_COMMIT_TS] = { |
108 | | .sync_syncfiletag = committssyncfiletag |
109 | | }, |
110 | | /* pg_multixact/offsets */ |
111 | | [SYNC_HANDLER_MULTIXACT_OFFSET] = { |
112 | | .sync_syncfiletag = multixactoffsetssyncfiletag |
113 | | }, |
114 | | /* pg_multixact/members */ |
115 | | [SYNC_HANDLER_MULTIXACT_MEMBER] = { |
116 | | .sync_syncfiletag = multixactmemberssyncfiletag |
117 | | } |
118 | | }; |
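
Each FileTag records which handler owns it, so a request is routed with a
plain array lookup into syncsw. A minimal sketch of the dispatch pattern
(the tag setup here is illustrative, not a real call site; the callback
fills in path for error reporting):

    FileTag tag = {0};
    char    path[MAXPGPATH];

    tag.handler = SYNC_HANDLER_MD;  /* md requests reach mdsyncfiletag() */
    if (syncsw[tag.handler].sync_syncfiletag(&tag, path) != 0)
        elog(WARNING, "could not fsync file \"%s\": %m", path);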
119 | | |
120 | | /* |
121 | | * Initialize data structures for the file sync tracking. |
122 | | */ |
123 | | void |
124 | | InitSync(void) |
125 | 0 | { |
126 | | /* |
127 | | * Create pending-operations hashtable if we need it. Currently, we need |
128 | | * it if we are standalone (not under a postmaster) or if we are a |
129 | | * checkpointer auxiliary process. |
130 | | */ |
131 | 0 | if (!IsUnderPostmaster || AmCheckpointerProcess()) |
132 | 0 | { |
133 | 0 | HASHCTL hash_ctl; |
134 | | |
135 | | /* |
136 | | * XXX: The checkpointer needs to add entries to the pending ops table |
137 | | * when absorbing fsync requests. That is done within a critical |
138 | | * section, which isn't usually allowed, but we make an exception. It |
139 | | * means that there's a theoretical possibility that you run out of |
140 | | * memory while absorbing fsync requests, which leads to a PANIC. |
141 | | * Fortunately the hash table is small so that's unlikely to happen in |
142 | | * practice. |
143 | | */ |
144 | 0 | pendingOpsCxt = AllocSetContextCreate(TopMemoryContext, |
145 | 0 | "Pending ops context", |
146 | 0 | ALLOCSET_DEFAULT_SIZES); |
147 | 0 | MemoryContextAllowInCriticalSection(pendingOpsCxt, true); |
148 | | |
149 | 0 | hash_ctl.keysize = sizeof(FileTag); |
150 | 0 | hash_ctl.entrysize = sizeof(PendingFsyncEntry); |
151 | 0 | hash_ctl.hcxt = pendingOpsCxt; |
152 | 0 | pendingOps = hash_create("Pending Ops Table", |
153 | 0 | 100L, |
154 | 0 | &hash_ctl, |
155 | 0 | HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
156 | 0 | pendingUnlinks = NIL; |
157 | 0 | } |
158 | 0 | } |
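
Because the table is keyed on the whole FileTag (HASH_BLOBS hashes the raw
key bytes), forwarding the same segment twice lands on the same entry, which
is the "merging duplicate requests" behavior described at the top of this
file. A sketch, assuming an already-initialized FileTag tag:

    bool found;

    (void) hash_search(pendingOps, &tag, HASH_ENTER, &found);
    /* found == false: the first request created the entry */
    (void) hash_search(pendingOps, &tag, HASH_ENTER, &found);
    /* found == true: the duplicate was merged into the existing entry */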
159 | | |
160 | | /* |
161 | | * SyncPreCheckpoint() -- Do pre-checkpoint work |
162 | | * |
163 | | * To distinguish unlink requests that arrived before this checkpoint |
164 | | * started from those that arrived during the checkpoint, we use a cycle |
165 | | * counter similar to the one we use for fsync requests. That cycle |
166 | | * counter is incremented here. |
167 | | * |
168 | | * This must be called *before* the checkpoint REDO point is determined. |
169 | | * That ensures that we won't delete files too soon. Since this calls |
170 | | * AbsorbSyncRequests(), which performs memory allocations, it cannot be |
171 | | * called within a critical section. |
172 | | * |
173 | | * Note that we can't do anything here that depends on the assumption |
174 | | * that the checkpoint will be completed. |
175 | | */ |
176 | | void |
177 | | SyncPreCheckpoint(void) |
178 | 0 | { |
179 | | /* |
180 | | * Operations such as DROP TABLESPACE assume that the next checkpoint will |
181 | | * process all recently forwarded unlink requests, but if they aren't |
182 | | * absorbed prior to advancing the cycle counter, they won't be processed |
183 | | * until a future checkpoint. The following absorb ensures that any |
184 | | * unlink requests forwarded before the checkpoint began will be processed |
185 | | * in the current checkpoint. |
186 | | */ |
187 | 0 | AbsorbSyncRequests(); |
188 | | |
189 | | /* |
190 | | * Any unlink requests arriving after this point will be assigned the next |
191 | | * cycle counter, and won't be unlinked until next checkpoint. |
192 | | */ |
193 | 0 | checkpoint_cycle_ctr++; |
194 | 0 | } |
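
In the checkpointer this call brackets the checkpoint together with
SyncPostCheckpoint(); a simplified sketch of the ordering (the authoritative
sequence is in CreateCheckPoint()):

    SyncPreCheckpoint();     /* absorb old unlinks, bump checkpoint_cycle_ctr */
    /* ... establish the REDO point, write out dirty buffers ... */
    ProcessSyncRequests();   /* fsync everything queued before this cycle */
    /* ... write and flush the checkpoint record ... */
    SyncPostCheckpoint();    /* now safe to unlink files from older cycles */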
195 | | |
196 | | /* |
197 | | * SyncPostCheckpoint() -- Do post-checkpoint work |
198 | | * |
199 | | * Remove any lingering files that can now be safely removed. |
200 | | */ |
201 | | void |
202 | | SyncPostCheckpoint(void) |
203 | 0 | { |
204 | 0 | int absorb_counter; |
205 | 0 | ListCell *lc; |
206 | | |
207 | 0 | absorb_counter = UNLINKS_PER_ABSORB; |
208 | 0 | foreach(lc, pendingUnlinks) |
209 | 0 | { |
210 | 0 | PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(lc); |
211 | 0 | char path[MAXPGPATH]; |
212 | | |
213 | | /* Skip over any canceled entries */ |
214 | 0 | if (entry->canceled) |
215 | 0 | continue; |
216 | | |
217 | | /* |
218 | | * New entries are appended to the end, so if the entry is new we've |
219 | | * reached the end of old entries. |
220 | | * |
221 | | * Note: if just the right number of consecutive checkpoints fail, we |
222 | | * could be fooled here by cycle_ctr wraparound. However, the only |
223 | | * consequence is that we'd delay unlinking for one more checkpoint, |
224 | | * which is perfectly tolerable. |
225 | | */ |
226 | 0 | if (entry->cycle_ctr == checkpoint_cycle_ctr) |
227 | 0 | break; |
228 | | |
229 | | /* Unlink the file */ |
230 | 0 | if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag, |
231 | 0 | path) < 0) |
232 | 0 | { |
233 | | /* |
234 | | * There's a race condition when the database is dropped at the |
235 | | * same time that we process the pending unlink requests. If the |
236 | | * DROP DATABASE deletes the file before we do, we will get ENOENT |
237 | | * here. rmtree() also has to ignore ENOENT errors, to deal with |
238 | | * the possibility that we delete the file first. |
239 | | */ |
240 | 0 | if (errno != ENOENT) |
241 | 0 | ereport(WARNING, |
242 | 0 | (errcode_for_file_access(), |
243 | 0 | errmsg("could not remove file \"%s\": %m", path))); |
244 | 0 | } |
245 | | |
246 | | /* Mark the list entry as canceled, just in case */ |
247 | 0 | entry->canceled = true; |
248 | | |
249 | | /* |
250 | | * As in ProcessSyncRequests, we don't want to stop absorbing fsync |
251 | | * requests for a long time when there are many deletions to be done. |
252 | | * We can safely call AbsorbSyncRequests() at this point in the loop. |
253 | | */ |
254 | 0 | if (--absorb_counter <= 0) |
255 | 0 | { |
256 | 0 | AbsorbSyncRequests(); |
257 | 0 | absorb_counter = UNLINKS_PER_ABSORB; |
258 | 0 | } |
259 | 0 | } |
260 | | |
261 | | /* |
262 | | * If we reached the end of the list, we can just remove the whole list |
263 | | * (remembering to pfree all the PendingUnlinkEntry objects). Otherwise, |
264 | | * we must keep the entries at or after "lc". |
265 | | */ |
266 | 0 | if (lc == NULL) |
267 | 0 | { |
268 | 0 | list_free_deep(pendingUnlinks); |
269 | 0 | pendingUnlinks = NIL; |
270 | 0 | } |
271 | 0 | else |
272 | 0 | { |
273 | 0 | int ntodelete = list_cell_number(pendingUnlinks, lc); |
274 | | |
275 | 0 | for (int i = 0; i < ntodelete; i++) |
276 | 0 | pfree(list_nth(pendingUnlinks, i)); |
277 | | |
278 | 0 | pendingUnlinks = list_delete_first_n(pendingUnlinks, ntodelete); |
279 | 0 | } |
280 | 0 | } |
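
The wraparound tolerance noted above works because CycleCtr is an unsigned
16-bit type, so "exactly one cycle apart" survives overflow under modular
arithmetic:

    CycleCtr entry_ctr = 65535;             /* stamped just before wraparound */
    CycleCtr current_ctr = entry_ctr + 1;   /* wraps around to 0 */

    /* the "exactly one cycle old" test used in ProcessSyncRequests() holds */
    Assert((CycleCtr) (entry_ctr + 1) == current_ctr);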
281 | | |
282 | | /* |
283 | | * ProcessSyncRequests() -- Process queued fsync requests. |
284 | | */ |
285 | | void |
286 | | ProcessSyncRequests(void) |
287 | 0 | { |
288 | 0 | static bool sync_in_progress = false; |
289 | | |
290 | 0 | HASH_SEQ_STATUS hstat; |
291 | 0 | PendingFsyncEntry *entry; |
292 | 0 | int absorb_counter; |
293 | | |
294 | | /* Statistics on sync times */ |
295 | 0 | int processed = 0; |
296 | 0 | instr_time sync_start, |
297 | 0 | sync_end, |
298 | 0 | sync_diff; |
299 | 0 | uint64 elapsed; |
300 | 0 | uint64 longest = 0; |
301 | 0 | uint64 total_elapsed = 0; |
302 | | |
303 | | /* |
304 | | * This is only called during checkpoints, and checkpoints should only |
305 | | * occur in processes that have created a pendingOps. |
306 | | */ |
307 | 0 | if (!pendingOps) |
308 | 0 | elog(ERROR, "cannot sync without a pendingOps table"); |
309 | | |
310 | | /* |
311 | | * If we are in the checkpointer, the sync had better include all fsync |
312 | | * requests that were queued by backends up to this point. The tightest |
313 | | * race condition that could occur is that a buffer that must be written |
314 | | * and fsync'd for the checkpoint could have been dumped by a backend just |
315 | | * before it was visited by BufferSync(). We know the backend will have |
316 | | * queued an fsync request before clearing the buffer's dirtybit, so we |
317 | | * are safe as long as we do an Absorb after completing BufferSync(). |
318 | | */ |
319 | 0 | AbsorbSyncRequests(); |
320 | | |
321 | | /* |
322 | | * To avoid excess fsync'ing (in the worst case, maybe a never-terminating |
323 | | * checkpoint), we want to ignore fsync requests that are entered into the |
324 | | * hashtable after this point --- they should be processed next time, |
325 | | * instead. We use sync_cycle_ctr to tell old entries apart from new |
326 | | * ones: new ones will have cycle_ctr equal to the incremented value of |
327 | | * sync_cycle_ctr. |
328 | | * |
329 | | * In normal circumstances, all entries present in the table at this point |
330 | | * will have cycle_ctr exactly equal to the current (about to be old) |
331 | | * value of sync_cycle_ctr. However, if we fail partway through the |
332 | | * fsync'ing loop, then older values of cycle_ctr might remain when we |
333 | | * come back here to try again. Repeated checkpoint failures would |
334 | | * eventually wrap the counter around to the point where an old entry |
335 | | * might appear new, causing us to skip it, possibly allowing a checkpoint |
336 | | * to succeed that should not have. To forestall wraparound, any time the |
337 | | * previous ProcessSyncRequests() failed to complete, run through the |
338 | | * table and forcibly set cycle_ctr = sync_cycle_ctr. |
339 | | * |
340 | | * Think not to merge this loop with the main loop, as the problem is |
341 | | * exactly that that loop may fail before having visited all the entries. |
342 | | * From a performance point of view it doesn't matter anyway, as this path |
343 | | * will never be taken in a system that's functioning normally. |
344 | | */ |
345 | 0 | if (sync_in_progress) |
346 | 0 | { |
347 | | /* prior try failed, so update any stale cycle_ctr values */ |
348 | 0 | hash_seq_init(&hstat, pendingOps); |
349 | 0 | while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) |
350 | 0 | { |
351 | 0 | entry->cycle_ctr = sync_cycle_ctr; |
352 | 0 | } |
353 | 0 | } |
354 | | |
355 | | /* Advance counter so that new hashtable entries are distinguishable */ |
356 | 0 | sync_cycle_ctr++; |
357 | | |
358 | | /* Set flag to detect failure if we don't reach the end of the loop */ |
359 | 0 | sync_in_progress = true; |
360 | | |
361 | | /* Now scan the hashtable for fsync requests to process */ |
362 | 0 | absorb_counter = FSYNCS_PER_ABSORB; |
363 | 0 | hash_seq_init(&hstat, pendingOps); |
364 | 0 | while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) |
365 | 0 | { |
366 | 0 | int failures; |
367 | | |
368 | | /* |
369 | | * If the entry is new then don't process it this time; it will be |
370 | | * processed at the next checkpoint cycle.  Note "continue" bypasses |
371 | | * the hash-remove call at the bottom of the loop. |
372 | | */ |
373 | 0 | if (entry->cycle_ctr == sync_cycle_ctr) |
374 | 0 | continue; |
375 | | |
376 | | /* Else assert we haven't missed it */ |
377 | 0 | Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr); |
378 | | |
379 | | /* |
380 | | * If fsync is off then we don't have to bother opening the file at |
381 | | * all. (We delay checking until this point so that changing fsync on |
382 | | * the fly behaves sensibly.) |
383 | | */ |
384 | 0 | if (enableFsync) |
385 | 0 | { |
386 | | /* |
387 | | * If in checkpointer, we want to absorb pending requests every so |
388 | | * often to prevent overflow of the fsync request queue. It is |
389 | | * unspecified whether newly-added entries will be visited by |
390 | | * hash_seq_search, but we don't care since we don't need to |
391 | | * process them anyway. |
392 | | */ |
393 | 0 | if (--absorb_counter <= 0) |
394 | 0 | { |
395 | 0 | AbsorbSyncRequests(); |
396 | 0 | absorb_counter = FSYNCS_PER_ABSORB; |
397 | 0 | } |
398 | | |
399 | | /* |
400 | | * The fsync table could contain requests to fsync segments that |
401 | | * have been deleted (unlinked) by the time we get to them. Rather |
402 | | * than just hoping an ENOENT (or EACCES on Windows) error can be |
403 | | * ignored, what we do on error is absorb pending requests and |
404 | | * then retry. Since mdunlink() queues a "cancel" message before |
405 | | * actually unlinking, the fsync request is guaranteed to be |
406 | | * marked canceled after the absorb if it really was this case. |
407 | | * DROP DATABASE likewise has to tell us to forget fsync requests |
408 | | * before it starts deletions. |
409 | | */ |
410 | 0 | for (failures = 0; !entry->canceled; failures++) |
411 | 0 | { |
412 | 0 | char path[MAXPGPATH]; |
413 | | |
414 | 0 | INSTR_TIME_SET_CURRENT(sync_start); |
415 | 0 | if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag, |
416 | 0 | path) == 0) |
417 | 0 | { |
418 | | /* Success; update statistics about sync timing */ |
419 | 0 | INSTR_TIME_SET_CURRENT(sync_end); |
420 | 0 | sync_diff = sync_end; |
421 | 0 | INSTR_TIME_SUBTRACT(sync_diff, sync_start); |
422 | 0 | elapsed = INSTR_TIME_GET_MICROSEC(sync_diff); |
423 | 0 | if (elapsed > longest) |
424 | 0 | longest = elapsed; |
425 | 0 | total_elapsed += elapsed; |
426 | 0 | processed++; |
427 | | |
428 | 0 | if (log_checkpoints) |
429 | 0 | elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f ms", |
430 | 0 | processed, |
431 | 0 | path, |
432 | 0 | (double) elapsed / 1000); |
433 | | |
434 | 0 | break; /* out of retry loop */ |
435 | 0 | } |
436 | | |
437 | | /* |
438 | | * It is possible that the relation has been dropped or |
439 | | * truncated since the fsync request was entered. Therefore, |
440 | | * allow ENOENT, but only if we didn't fail already on this |
441 | | * file. |
442 | | */ |
443 | 0 | if (!FILE_POSSIBLY_DELETED(errno) || failures > 0) |
444 | 0 | ereport(data_sync_elevel(ERROR), |
445 | 0 | (errcode_for_file_access(), |
446 | 0 | errmsg("could not fsync file \"%s\": %m", |
447 | 0 | path))); |
448 | 0 | else |
449 | 0 | ereport(DEBUG1, |
450 | 0 | (errcode_for_file_access(), |
451 | 0 | errmsg_internal("could not fsync file \"%s\" but retrying: %m", |
452 | 0 | path))); |
453 | | |
454 | | /* |
455 | | * Absorb incoming requests and check to see if a cancel |
456 | | * arrived for this relation fork. |
457 | | */ |
458 | 0 | AbsorbSyncRequests(); |
459 | 0 | absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */ |
460 | 0 | } /* end retry loop */ |
461 | 0 | } |
462 | | |
463 | | /* We are done with this entry, remove it */ |
464 | 0 | if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL) |
465 | 0 | elog(ERROR, "pendingOps corrupted"); |
466 | 0 | } /* end loop over hashtable entries */ |
467 | | |
468 | | /* Return sync performance metrics for report at checkpoint end */ |
469 | 0 | CheckpointStats.ckpt_sync_rels = processed; |
470 | 0 | CheckpointStats.ckpt_longest_sync = longest; |
471 | 0 | CheckpointStats.ckpt_agg_sync_time = total_elapsed; |
472 | | |
473 | | /* Flag successful completion of ProcessSyncRequests */ |
474 | 0 | sync_in_progress = false; |
475 | 0 | } |
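
The retry loop above is sound only because callers cancel before they
delete: as the comment in the loop notes, mdunlink() queues a forget message
for a segment before unlinking it. A condensed sketch of that invariant (not
the actual md.c code):

    /* on the unlinking side, order matters: */
    RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);   /* cancel first */
    unlink(path);                                           /* then delete */

    /* on the checkpointer side, AbsorbSyncRequests() then marks the
     * pending entry canceled, so the retry loop gives up on the file
     * instead of failing forever on ENOENT */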
476 | | |
477 | | /* |
478 | | * RememberSyncRequest() -- callback from checkpointer side of sync request |
479 | | * |
480 | | * We stuff fsync requests into the local hash table for execution |
481 | | * during the checkpointer's next checkpoint. UNLINK requests go into a |
482 | | * separate linked list, however, because they get processed separately. |
483 | | * |
484 | | * See sync.h for more information on the types of sync requests supported. |
485 | | */ |
486 | | void |
487 | | RememberSyncRequest(const FileTag *ftag, SyncRequestType type) |
488 | 0 | { |
489 | 0 | Assert(pendingOps); |
490 | | |
491 | 0 | if (type == SYNC_FORGET_REQUEST) |
492 | 0 | { |
493 | 0 | PendingFsyncEntry *entry; |
494 | | |
495 | | /* Cancel previously entered request */ |
496 | 0 | entry = (PendingFsyncEntry *) hash_search(pendingOps, |
497 | 0 | ftag, |
498 | 0 | HASH_FIND, |
499 | 0 | NULL); |
500 | 0 | if (entry != NULL) |
501 | 0 | entry->canceled = true; |
502 | 0 | } |
503 | 0 | else if (type == SYNC_FILTER_REQUEST) |
504 | 0 | { |
505 | 0 | HASH_SEQ_STATUS hstat; |
506 | 0 | PendingFsyncEntry *pfe; |
507 | 0 | ListCell *cell; |
508 | | |
509 | | /* Cancel matching fsync requests */ |
510 | 0 | hash_seq_init(&hstat, pendingOps); |
511 | 0 | while ((pfe = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) |
512 | 0 | { |
513 | 0 | if (pfe->tag.handler == ftag->handler && |
514 | 0 | syncsw[ftag->handler].sync_filetagmatches(ftag, &pfe->tag)) |
515 | 0 | pfe->canceled = true; |
516 | 0 | } |
517 | | |
518 | | /* Cancel matching unlink requests */ |
519 | 0 | foreach(cell, pendingUnlinks) |
520 | 0 | { |
521 | 0 | PendingUnlinkEntry *pue = (PendingUnlinkEntry *) lfirst(cell); |
522 | | |
523 | 0 | if (pue->tag.handler == ftag->handler && |
524 | 0 | syncsw[ftag->handler].sync_filetagmatches(ftag, &pue->tag)) |
525 | 0 | pue->canceled = true; |
526 | 0 | } |
527 | 0 | } |
528 | 0 | else if (type == SYNC_UNLINK_REQUEST) |
529 | 0 | { |
530 | | /* Unlink request: put it in the linked list */ |
531 | 0 | MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); |
532 | 0 | PendingUnlinkEntry *entry; |
533 | | |
534 | 0 | entry = palloc(sizeof(PendingUnlinkEntry)); |
535 | 0 | entry->tag = *ftag; |
536 | 0 | entry->cycle_ctr = checkpoint_cycle_ctr; |
537 | 0 | entry->canceled = false; |
538 | | |
539 | 0 | pendingUnlinks = lappend(pendingUnlinks, entry); |
540 | | |
541 | 0 | MemoryContextSwitchTo(oldcxt); |
542 | 0 | } |
543 | 0 | else |
544 | 0 | { |
545 | | /* Normal case: enter a request to fsync this segment */ |
546 | 0 | MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); |
547 | 0 | PendingFsyncEntry *entry; |
548 | 0 | bool found; |
549 | |
|
550 | 0 | Assert(type == SYNC_REQUEST); |
551 | | |
552 | 0 | entry = (PendingFsyncEntry *) hash_search(pendingOps, |
553 | 0 | ftag, |
554 | 0 | HASH_ENTER, |
555 | 0 | &found); |
556 | | /* if new entry, or was previously canceled, initialize it */ |
557 | 0 | if (!found || entry->canceled) |
558 | 0 | { |
559 | 0 | entry->cycle_ctr = sync_cycle_ctr; |
560 | 0 | entry->canceled = false; |
561 | 0 | } |
562 | | |
563 | | /* |
564 | | * NB: it's intentional that we don't change cycle_ctr if the entry |
565 | | * already exists. The cycle_ctr must represent the oldest fsync |
566 | | * request that could be in the entry. |
567 | | */ |
568 | | |
569 | 0 | MemoryContextSwitchTo(oldcxt); |
570 | 0 | } |
571 | 0 | } |
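
SYNC_FILTER_REQUEST is the wholesale variant: the tag acts as a pattern and
the handler's sync_filetagmatches() callback decides which pending entries
it covers. This is how an operation such as DROP DATABASE can cancel every
pending request for a database in one message (see the note in
ProcessSyncRequests()); a sketch under that assumption, with the tag's file
identity fields elided:

    FileTag tag = {0};

    tag.handler = SYNC_HANDLER_MD;
    /* ... set only the database identity in the tag ... */
    RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */);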
572 | | |
573 | | /* |
574 | | * Register the sync request locally, or forward it to the checkpointer. |
575 | | * |
576 | | * If retryOnError is true, we'll keep trying if there is no space in the |
577 | | * queue. Return true if we succeeded, or false if there wasn't space. |
578 | | */ |
579 | | bool |
580 | | RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, |
581 | | bool retryOnError) |
582 | 0 | { |
583 | 0 | bool ret; |
584 | | |
585 | 0 | if (pendingOps != NULL) |
586 | 0 | { |
587 | | /* standalone backend or startup process: fsync state is local */ |
588 | 0 | RememberSyncRequest(ftag, type); |
589 | 0 | return true; |
590 | 0 | } |
591 | | |
592 | 0 | for (;;) |
593 | 0 | { |
594 | | /* |
595 | | * Notify the checkpointer about it. If we fail to queue a message in |
596 | | * retryOnError mode, we have to sleep and try again ... ugly, but |
597 | | * hopefully won't happen often. |
598 | | * |
599 | | * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an |
600 | | * error in the case of SYNC_UNLINK_REQUEST would leave the |
601 | | * no-longer-used file still present on disk, which would be bad, so |
602 | | * I'm inclined to assume that the checkpointer will always empty the |
603 | | * queue soon. |
604 | | */ |
605 | 0 | ret = ForwardSyncRequest(ftag, type); |
606 | | |
607 | | /* |
608 | | * If we are successful in queueing the request, or we failed and were |
609 | | * instructed not to retry on error, break. |
610 | | */ |
611 | 0 | if (ret || !retryOnError) |
612 | 0 | break; |
613 | | |
614 | 0 | WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10, |
615 | 0 | WAIT_EVENT_REGISTER_SYNC_REQUEST); |
616 | 0 | } |
617 | | |
618 | 0 | return ret; |
619 | 0 | } |
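
A typical caller queues a plain SYNC_REQUEST after dirtying a segment and
lets the checkpointer do the fsync at the next checkpoint. A hedged usage
sketch (tag construction elided; the fallback mirrors what md.c does when it
declines to retry and the request queue is full):

    if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */))
    {
        /* Forwarding failed and we chose not to retry, so this backend
         * must make the data durable itself, e.g. fsync the file now. */
    }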