Coverage Report

Created: 2025-08-12 06:43

/src/postgres/src/backend/storage/sync/sync.c
Execution count is 0 for every instrumented line below; no code in this file was covered.

Source:
/*-------------------------------------------------------------------------
 *
 * sync.c
 *    File synchronization management code.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/storage/sync/sync.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "access/clog.h"
#include "access/commit_ts.h"
#include "access/multixact.h"
#include "access/xlog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/latch.h"
#include "storage/md.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"

/*
 * In some contexts (currently, standalone backends and the checkpointer)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
 * table remembers the pending operations.  We use a hash table mostly as
 * a convenient way of merging duplicate requests.
 *
 * We use a similar mechanism to remember no-longer-needed files that can
 * be deleted after the next checkpoint, but we use a linked list instead of
 * a hash table, because we don't expect there to be any duplicate requests.
 *
 * These mechanisms are only used for non-temp relations; we never fsync
 * temp rels, nor do we need to postpone their deletion (see comments in
 * mdunlink).
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the checkpointer.)
 */
typedef uint16 CycleCtr;    /* can be any convenient integer size */

typedef struct
{
  FileTag   tag;        /* identifies handler and file */
  CycleCtr  cycle_ctr;  /* sync_cycle_ctr of oldest request */
  bool      canceled;   /* canceled is true if we canceled "recently" */
} PendingFsyncEntry;

typedef struct
{
  FileTag   tag;        /* identifies handler and file */
  CycleCtr  cycle_ctr;  /* checkpoint_cycle_ctr when request was made */
  bool      canceled;   /* true if request has been canceled */
} PendingUnlinkEntry;

static HTAB *pendingOps = NULL;
static List *pendingUnlinks = NIL;
static MemoryContext pendingOpsCxt; /* context for the above */

static CycleCtr sync_cycle_ctr = 0;
static CycleCtr checkpoint_cycle_ctr = 0;

/* Intervals for calling AbsorbSyncRequests */
#define FSYNCS_PER_ABSORB   10
#define UNLINKS_PER_ABSORB  10

/*
 * Function pointers for handling sync and unlink requests.
 */
typedef struct SyncOps
{
  int     (*sync_syncfiletag) (const FileTag *ftag, char *path);
  int     (*sync_unlinkfiletag) (const FileTag *ftag, char *path);
  bool    (*sync_filetagmatches) (const FileTag *ftag,
                                  const FileTag *candidate);
} SyncOps;

/*
 * These indexes must correspond to the values of the SyncRequestHandler enum.
 */
static const SyncOps syncsw[] = {
  /* magnetic disk */
  [SYNC_HANDLER_MD] = {
    .sync_syncfiletag = mdsyncfiletag,
    .sync_unlinkfiletag = mdunlinkfiletag,
    .sync_filetagmatches = mdfiletagmatches
  },
  /* pg_xact */
  [SYNC_HANDLER_CLOG] = {
    .sync_syncfiletag = clogsyncfiletag
  },
  /* pg_commit_ts */
  [SYNC_HANDLER_COMMIT_TS] = {
    .sync_syncfiletag = committssyncfiletag
  },
  /* pg_multixact/offsets */
  [SYNC_HANDLER_MULTIXACT_OFFSET] = {
    .sync_syncfiletag = multixactoffsetssyncfiletag
  },
  /* pg_multixact/members */
  [SYNC_HANDLER_MULTIXACT_MEMBER] = {
    .sync_syncfiletag = multixactmemberssyncfiletag
  }
};

/*
 * Initialize data structures for the file sync tracking.
 */
void
InitSync(void)
{
  /*
   * Create pending-operations hashtable if we need it.  Currently, we need
   * it if we are standalone (not under a postmaster) or if we are a
   * checkpointer auxiliary process.
   */
  if (!IsUnderPostmaster || AmCheckpointerProcess())
  {
    HASHCTL   hash_ctl;

    /*
     * XXX: The checkpointer needs to add entries to the pending ops table
     * when absorbing fsync requests.  That is done within a critical
     * section, which isn't usually allowed, but we make an exception. It
     * means that there's a theoretical possibility that you run out of
     * memory while absorbing fsync requests, which leads to a PANIC.
     * Fortunately the hash table is small so that's unlikely to happen in
     * practice.
     */
    pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
                                          "Pending ops context",
                                          ALLOCSET_DEFAULT_SIZES);
    MemoryContextAllowInCriticalSection(pendingOpsCxt, true);

    hash_ctl.keysize = sizeof(FileTag);
    hash_ctl.entrysize = sizeof(PendingFsyncEntry);
    hash_ctl.hcxt = pendingOpsCxt;
    pendingOps = hash_create("Pending Ops Table",
                             100L,
                             &hash_ctl,
                             HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
    pendingUnlinks = NIL;
  }
}

/*
 * SyncPreCheckpoint() -- Do pre-checkpoint work
 *
 * To distinguish unlink requests that arrived before this checkpoint
 * started from those that arrived during the checkpoint, we use a cycle
 * counter similar to the one we use for fsync requests. That cycle
 * counter is incremented here.
 *
 * This must be called *before* the checkpoint REDO point is determined.
 * That ensures that we won't delete files too soon.  Since this calls
 * AbsorbSyncRequests(), which performs memory allocations, it cannot be
 * called within a critical section.
 *
 * Note that we can't do anything here that depends on the assumption
 * that the checkpoint will be completed.
 */
void
SyncPreCheckpoint(void)
{
  /*
   * Operations such as DROP TABLESPACE assume that the next checkpoint will
   * process all recently forwarded unlink requests, but if they aren't
   * absorbed prior to advancing the cycle counter, they won't be processed
   * until a future checkpoint.  The following absorb ensures that any
   * unlink requests forwarded before the checkpoint began will be processed
   * in the current checkpoint.
   */
  AbsorbSyncRequests();

  /*
   * Any unlink requests arriving after this point will be assigned the next
   * cycle counter, and won't be unlinked until next checkpoint.
   */
  checkpoint_cycle_ctr++;
}

/*
 * SyncPostCheckpoint() -- Do post-checkpoint work
 *
 * Remove any lingering files that can now be safely removed.
 */
void
SyncPostCheckpoint(void)
{
  int       absorb_counter;
  ListCell   *lc;

  absorb_counter = UNLINKS_PER_ABSORB;
  foreach(lc, pendingUnlinks)
  {
    PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(lc);
    char    path[MAXPGPATH];

    /* Skip over any canceled entries */
    if (entry->canceled)
      continue;

    /*
     * New entries are appended to the end, so if the entry is new we've
     * reached the end of old entries.
     *
     * Note: if just the right number of consecutive checkpoints fail, we
     * could be fooled here by cycle_ctr wraparound.  However, the only
     * consequence is that we'd delay unlinking for one more checkpoint,
     * which is perfectly tolerable.
     */
    if (entry->cycle_ctr == checkpoint_cycle_ctr)
      break;

    /* Unlink the file */
    if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
                                                      path) < 0)
    {
      /*
       * There's a race condition, when the database is dropped at the
       * same time that we process the pending unlink requests. If the
       * DROP DATABASE deletes the file before we do, we will get ENOENT
       * here. rmtree() also has to ignore ENOENT errors, to deal with
       * the possibility that we delete the file first.
       */
      if (errno != ENOENT)
        ereport(WARNING,
            (errcode_for_file_access(),
             errmsg("could not remove file \"%s\": %m", path)));
    }

    /* Mark the list entry as canceled, just in case */
    entry->canceled = true;

    /*
     * As in ProcessSyncRequests, we don't want to stop absorbing fsync
     * requests for a long time when there are many deletions to be done.
     * We can safely call AbsorbSyncRequests() at this point in the loop.
     */
    if (--absorb_counter <= 0)
    {
      AbsorbSyncRequests();
      absorb_counter = UNLINKS_PER_ABSORB;
    }
  }

  /*
   * If we reached the end of the list, we can just remove the whole list
   * (remembering to pfree all the PendingUnlinkEntry objects).  Otherwise,
   * we must keep the entries at or after "lc".
   */
  if (lc == NULL)
  {
    list_free_deep(pendingUnlinks);
    pendingUnlinks = NIL;
  }
  else
  {
    int     ntodelete = list_cell_number(pendingUnlinks, lc);

    for (int i = 0; i < ntodelete; i++)
      pfree(list_nth(pendingUnlinks, i));

    pendingUnlinks = list_delete_first_n(pendingUnlinks, ntodelete);
  }
}

/*
 *  ProcessSyncRequests() -- Process queued fsync requests.
 */
void
ProcessSyncRequests(void)
{
  static bool sync_in_progress = false;

  HASH_SEQ_STATUS hstat;
  PendingFsyncEntry *entry;
  int       absorb_counter;

  /* Statistics on sync times */
  int       processed = 0;
  instr_time  sync_start,
              sync_end,
              sync_diff;
  uint64    elapsed;
  uint64    longest = 0;
  uint64    total_elapsed = 0;

  /*
   * This is only called during checkpoints, and checkpoints should only
   * occur in processes that have created a pendingOps.
   */
  if (!pendingOps)
    elog(ERROR, "cannot sync without a pendingOps table");

  /*
   * If we are in the checkpointer, the sync had better include all fsync
   * requests that were queued by backends up to this point.  The tightest
   * race condition that could occur is that a buffer that must be written
   * and fsync'd for the checkpoint could have been dumped by a backend just
   * before it was visited by BufferSync().  We know the backend will have
   * queued an fsync request before clearing the buffer's dirtybit, so we
   * are safe as long as we do an Absorb after completing BufferSync().
   */
  AbsorbSyncRequests();

  /*
   * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
   * checkpoint), we want to ignore fsync requests that are entered into the
   * hashtable after this point --- they should be processed next time,
   * instead.  We use sync_cycle_ctr to tell old entries apart from new
   * ones: new ones will have cycle_ctr equal to the incremented value of
   * sync_cycle_ctr.
   *
   * In normal circumstances, all entries present in the table at this point
   * will have cycle_ctr exactly equal to the current (about to be old)
   * value of sync_cycle_ctr.  However, if we fail partway through the
   * fsync'ing loop, then older values of cycle_ctr might remain when we
   * come back here to try again.  Repeated checkpoint failures would
   * eventually wrap the counter around to the point where an old entry
   * might appear new, causing us to skip it, possibly allowing a checkpoint
   * to succeed that should not have.  To forestall wraparound, any time the
   * previous ProcessSyncRequests() failed to complete, run through the
   * table and forcibly set cycle_ctr = sync_cycle_ctr.
   *
   * Think not to merge this loop with the main loop, as the problem is
   * exactly that that loop may fail before having visited all the entries.
   * From a performance point of view it doesn't matter anyway, as this path
   * will never be taken in a system that's functioning normally.
   */
  if (sync_in_progress)
  {
    /* prior try failed, so update any stale cycle_ctr values */
    hash_seq_init(&hstat, pendingOps);
    while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
    {
      entry->cycle_ctr = sync_cycle_ctr;
    }
  }

  /* Advance counter so that new hashtable entries are distinguishable */
  sync_cycle_ctr++;

  /* Set flag to detect failure if we don't reach the end of the loop */
  sync_in_progress = true;

  /* Now scan the hashtable for fsync requests to process */
  absorb_counter = FSYNCS_PER_ABSORB;
  hash_seq_init(&hstat, pendingOps);
  while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
  {
    int     failures;

    /*
     * If the entry is new then don't process it this time; it is new.
     * Note "continue" bypasses the hash-remove call at the bottom of the
     * loop.
     */
    if (entry->cycle_ctr == sync_cycle_ctr)
      continue;

    /* Else assert we haven't missed it */
    Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);

    /*
     * If fsync is off then we don't have to bother opening the file at
     * all.  (We delay checking until this point so that changing fsync on
     * the fly behaves sensibly.)
     */
    if (enableFsync)
    {
      /*
       * If in checkpointer, we want to absorb pending requests every so
       * often to prevent overflow of the fsync request queue.  It is
       * unspecified whether newly-added entries will be visited by
       * hash_seq_search, but we don't care since we don't need to
       * process them anyway.
       */
      if (--absorb_counter <= 0)
      {
        AbsorbSyncRequests();
        absorb_counter = FSYNCS_PER_ABSORB;
      }

      /*
       * The fsync table could contain requests to fsync segments that
       * have been deleted (unlinked) by the time we get to them. Rather
       * than just hoping an ENOENT (or EACCES on Windows) error can be
       * ignored, what we do on error is absorb pending requests and
       * then retry. Since mdunlink() queues a "cancel" message before
       * actually unlinking, the fsync request is guaranteed to be
       * marked canceled after the absorb if it really was this case.
       * DROP DATABASE likewise has to tell us to forget fsync requests
       * before it starts deletions.
       */
      for (failures = 0; !entry->canceled; failures++)
      {
        char    path[MAXPGPATH];

        INSTR_TIME_SET_CURRENT(sync_start);
        if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
                                                        path) == 0)
        {
          /* Success; update statistics about sync timing */
          INSTR_TIME_SET_CURRENT(sync_end);
          sync_diff = sync_end;
          INSTR_TIME_SUBTRACT(sync_diff, sync_start);
          elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
          if (elapsed > longest)
            longest = elapsed;
          total_elapsed += elapsed;
          processed++;

          if (log_checkpoints)
            elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f ms",
                 processed,
                 path,
                 (double) elapsed / 1000);

          break;   /* out of retry loop */
        }

        /*
         * It is possible that the relation has been dropped or
         * truncated since the fsync request was entered. Therefore,
         * allow ENOENT, but only if we didn't fail already on this
         * file.
         */
        if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
          ereport(data_sync_elevel(ERROR),
              (errcode_for_file_access(),
               errmsg("could not fsync file \"%s\": %m",
                      path)));
        else
          ereport(DEBUG1,
              (errcode_for_file_access(),
               errmsg_internal("could not fsync file \"%s\" but retrying: %m",
                               path)));

        /*
         * Absorb incoming requests and check to see if a cancel
         * arrived for this relation fork.
         */
        AbsorbSyncRequests();
        absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
      }         /* end retry loop */
    }

    /* We are done with this entry, remove it */
    if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
      elog(ERROR, "pendingOps corrupted");
  }             /* end loop over hashtable entries */

  /* Return sync performance metrics for report at checkpoint end */
  CheckpointStats.ckpt_sync_rels = processed;
  CheckpointStats.ckpt_longest_sync = longest;
  CheckpointStats.ckpt_agg_sync_time = total_elapsed;

  /* Flag successful completion of ProcessSyncRequests */
  sync_in_progress = false;
}

/*
 * RememberSyncRequest() -- callback from checkpointer side of sync request
 *
 * We stuff fsync requests into the local hash table for execution
 * during the checkpointer's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * See sync.h for more information on the types of sync requests supported.
 */
void
RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
{
  Assert(pendingOps);

  if (type == SYNC_FORGET_REQUEST)
  {
    PendingFsyncEntry *entry;

    /* Cancel previously entered request */
    entry = (PendingFsyncEntry *) hash_search(pendingOps,
                                              ftag,
                                              HASH_FIND,
                                              NULL);
    if (entry != NULL)
      entry->canceled = true;
  }
  else if (type == SYNC_FILTER_REQUEST)
  {
    HASH_SEQ_STATUS hstat;
    PendingFsyncEntry *pfe;
    ListCell   *cell;

    /* Cancel matching fsync requests */
    hash_seq_init(&hstat, pendingOps);
    while ((pfe = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
    {
      if (pfe->tag.handler == ftag->handler &&
          syncsw[ftag->handler].sync_filetagmatches(ftag, &pfe->tag))
        pfe->canceled = true;
    }

    /* Cancel matching unlink requests */
    foreach(cell, pendingUnlinks)
    {
      PendingUnlinkEntry *pue = (PendingUnlinkEntry *) lfirst(cell);

      if (pue->tag.handler == ftag->handler &&
          syncsw[ftag->handler].sync_filetagmatches(ftag, &pue->tag))
        pue->canceled = true;
    }
  }
  else if (type == SYNC_UNLINK_REQUEST)
  {
    /* Unlink request: put it in the linked list */
    MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
    PendingUnlinkEntry *entry;

    entry = palloc(sizeof(PendingUnlinkEntry));
    entry->tag = *ftag;
    entry->cycle_ctr = checkpoint_cycle_ctr;
    entry->canceled = false;

    pendingUnlinks = lappend(pendingUnlinks, entry);

    MemoryContextSwitchTo(oldcxt);
  }
  else
  {
    /* Normal case: enter a request to fsync this segment */
    MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
    PendingFsyncEntry *entry;
    bool    found;

    Assert(type == SYNC_REQUEST);

    entry = (PendingFsyncEntry *) hash_search(pendingOps,
                                              ftag,
                                              HASH_ENTER,
                                              &found);
    /* if new entry, or was previously canceled, initialize it */
    if (!found || entry->canceled)
    {
      entry->cycle_ctr = sync_cycle_ctr;
      entry->canceled = false;
    }

    /*
     * NB: it's intentional that we don't change cycle_ctr if the entry
     * already exists.  The cycle_ctr must represent the oldest fsync
     * request that could be in the entry.
     */

    MemoryContextSwitchTo(oldcxt);
  }
}

/*
 * Register the sync request locally, or forward it to the checkpointer.
 *
 * If retryOnError is true, we'll keep trying if there is no space in the
 * queue.  Return true if we succeeded, or false if there wasn't space.
 */
bool
RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
                    bool retryOnError)
{
  bool    ret;

  if (pendingOps != NULL)
  {
    /* standalone backend or startup process: fsync state is local */
    RememberSyncRequest(ftag, type);
    return true;
  }

  for (;;)
  {
    /*
     * Notify the checkpointer about it.  If we fail to queue a message in
     * retryOnError mode, we have to sleep and try again ... ugly, but
     * hopefully won't happen often.
     *
     * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
     * error in the case of SYNC_UNLINK_REQUEST would leave the
     * no-longer-used file still present on disk, which would be bad, so
     * I'm inclined to assume that the checkpointer will always empty the
     * queue soon.
     */
    ret = ForwardSyncRequest(ftag, type);

    /*
     * If we are successful in queueing the request, or we failed and were
     * instructed not to retry on error, break.
     */
    if (ret || (!ret && !retryOnError))
      break;

    WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10,
              WAIT_EVENT_REGISTER_SYNC_REQUEST);
  }

  return ret;
}
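
For orientation, here is a minimal sketch of the caller side of this API: code that has just written a relation segment queues an fsync for it through RegisterSyncRequest(), and the checkpointer later absorbs the request into pendingOps and fsyncs the file in ProcessSyncRequests(). This sketch is not part of sync.c; the function name is hypothetical, and the FileTag identity fields other than handler are deliberately left as a placeholder (their exact layout is in storage/sync.h).

/*
 * Illustrative sketch only, not from sync.c.  Uses only symbols that the
 * listing above already references (FileTag, SYNC_HANDLER_MD, SYNC_REQUEST,
 * RegisterSyncRequest).
 */
#include "postgres.h"

#include "storage/sync.h"

static void
queue_segment_fsync_example(void)
{
  FileTag   tag;

  /* Zero the tag so padding bytes hash/compare consistently in pendingOps */
  memset(&tag, 0, sizeof(tag));
  tag.handler = SYNC_HANDLER_MD;  /* routes to mdsyncfiletag() via syncsw[] */
  /* ... fill in the relation/fork/segment identity fields from sync.h ... */

  /*
   * Queue the request.  With retryOnError = true the call keeps retrying
   * (sleeping 10 ms in WaitLatch between attempts) until the checkpointer's
   * queue has room; with false it can return false when the queue is full,
   * and the caller must handle that (md.c, for example, falls back to
   * fsync'ing the segment itself).
   */
  if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
  {
    /* queue full and not retrying: fsync the file directly here */
  }
}

In the full checkpoint flow described by the comments above, SyncPreCheckpoint() runs before the REDO point is chosen, backends keep forwarding requests like the one sketched here, and ProcessSyncRequests() and SyncPostCheckpoint() then drain the pending-fsync hash table and the pending-unlink list.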