Coverage Report

Created: 2025-08-12 06:43

/src/postgres/src/backend/storage/sync/sync.c
Execution count is 0 for every instrumented line below; no code in this file was covered.

Source:
/*-------------------------------------------------------------------------
 *
 * sync.c
 *    File synchronization management code.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/storage/sync/sync.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>
#include <fcntl.h>
#include <sys/file.h>

#include "access/clog.h"
#include "access/commit_ts.h"
#include "access/multixact.h"
#include "access/xlog.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/latch.h"
#include "storage/md.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"

/*
 * In some contexts (currently, standalone backends and the checkpointer)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
 * table remembers the pending operations.  We use a hash table mostly as
 * a convenient way of merging duplicate requests.
 *
 * We use a similar mechanism to remember no-longer-needed files that can
 * be deleted after the next checkpoint, but we use a linked list instead of
 * a hash table, because we don't expect there to be any duplicate requests.
 *
 * These mechanisms are only used for non-temp relations; we never fsync
 * temp rels, nor do we need to postpone their deletion (see comments in
 * mdunlink).
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the checkpointer.)
 */
typedef uint16 CycleCtr;    /* can be any convenient integer size */

typedef struct
{
  FileTag   tag;        /* identifies handler and file */
  CycleCtr  cycle_ctr;  /* sync_cycle_ctr of oldest request */
  bool      canceled;   /* canceled is true if we canceled "recently" */
} PendingFsyncEntry;

typedef struct
{
  FileTag   tag;        /* identifies handler and file */
  CycleCtr  cycle_ctr;  /* checkpoint_cycle_ctr when request was made */
  bool      canceled;   /* true if request has been canceled */
} PendingUnlinkEntry;

static HTAB *pendingOps = NULL;
static List *pendingUnlinks = NIL;
static MemoryContext pendingOpsCxt; /* context for the above */

static CycleCtr sync_cycle_ctr = 0;
static CycleCtr checkpoint_cycle_ctr = 0;

/* Intervals for calling AbsorbSyncRequests */
#define FSYNCS_PER_ABSORB   10
#define UNLINKS_PER_ABSORB  10

/*
 * Function pointers for handling sync and unlink requests.
 */
typedef struct SyncOps
{
  int     (*sync_syncfiletag) (const FileTag *ftag, char *path);
  int     (*sync_unlinkfiletag) (const FileTag *ftag, char *path);
  bool    (*sync_filetagmatches) (const FileTag *ftag,
                                  const FileTag *candidate);
} SyncOps;

/*
 * These indexes must correspond to the values of the SyncRequestHandler enum.
 */
static const SyncOps syncsw[] = {
  /* magnetic disk */
  [SYNC_HANDLER_MD] = {
    .sync_syncfiletag = mdsyncfiletag,
    .sync_unlinkfiletag = mdunlinkfiletag,
    .sync_filetagmatches = mdfiletagmatches
  },
  /* pg_xact */
  [SYNC_HANDLER_CLOG] = {
    .sync_syncfiletag = clogsyncfiletag
  },
  /* pg_commit_ts */
  [SYNC_HANDLER_COMMIT_TS] = {
    .sync_syncfiletag = committssyncfiletag
  },
  /* pg_multixact/offsets */
  [SYNC_HANDLER_MULTIXACT_OFFSET] = {
    .sync_syncfiletag = multixactoffsetssyncfiletag
  },
  /* pg_multixact/members */
  [SYNC_HANDLER_MULTIXACT_MEMBER] = {
    .sync_syncfiletag = multixactmemberssyncfiletag
  }
};

/*
 * Initialize data structures for the file sync tracking.
 */
void
InitSync(void)
{
  /*
   * Create pending-operations hashtable if we need it.  Currently, we need
   * it if we are standalone (not under a postmaster) or if we are a
   * checkpointer auxiliary process.
   */
  if (!IsUnderPostmaster || AmCheckpointerProcess())
  {
    HASHCTL   hash_ctl;

    /*
     * XXX: The checkpointer needs to add entries to the pending ops table
     * when absorbing fsync requests.  That is done within a critical
     * section, which isn't usually allowed, but we make an exception. It
     * means that there's a theoretical possibility that you run out of
     * memory while absorbing fsync requests, which leads to a PANIC.
     * Fortunately the hash table is small so that's unlikely to happen in
     * practice.
     */
    pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
                                          "Pending ops context",
                                          ALLOCSET_DEFAULT_SIZES);
    MemoryContextAllowInCriticalSection(pendingOpsCxt, true);

    hash_ctl.keysize = sizeof(FileTag);
    hash_ctl.entrysize = sizeof(PendingFsyncEntry);
    hash_ctl.hcxt = pendingOpsCxt;
    pendingOps = hash_create("Pending Ops Table",
                             100L,
                             &hash_ctl,
                             HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
    pendingUnlinks = NIL;
  }
}

/*
 * SyncPreCheckpoint() -- Do pre-checkpoint work
 *
 * To distinguish unlink requests that arrived before this checkpoint
 * started from those that arrived during the checkpoint, we use a cycle
 * counter similar to the one we use for fsync requests. That cycle
 * counter is incremented here.
 *
 * This must be called *before* the checkpoint REDO point is determined.
 * That ensures that we won't delete files too soon.  Since this calls
 * AbsorbSyncRequests(), which performs memory allocations, it cannot be
 * called within a critical section.
 *
 * Note that we can't do anything here that depends on the assumption
 * that the checkpoint will be completed.
 */
void
SyncPreCheckpoint(void)
{
  /*
   * Operations such as DROP TABLESPACE assume that the next checkpoint will
   * process all recently forwarded unlink requests, but if they aren't
   * absorbed prior to advancing the cycle counter, they won't be processed
   * until a future checkpoint.  The following absorb ensures that any
   * unlink requests forwarded before the checkpoint began will be processed
   * in the current checkpoint.
   */
  AbsorbSyncRequests();

  /*
   * Any unlink requests arriving after this point will be assigned the next
   * cycle counter, and won't be unlinked until next checkpoint.
   */
  checkpoint_cycle_ctr++;
}

/*
 * SyncPostCheckpoint() -- Do post-checkpoint work
 *
 * Remove any lingering files that can now be safely removed.
 */
void
SyncPostCheckpoint(void)
{
  int       absorb_counter;
  ListCell   *lc;

  absorb_counter = UNLINKS_PER_ABSORB;
  foreach(lc, pendingUnlinks)
  {
    PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(lc);
    char    path[MAXPGPATH];

    /* Skip over any canceled entries */
    if (entry->canceled)
      continue;

    /*
     * New entries are appended to the end, so if the entry is new we've
     * reached the end of old entries.
     *
     * Note: if just the right number of consecutive checkpoints fail, we
     * could be fooled here by cycle_ctr wraparound.  However, the only
     * consequence is that we'd delay unlinking for one more checkpoint,
     * which is perfectly tolerable.
     */
    if (entry->cycle_ctr == checkpoint_cycle_ctr)
      break;

    /* Unlink the file */
    if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
                                                      path) < 0)
    {
      /*
       * There's a race condition, when the database is dropped at the
       * same time that we process the pending unlink requests. If the
       * DROP DATABASE deletes the file before we do, we will get ENOENT
       * here. rmtree() also has to ignore ENOENT errors, to deal with
       * the possibility that we delete the file first.
       */
      if (errno != ENOENT)
        ereport(WARNING,
            (errcode_for_file_access(),
             errmsg("could not remove file \"%s\": %m", path)));
    }

    /* Mark the list entry as canceled, just in case */
    entry->canceled = true;

    /*
     * As in ProcessSyncRequests, we don't want to stop absorbing fsync
     * requests for a long time when there are many deletions to be done.
     * We can safely call AbsorbSyncRequests() at this point in the loop.
     */
    if (--absorb_counter <= 0)
    {
      AbsorbSyncRequests();
      absorb_counter = UNLINKS_PER_ABSORB;
    }
  }

  /*
   * If we reached the end of the list, we can just remove the whole list
   * (remembering to pfree all the PendingUnlinkEntry objects).  Otherwise,
   * we must keep the entries at or after "lc".
   */
  if (lc == NULL)
  {
    list_free_deep(pendingUnlinks);
    pendingUnlinks = NIL;
  }
  else
  {
    int     ntodelete = list_cell_number(pendingUnlinks, lc);

    for (int i = 0; i < ntodelete; i++)
      pfree(list_nth(pendingUnlinks, i));

    pendingUnlinks = list_delete_first_n(pendingUnlinks, ntodelete);
  }
}

/*
 *  ProcessSyncRequests() -- Process queued fsync requests.
 */
void
ProcessSyncRequests(void)
{
  static bool sync_in_progress = false;

  HASH_SEQ_STATUS hstat;
  PendingFsyncEntry *entry;
  int       absorb_counter;

  /* Statistics on sync times */
  int       processed = 0;
  instr_time  sync_start,
              sync_end,
              sync_diff;
  uint64    elapsed;
  uint64    longest = 0;
  uint64    total_elapsed = 0;

  /*
   * This is only called during checkpoints, and checkpoints should only
   * occur in processes that have created a pendingOps.
   */
  if (!pendingOps)
    elog(ERROR, "cannot sync without a pendingOps table");

  /*
   * If we are in the checkpointer, the sync had better include all fsync
   * requests that were queued by backends up to this point.  The tightest
   * race condition that could occur is that a buffer that must be written
   * and fsync'd for the checkpoint could have been dumped by a backend just
   * before it was visited by BufferSync().  We know the backend will have
   * queued an fsync request before clearing the buffer's dirtybit, so we
   * are safe as long as we do an Absorb after completing BufferSync().
   */
  AbsorbSyncRequests();

  /*
   * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
   * checkpoint), we want to ignore fsync requests that are entered into the
   * hashtable after this point --- they should be processed next time,
   * instead.  We use sync_cycle_ctr to tell old entries apart from new
   * ones: new ones will have cycle_ctr equal to the incremented value of
   * sync_cycle_ctr.
   *
   * In normal circumstances, all entries present in the table at this point
   * will have cycle_ctr exactly equal to the current (about to be old)
   * value of sync_cycle_ctr.  However, if we fail partway through the
   * fsync'ing loop, then older values of cycle_ctr might remain when we
   * come back here to try again.  Repeated checkpoint failures would
   * eventually wrap the counter around to the point where an old entry
   * might appear new, causing us to skip it, possibly allowing a checkpoint
   * to succeed that should not have.  To forestall wraparound, any time the
   * previous ProcessSyncRequests() failed to complete, run through the
   * table and forcibly set cycle_ctr = sync_cycle_ctr.
   *
   * Think not to merge this loop with the main loop, as the problem is
   * exactly that that loop may fail before having visited all the entries.
   * From a performance point of view it doesn't matter anyway, as this path
   * will never be taken in a system that's functioning normally.
   */
  if (sync_in_progress)
  {
    /* prior try failed, so update any stale cycle_ctr values */
    hash_seq_init(&hstat, pendingOps);
    while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
    {
      entry->cycle_ctr = sync_cycle_ctr;
    }
  }

  /* Advance counter so that new hashtable entries are distinguishable */
  sync_cycle_ctr++;

  /* Set flag to detect failure if we don't reach the end of the loop */
  sync_in_progress = true;

  /* Now scan the hashtable for fsync requests to process */
  absorb_counter = FSYNCS_PER_ABSORB;
  hash_seq_init(&hstat, pendingOps);
  while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
  {
    int     failures;

    /*
     * If the entry is new then don't process it this time; it is new.
     * Note "continue" bypasses the hash-remove call at the bottom of the
     * loop.
     */
    if (entry->cycle_ctr == sync_cycle_ctr)
      continue;

    /* Else assert we haven't missed it */
    Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);

    /*
     * If fsync is off then we don't have to bother opening the file at
     * all.  (We delay checking until this point so that changing fsync on
     * the fly behaves sensibly.)
     */
    if (enableFsync)
    {
      /*
       * If in checkpointer, we want to absorb pending requests every so
       * often to prevent overflow of the fsync request queue.  It is
       * unspecified whether newly-added entries will be visited by
       * hash_seq_search, but we don't care since we don't need to
       * process them anyway.
       */
      if (--absorb_counter <= 0)
      {
        AbsorbSyncRequests();
        absorb_counter = FSYNCS_PER_ABSORB;
      }

      /*
       * The fsync table could contain requests to fsync segments that
       * have been deleted (unlinked) by the time we get to them. Rather
       * than just hoping an ENOENT (or EACCES on Windows) error can be
       * ignored, what we do on error is absorb pending requests and
       * then retry. Since mdunlink() queues a "cancel" message before
       * actually unlinking, the fsync request is guaranteed to be
       * marked canceled after the absorb if it really was this case.
       * DROP DATABASE likewise has to tell us to forget fsync requests
       * before it starts deletions.
       */
      for (failures = 0; !entry->canceled; failures++)
      {
        char    path[MAXPGPATH];

        INSTR_TIME_SET_CURRENT(sync_start);
        if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
                                                        path) == 0)
        {
          /* Success; update statistics about sync timing */
          INSTR_TIME_SET_CURRENT(sync_end);
          sync_diff = sync_end;
          INSTR_TIME_SUBTRACT(sync_diff, sync_start);
          elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
          if (elapsed > longest)
            longest = elapsed;
          total_elapsed += elapsed;
          processed++;

          if (log_checkpoints)
            elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f ms",
                 processed,
                 path,
                 (double) elapsed / 1000);

          break;   /* out of retry loop */
        }

        /*
         * It is possible that the relation has been dropped or
         * truncated since the fsync request was entered. Therefore,
         * allow ENOENT, but only if we didn't fail already on this
         * file.
         */
        if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
          ereport(data_sync_elevel(ERROR),
              (errcode_for_file_access(),
               errmsg("could not fsync file \"%s\": %m",
                      path)));
        else
          ereport(DEBUG1,
              (errcode_for_file_access(),
               errmsg_internal("could not fsync file \"%s\" but retrying: %m",
                               path)));

        /*
         * Absorb incoming requests and check to see if a cancel
         * arrived for this relation fork.
         */
        AbsorbSyncRequests();
        absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
      }         /* end retry loop */
    }

    /* We are done with this entry, remove it */
    if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
      elog(ERROR, "pendingOps corrupted");
  }             /* end loop over hashtable entries */

  /* Return sync performance metrics for report at checkpoint end */
  CheckpointStats.ckpt_sync_rels = processed;
  CheckpointStats.ckpt_longest_sync = longest;
  CheckpointStats.ckpt_agg_sync_time = total_elapsed;

  /* Flag successful completion of ProcessSyncRequests */
  sync_in_progress = false;
}

/*
 * RememberSyncRequest() -- callback from checkpointer side of sync request
 *
 * We stuff fsync requests into the local hash table for execution
 * during the checkpointer's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * See sync.h for more information on the types of sync requests supported.
 */
void
RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
{
  Assert(pendingOps);

  if (type == SYNC_FORGET_REQUEST)
  {
    PendingFsyncEntry *entry;

    /* Cancel previously entered request */
    entry = (PendingFsyncEntry *) hash_search(pendingOps,
                                              ftag,
                                              HASH_FIND,
                                              NULL);
    if (entry != NULL)
      entry->canceled = true;
  }
  else if (type == SYNC_FILTER_REQUEST)
  {
    HASH_SEQ_STATUS hstat;
    PendingFsyncEntry *pfe;
    ListCell   *cell;

    /* Cancel matching fsync requests */
    hash_seq_init(&hstat, pendingOps);
    while ((pfe = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
    {
      if (pfe->tag.handler == ftag->handler &&
          syncsw[ftag->handler].sync_filetagmatches(ftag, &pfe->tag))
        pfe->canceled = true;
    }

    /* Cancel matching unlink requests */
    foreach(cell, pendingUnlinks)
    {
      PendingUnlinkEntry *pue = (PendingUnlinkEntry *) lfirst(cell);

      if (pue->tag.handler == ftag->handler &&
          syncsw[ftag->handler].sync_filetagmatches(ftag, &pue->tag))
        pue->canceled = true;
    }
  }
  else if (type == SYNC_UNLINK_REQUEST)
  {
    /* Unlink request: put it in the linked list */
    MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
    PendingUnlinkEntry *entry;

    entry = palloc(sizeof(PendingUnlinkEntry));
    entry->tag = *ftag;
    entry->cycle_ctr = checkpoint_cycle_ctr;
    entry->canceled = false;

    pendingUnlinks = lappend(pendingUnlinks, entry);

    MemoryContextSwitchTo(oldcxt);
  }
  else
  {
    /* Normal case: enter a request to fsync this segment */
    MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
    PendingFsyncEntry *entry;
    bool    found;

    Assert(type == SYNC_REQUEST);

    entry = (PendingFsyncEntry *) hash_search(pendingOps,
                                              ftag,
                                              HASH_ENTER,
                                              &found);
    /* if new entry, or was previously canceled, initialize it */
    if (!found || entry->canceled)
    {
      entry->cycle_ctr = sync_cycle_ctr;
      entry->canceled = false;
    }

    /*
     * NB: it's intentional that we don't change cycle_ctr if the entry
     * already exists.  The cycle_ctr must represent the oldest fsync
     * request that could be in the entry.
     */

    MemoryContextSwitchTo(oldcxt);
  }
}

/*
 * Register the sync request locally, or forward it to the checkpointer.
 *
 * If retryOnError is true, we'll keep trying if there is no space in the
 * queue.  Return true if we succeeded, or false if there wasn't space.
 */
bool
RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
                    bool retryOnError)
{
  bool    ret;

  if (pendingOps != NULL)
  {
    /* standalone backend or startup process: fsync state is local */
    RememberSyncRequest(ftag, type);
    return true;
  }

  for (;;)
  {
    /*
     * Notify the checkpointer about it.  If we fail to queue a message in
     * retryOnError mode, we have to sleep and try again ... ugly, but
     * hopefully won't happen often.
     *
     * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
     * error in the case of SYNC_UNLINK_REQUEST would leave the
     * no-longer-used file still present on disk, which would be bad, so
     * I'm inclined to assume that the checkpointer will always empty the
     * queue soon.
     */
    ret = ForwardSyncRequest(ftag, type);

    /*
     * If we are successful in queueing the request, or we failed and were
     * instructed not to retry on error, break.
     */
    if (ret || (!ret && !retryOnError))
      break;

    WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10,
              WAIT_EVENT_REGISTER_SYNC_REQUEST);
  }

  return ret;
}
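
For orientation, here is a minimal sketch of the caller side of this API: code that has just written a relation segment queues an fsync for it through RegisterSyncRequest(), and the checkpointer later absorbs the request into pendingOps and fsyncs the file in ProcessSyncRequests(). This sketch is not part of sync.c; the function name is hypothetical, and the FileTag identity fields other than handler are deliberately left as a placeholder (their exact layout is in storage/sync.h).

/*
 * Illustrative sketch only, not from sync.c.  Uses only symbols that the
 * listing above already references (FileTag, SYNC_HANDLER_MD, SYNC_REQUEST,
 * RegisterSyncRequest).
 */
#include "postgres.h"

#include "storage/sync.h"

static void
queue_segment_fsync_example(void)
{
  FileTag   tag;

  /* Zero the tag so padding bytes hash/compare consistently in pendingOps */
  memset(&tag, 0, sizeof(tag));
  tag.handler = SYNC_HANDLER_MD;  /* routes to mdsyncfiletag() via syncsw[] */
  /* ... fill in the relation/fork/segment identity fields from sync.h ... */

  /*
   * Queue the request.  With retryOnError = true the call keeps retrying
   * (sleeping 10 ms in WaitLatch between attempts) until the checkpointer's
   * queue has room; with false it can return false when the queue is full,
   * and the caller must handle that (md.c, for example, falls back to
   * fsync'ing the segment itself).
   */
  if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
  {
    /* queue full and not retrying: fsync the file directly here */
  }
}

In the full checkpoint flow described by the comments above, SyncPreCheckpoint() runs before the REDO point is chosen, backends keep forwarding requests like the one sketched here, and ProcessSyncRequests() and SyncPostCheckpoint() then drain the pending-fsync hash table and the pending-unlink list.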