/src/postgres/src/backend/access/common/syncscan.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * syncscan.c |
4 | | * scan synchronization support |
5 | | * |
6 | | * When multiple backends run a sequential scan on the same table, we try |
7 | | * to keep them synchronized to reduce the overall I/O needed. The goal is |
8 | | * to read each page into shared buffer cache only once, and let all backends |
9 | | * that take part in the shared scan process the page before it falls out of |
10 | | * the cache. |
11 | | * |
12 | | * Since the "leader" in a pack of backends doing a seqscan will have to wait |
13 | | * for I/O, while the "followers" don't, there is a strong self-synchronizing |
14 | | * effect once we can get the backends examining approximately the same part |
15 | | * of the table at the same time. Hence all that is really needed is to get |
16 | | * a new backend beginning a seqscan to begin it close to where other backends |
17 | | * are reading. We can scan the table circularly, from block X up to the |
18 | | * end and then from block 0 to X-1, to ensure we visit all rows while still |
19 | | * participating in the common scan. |
20 | | * |
21 | | * To accomplish that, we keep track of the scan position of each table, and |
22 | | * start new scans close to where the previous scan(s) are. We don't try to |
23 | | * do any extra synchronization to keep the scans together afterwards; some |
24 | | * scans might progress much more slowly than others, for example if the |
25 | | * results need to be transferred to the client over a slow network, and we |
26 | | * don't want such queries to slow down others. |
27 | | * |
28 | | * There can realistically only be a few large sequential scans on different |
29 | | * tables in progress at any time. Therefore we just keep the scan positions |
30 | | * in a small LRU list which we scan every time we need to look up or update a |
31 | | * scan position. The whole mechanism is only applied for tables exceeding |
32 | | * a threshold size (but that is not the concern of this module). |
33 | | * |
34 | | * INTERFACE ROUTINES |
35 | | * ss_get_location - return current scan location of a relation |
36 | | * ss_report_location - update current scan location |
37 | | * |
38 | | * |
39 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
40 | | * Portions Copyright (c) 1994, Regents of the University of California |
41 | | * |
42 | | * IDENTIFICATION |
43 | | * src/backend/access/common/syncscan.c |
44 | | * |
45 | | *------------------------------------------------------------------------- |
46 | | */ |
47 | | #include "postgres.h" |
48 | | |
49 | | #include "access/syncscan.h" |
50 | | #include "miscadmin.h" |
51 | | #include "storage/lwlock.h" |
52 | | #include "storage/shmem.h" |
53 | | #include "utils/rel.h" |
54 | | |
55 | | |
56 | | /* GUC variables */ |
57 | | #ifdef TRACE_SYNCSCAN |
58 | | bool trace_syncscan = false; |
59 | | #endif |
60 | | |
61 | | |
62 | | /* |
63 | | * Size of the LRU list. |
64 | | * |
65 | | * Note: the code assumes that SYNC_SCAN_NELEM > 1. |
66 | | * |
67 | | * XXX: What's a good value? It should be large enough to hold the |
68 | | * maximum number of large tables scanned simultaneously. But a larger value |
69 | | * means more traversing of the LRU list when starting a new scan. |
70 | | */ |
71 | 0 | #define SYNC_SCAN_NELEM 20 |
72 | | |
73 | | /* |
74 | | * Interval between reports of the location of the current scan, in pages. |
75 | | * |
76 | | * Note: This should be smaller than the ring size (see buffer/freelist.c) |
77 | | * we use for bulk reads. Otherwise a scan joining other scans might start |
78 | | * from a page that's no longer in the buffer cache. This is a bit fuzzy; |
79 | | * there's no guarantee that the new scan will read the page before it leaves |
80 | | * the buffer cache anyway, and on the other hand the page is most likely |
81 | | * still in the OS cache. |
82 | | */ |
83 | 0 | #define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ) |
84 | | |
85 | | |
/*
 * The scan locations structure is essentially a doubly-linked LRU with head
 * and tail pointer, but designed to hold a fixed maximum number of elements in
 * fixed-size shared memory.
 */
typedef struct ss_scan_location_t
{
	RelFileLocator relfilelocator;	/* identity of a relation */
	BlockNumber location;		/* last-reported location in the relation */
} ss_scan_location_t;

typedef struct ss_lru_item_t
{
	struct ss_lru_item_t *prev; /* previous (more recently used) item, or
								 * NULL if this item is the list head */
	struct ss_lru_item_t *next; /* next (less recently used) item, or NULL if
								 * this item is the list tail */
	ss_scan_location_t location;	/* the saved scan position itself */
} ss_lru_item_t;

typedef struct ss_scan_locations_t
{
	ss_lru_item_t *head;		/* most recently used entry */
	ss_lru_item_t *tail;		/* least recently used entry; first to be
								 * recycled when a new relation is seen */
	ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
} ss_scan_locations_t;
110 | | |
111 | | #define SizeOfScanLocations(N) \ |
112 | 0 | (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t)) |
113 | | |
114 | | /* Pointer to struct in shared memory */ |
115 | | static ss_scan_locations_t *scan_locations; |
116 | | |
117 | | /* prototypes for internal functions */ |
118 | | static BlockNumber ss_search(RelFileLocator relfilelocator, |
119 | | BlockNumber location, bool set); |
120 | | |
121 | | |
122 | | /* |
123 | | * SyncScanShmemSize --- report amount of shared memory space needed |
124 | | */ |
125 | | Size |
126 | | SyncScanShmemSize(void) |
127 | 0 | { |
128 | 0 | return SizeOfScanLocations(SYNC_SCAN_NELEM); |
129 | 0 | } |
130 | | |
131 | | /* |
132 | | * SyncScanShmemInit --- initialize this module's shared memory |
133 | | */ |
134 | | void |
135 | | SyncScanShmemInit(void) |
136 | 0 | { |
137 | 0 | int i; |
138 | 0 | bool found; |
139 | |
|
140 | 0 | scan_locations = (ss_scan_locations_t *) |
141 | 0 | ShmemInitStruct("Sync Scan Locations List", |
142 | 0 | SizeOfScanLocations(SYNC_SCAN_NELEM), |
143 | 0 | &found); |
144 | |
|
145 | 0 | if (!IsUnderPostmaster) |
146 | 0 | { |
147 | | /* Initialize shared memory area */ |
148 | 0 | Assert(!found); |
149 | |
|
150 | 0 | scan_locations->head = &scan_locations->items[0]; |
151 | 0 | scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1]; |
152 | |
|
153 | 0 | for (i = 0; i < SYNC_SCAN_NELEM; i++) |
154 | 0 | { |
155 | 0 | ss_lru_item_t *item = &scan_locations->items[i]; |
156 | | |
157 | | /* |
158 | | * Initialize all slots with invalid values. As scans are started, |
159 | | * these invalid entries will fall off the LRU list and get |
160 | | * replaced with real entries. |
161 | | */ |
162 | 0 | item->location.relfilelocator.spcOid = InvalidOid; |
163 | 0 | item->location.relfilelocator.dbOid = InvalidOid; |
164 | 0 | item->location.relfilelocator.relNumber = InvalidRelFileNumber; |
165 | 0 | item->location.location = InvalidBlockNumber; |
166 | |
|
167 | 0 | item->prev = (i > 0) ? |
168 | 0 | (&scan_locations->items[i - 1]) : NULL; |
169 | 0 | item->next = (i < SYNC_SCAN_NELEM - 1) ? |
170 | 0 | (&scan_locations->items[i + 1]) : NULL; |
171 | 0 | } |
172 | 0 | } |
173 | 0 | else |
174 | 0 | Assert(found); |
175 | 0 | } |
176 | | |
/*
 * ss_search --- search the scan_locations structure for an entry with the
 *		given relfilelocator.
 *
 * If "set" is true, the location is updated to the given location.  If no
 * entry for the given relfilelocator is found, it will be created at the head
 * of the list with the given location, even if "set" is false.
 *
 * In any case, the location after possible update is returned.
 *
 * Caller is responsible for having acquired suitable lock on the shared
 * data structure.
 */
static BlockNumber
ss_search(RelFileLocator relfilelocator, BlockNumber location, bool set)
{
	ss_lru_item_t *item;

	/*
	 * Walk the LRU list from most- to least-recently-used.  The loop always
	 * terminates: the list is non-empty (SYNC_SCAN_NELEM entries, linked at
	 * init time) and we stop at the item whose "next" is NULL even when no
	 * match is found.
	 */
	item = scan_locations->head;
	for (;;)
	{
		bool		match;

		match = RelFileLocatorEquals(item->location.relfilelocator,
									 relfilelocator);

		if (match || item->next == NULL)
		{
			/*
			 * If we reached the end of list and no match was found, take over
			 * the last entry
			 */
			if (!match)
			{
				item->location.relfilelocator = relfilelocator;
				item->location.location = location;
			}
			else if (set)
				item->location.location = location;

			/* Move the entry to the front of the LRU list */
			if (item != scan_locations->head)
			{
				/*
				 * unlink; item->prev is necessarily non-NULL here since item
				 * is not the head
				 */
				if (item == scan_locations->tail)
					scan_locations->tail = item->prev;
				item->prev->next = item->next;
				if (item->next)
					item->next->prev = item->prev;

				/* link at the head */
				item->prev = NULL;
				item->next = scan_locations->head;
				scan_locations->head->prev = item;
				scan_locations->head = item;
			}

			return item->location.location;
		}

		item = item->next;
	}

	/* not reached */
}
242 | | |
243 | | /* |
244 | | * ss_get_location --- get the optimal starting location for scan |
245 | | * |
246 | | * Returns the last-reported location of a sequential scan on the |
247 | | * relation, or 0 if no valid location is found. |
248 | | * |
249 | | * We expect the caller has just done RelationGetNumberOfBlocks(), and |
250 | | * so that number is passed in rather than computing it again. The result |
251 | | * is guaranteed less than relnblocks (assuming that's > 0). |
252 | | */ |
253 | | BlockNumber |
254 | | ss_get_location(Relation rel, BlockNumber relnblocks) |
255 | 0 | { |
256 | 0 | BlockNumber startloc; |
257 | |
|
258 | 0 | LWLockAcquire(SyncScanLock, LW_EXCLUSIVE); |
259 | 0 | startloc = ss_search(rel->rd_locator, 0, false); |
260 | 0 | LWLockRelease(SyncScanLock); |
261 | | |
262 | | /* |
263 | | * If the location is not a valid block number for this scan, start at 0. |
264 | | * |
265 | | * This can happen if for instance a VACUUM truncated the table since the |
266 | | * location was saved. |
267 | | */ |
268 | 0 | if (startloc >= relnblocks) |
269 | 0 | startloc = 0; |
270 | |
|
271 | | #ifdef TRACE_SYNCSCAN |
272 | | if (trace_syncscan) |
273 | | elog(LOG, |
274 | | "SYNC_SCAN: start \"%s\" (size %u) at %u", |
275 | | RelationGetRelationName(rel), relnblocks, startloc); |
276 | | #endif |
277 | |
|
278 | 0 | return startloc; |
279 | 0 | } |
280 | | |
281 | | /* |
282 | | * ss_report_location --- update the current scan location |
283 | | * |
284 | | * Writes an entry into the shared Sync Scan state of the form |
285 | | * (relfilelocator, blocknumber), overwriting any existing entry for the |
286 | | * same relfilelocator. |
287 | | */ |
288 | | void |
289 | | ss_report_location(Relation rel, BlockNumber location) |
290 | 0 | { |
291 | | #ifdef TRACE_SYNCSCAN |
292 | | if (trace_syncscan) |
293 | | { |
294 | | if ((location % 1024) == 0) |
295 | | elog(LOG, |
296 | | "SYNC_SCAN: scanning \"%s\" at %u", |
297 | | RelationGetRelationName(rel), location); |
298 | | } |
299 | | #endif |
300 | | |
301 | | /* |
302 | | * To reduce lock contention, only report scan progress every N pages. For |
303 | | * the same reason, don't block if the lock isn't immediately available. |
304 | | * Missing a few updates isn't critical, it just means that a new scan |
305 | | * that wants to join the pack will start a little bit behind the head of |
306 | | * the scan. Hopefully the pages are still in OS cache and the scan |
307 | | * catches up quickly. |
308 | | */ |
309 | 0 | if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0) |
310 | 0 | { |
311 | 0 | if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE)) |
312 | 0 | { |
313 | 0 | (void) ss_search(rel->rd_locator, location, true); |
314 | 0 | LWLockRelease(SyncScanLock); |
315 | 0 | } |
316 | | #ifdef TRACE_SYNCSCAN |
317 | | else if (trace_syncscan) |
318 | | elog(LOG, |
319 | | "SYNC_SCAN: missed update for \"%s\" at %u", |
320 | | RelationGetRelationName(rel), location); |
321 | | #endif |
322 | 0 | } |
323 | 0 | } |