Coverage Report

Created: 2025-09-27 06:52

/src/postgres/src/backend/access/common/syncscan.c
 Line| Count| Source
    1|      | /*-------------------------------------------------------------------------
    2|      |  *
    3|      |  * syncscan.c
    4|      |  *    scan synchronization support
    5|      |  *
    6|      |  * When multiple backends run a sequential scan on the same table, we try
    7|      |  * to keep them synchronized to reduce the overall I/O needed.  The goal is
    8|      |  * to read each page into shared buffer cache only once, and let all backends
    9|      |  * that take part in the shared scan process the page before it falls out of
   10|      |  * the cache.
   11|      |  *
   12|      |  * Since the "leader" in a pack of backends doing a seqscan will have to wait
   13|      |  * for I/O, while the "followers" don't, there is a strong self-synchronizing
   14|      |  * effect once we can get the backends examining approximately the same part
   15|      |  * of the table at the same time.  Hence all that is really needed is to get
   16|      |  * a new backend beginning a seqscan to begin it close to where other backends
   17|      |  * are reading.  We can scan the table circularly, from block X up to the
   18|      |  * end and then from block 0 to X-1, to ensure we visit all rows while still
   19|      |  * participating in the common scan.
   20|      |  *
   21|      |  * To accomplish that, we keep track of the scan position of each table, and
   22|      |  * start new scans close to where the previous scan(s) are.  We don't try to
   23|      |  * do any extra synchronization to keep the scans together afterwards; some
   24|      |  * scans might progress much more slowly than others, for example if the
   25|      |  * results need to be transferred to the client over a slow network, and we
   26|      |  * don't want such queries to slow down others.
   27|      |  *
   28|      |  * There can realistically only be a few large sequential scans on different
   29|      |  * tables in progress at any time.  Therefore we just keep the scan positions
   30|      |  * in a small LRU list which we scan every time we need to look up or update a
   31|      |  * scan position.  The whole mechanism is only applied for tables exceeding
   32|      |  * a threshold size (but that is not the concern of this module).
   33|      |  *
   34|      |  * INTERFACE ROUTINES
   35|      |  *    ss_get_location   - return current scan location of a relation
   36|      |  *    ss_report_location  - update current scan location
   37|      |  *
   38|      |  *
   39|      |  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
   40|      |  * Portions Copyright (c) 1994, Regents of the University of California
   41|      |  *
   42|      |  * IDENTIFICATION
   43|      |  *    src/backend/access/common/syncscan.c
   44|      |  *
   45|      |  *-------------------------------------------------------------------------
   46|      |  */
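
The circular-scan behaviour described in the header comment is implemented by the callers of this module (the sequential-scan code in heapam), not by syncscan.c itself. Purely as an illustrative sketch of how a caller might drive the two interface routines, assuming a hypothetical read_page() helper for the per-page work:

    /* Sketch only -- not part of syncscan.c or of PostgreSQL's heapam code. */
    static void
    circular_seqscan(Relation rel, BlockNumber nblocks)
    {
        /* start near where other scans on this relation are reading */
        BlockNumber start = ss_get_location(rel, nblocks);
        BlockNumber blkno = start;

        do
        {
            read_page(rel, blkno);          /* hypothetical per-page processing */
            ss_report_location(rel, blkno); /* let later scans join near us */
            if (++blkno >= nblocks)
                blkno = 0;                  /* wrap: block X..end, then 0..X-1 */
        } while (blkno != start);           /* every block visited exactly once */
    }
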
   47|      | #include "postgres.h"
   48|      |
   49|      | #include "access/syncscan.h"
   50|      | #include "miscadmin.h"
   51|      | #include "storage/lwlock.h"
   52|      | #include "storage/shmem.h"
   53|      | #include "utils/rel.h"
   54|      |
   55|      |
   56|      | /* GUC variables */
   57|      | #ifdef TRACE_SYNCSCAN
   58|      | bool    trace_syncscan = false;
   59|      | #endif
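
trace_syncscan is a developer GUC that exists only when the server is compiled with TRACE_SYNCSCAN defined; in such a build it can be switched on (SET trace_syncscan = on) to emit the elog(LOG, "SYNC_SCAN: ...") messages that appear further down in this file.
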
   60|      |
   61|      |
   62|      | /*
   63|      |  * Size of the LRU list.
   64|      |  *
   65|      |  * Note: the code assumes that SYNC_SCAN_NELEM > 1.
   66|      |  *
   67|      |  * XXX: What's a good value? It should be large enough to hold the
   68|      |  * maximum number of large tables scanned simultaneously.  But a larger value
   69|      |  * means more traversing of the LRU list when starting a new scan.
   70|      |  */
   71|     0| #define SYNC_SCAN_NELEM 20
   72|      |
   73|      | /*
   74|      |  * Interval between reports of the location of the current scan, in pages.
   75|      |  *
   76|      |  * Note: This should be smaller than the ring size (see buffer/freelist.c)
   77|      |  * we use for bulk reads.  Otherwise a scan joining other scans might start
   78|      |  * from a page that's no longer in the buffer cache.  This is a bit fuzzy;
   79|      |  * there's no guarantee that the new scan will read the page before it leaves
   80|      |  * the buffer cache anyway, and on the other hand the page is most likely
   81|      |  * still in the OS cache.
   82|      |  */
   83|     0| #define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
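
With the default BLCKSZ of 8192 bytes, SYNC_SCAN_REPORT_INTERVAL works out to 128 * 1024 / 8192 = 16, i.e. a scan reports its position once per 16 heap pages (128 kB) read, which keeps it below the 256 kB ring that freelist.c uses for bulk-read buffer access strategies.
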
   84|      |
   85|      |
   86|      | /*
   87|      |  * The scan locations structure is essentially a doubly-linked LRU with head
   88|      |  * and tail pointer, but designed to hold a fixed maximum number of elements in
   89|      |  * fixed-size shared memory.
   90|      |  */
   91|      | typedef struct ss_scan_location_t
   92|      | {
   93|      |   RelFileLocator relfilelocator;  /* identity of a relation */
   94|      |   BlockNumber location;   /* last-reported location in the relation */
   95|      | } ss_scan_location_t;
   96|      |
   97|      | typedef struct ss_lru_item_t
   98|      | {
   99|      |   struct ss_lru_item_t *prev;
  100|      |   struct ss_lru_item_t *next;
  101|      |   ss_scan_location_t location;
  102|      | } ss_lru_item_t;
  103|      |
  104|      | typedef struct ss_scan_locations_t
  105|      | {
  106|      |   ss_lru_item_t *head;
  107|      |   ss_lru_item_t *tail;
  108|      |   ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
  109|      | } ss_scan_locations_t;
  110|      |
  111|      | #define SizeOfScanLocations(N) \
  112|     0|   (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))
  113|      |
  114|      | /* Pointer to struct in shared memory */
  115|      | static ss_scan_locations_t *scan_locations;
  116|      |
  117|      | /* prototypes for internal functions */
  118|      | static BlockNumber ss_search(RelFileLocator relfilelocator,
  119|      |                BlockNumber location, bool set);
  120|      |
  121|      |
  122|      | /*
  123|      |  * SyncScanShmemSize --- report amount of shared memory space needed
  124|      |  */
  125|      | Size
  126|      | SyncScanShmemSize(void)
  127|     0| {
  128|     0|   return SizeOfScanLocations(SYNC_SCAN_NELEM);
  129|     0| }
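
For a sense of scale: on a typical 64-bit build, ss_scan_location_t is 16 bytes (a 12-byte RelFileLocator plus a 4-byte BlockNumber) and ss_lru_item_t is 32 bytes, so SizeOfScanLocations(SYNC_SCAN_NELEM) comes to roughly 16 + 20 * 32 = 656 bytes of shared memory; the exact figure depends on struct padding.
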
  130|      |
  131|      | /*
  132|      |  * SyncScanShmemInit --- initialize this module's shared memory
  133|      |  */
  134|      | void
  135|      | SyncScanShmemInit(void)
  136|     0| {
  137|     0|   int     i;
  138|     0|   bool    found;
  139|      |
  140|     0|   scan_locations = (ss_scan_locations_t *)
  141|     0|     ShmemInitStruct("Sync Scan Locations List",
  142|     0|             SizeOfScanLocations(SYNC_SCAN_NELEM),
  143|     0|             &found);
  144|      |
  145|     0|   if (!IsUnderPostmaster)
  146|     0|   {
  147|      |     /* Initialize shared memory area */
  148|     0|     Assert(!found);
  149|      |
  150|     0|     scan_locations->head = &scan_locations->items[0];
  151|     0|     scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];
  152|      |
  153|     0|     for (i = 0; i < SYNC_SCAN_NELEM; i++)
  154|     0|     {
  155|     0|       ss_lru_item_t *item = &scan_locations->items[i];
  156|      |
  157|      |       /*
  158|      |        * Initialize all slots with invalid values. As scans are started,
  159|      |        * these invalid entries will fall off the LRU list and get
  160|      |        * replaced with real entries.
  161|      |        */
  162|     0|       item->location.relfilelocator.spcOid = InvalidOid;
  163|     0|       item->location.relfilelocator.dbOid = InvalidOid;
  164|     0|       item->location.relfilelocator.relNumber = InvalidRelFileNumber;
  165|     0|       item->location.location = InvalidBlockNumber;
  166|      |
  167|     0|       item->prev = (i > 0) ?
  168|     0|         (&scan_locations->items[i - 1]) : NULL;
  169|     0|       item->next = (i < SYNC_SCAN_NELEM - 1) ?
  170|     0|         (&scan_locations->items[i + 1]) : NULL;
  171|     0|     }
  172|     0|   }
  173|     0|   else
  174|     0|     Assert(found);
  175|     0| }
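
Both shared-memory routines are called from the server's generic shared-memory setup code (src/backend/storage/ipc/ipci.c): SyncScanShmemSize contributes to the total size request, and SyncScanShmemInit runs while the postmaster (or a standalone backend) creates the segment, which is why a backend that re-attaches in an EXEC_BACKEND build expects found to already be true.
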
  176|      |
  177|      | /*
  178|      |  * ss_search --- search the scan_locations structure for an entry with the
  179|      |  *    given relfilelocator.
  180|      |  *
  181|      |  * If "set" is true, the location is updated to the given location.  If no
  182|      |  * entry for the given relfilelocator is found, it will be created at the head
  183|      |  * of the list with the given location, even if "set" is false.
  184|      |  *
  185|      |  * In any case, the location after possible update is returned.
  186|      |  *
  187|      |  * Caller is responsible for having acquired suitable lock on the shared
  188|      |  * data structure.
  189|      |  */
  190|      | static BlockNumber
  191|      | ss_search(RelFileLocator relfilelocator, BlockNumber location, bool set)
  192|     0| {
  193|     0|   ss_lru_item_t *item;
  194|      |
  195|     0|   item = scan_locations->head;
  196|     0|   for (;;)
  197|     0|   {
  198|     0|     bool    match;
  199|      |
  200|     0|     match = RelFileLocatorEquals(item->location.relfilelocator,
  201|     0|                    relfilelocator);
  202|      |
  203|     0|     if (match || item->next == NULL)
  204|     0|     {
  205|      |       /*
  206|      |        * If we reached the end of list and no match was found, take over
  207|      |        * the last entry
  208|      |        */
  209|     0|       if (!match)
  210|     0|       {
  211|     0|         item->location.relfilelocator = relfilelocator;
  212|     0|         item->location.location = location;
  213|     0|       }
  214|     0|       else if (set)
  215|     0|         item->location.location = location;
  216|      |
  217|      |       /* Move the entry to the front of the LRU list */
  218|     0|       if (item != scan_locations->head)
  219|     0|       {
  220|      |         /* unlink */
  221|     0|         if (item == scan_locations->tail)
  222|     0|           scan_locations->tail = item->prev;
  223|     0|         item->prev->next = item->next;
  224|     0|         if (item->next)
  225|     0|           item->next->prev = item->prev;
  226|      |
  227|      |         /* link */
  228|     0|         item->prev = NULL;
  229|     0|         item->next = scan_locations->head;
  230|     0|         scan_locations->head->prev = item;
  231|     0|         scan_locations->head = item;
  232|     0|       }
  233|      |
  234|     0|       return item->location.location;
  235|     0|     }
  236|      |
  237|     0|     item = item->next;
  238|     0|   }
  239|      |
  240|      |   /* not reached */
  241|     0| }
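
To trace the logic: starting from the freshly initialized, all-invalid list, ss_search(relA, 0, false) finds no match, takes over the tail entry as (relA, 0), moves it to the head, and returns 0. A later ss_search(relA, 100, true) matches that head entry, updates its stored location to 100, and returns 100, whereas the same call with set = false would leave the stored location unchanged and simply return it.
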
  242|      |
  243|      | /*
  244|      |  * ss_get_location --- get the optimal starting location for scan
  245|      |  *
  246|      |  * Returns the last-reported location of a sequential scan on the
  247|      |  * relation, or 0 if no valid location is found.
  248|      |  *
  249|      |  * We expect the caller has just done RelationGetNumberOfBlocks(), and
  250|      |  * so that number is passed in rather than computing it again.  The result
  251|      |  * is guaranteed less than relnblocks (assuming that's > 0).
  252|      |  */
  253|      | BlockNumber
  254|      | ss_get_location(Relation rel, BlockNumber relnblocks)
  255|     0| {
  256|     0|   BlockNumber startloc;
  257|      |
  258|     0|   LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
  259|     0|   startloc = ss_search(rel->rd_locator, 0, false);
  260|     0|   LWLockRelease(SyncScanLock);
  261|      |
  262|      |   /*
  263|      |    * If the location is not a valid block number for this scan, start at 0.
  264|      |    *
  265|      |    * This can happen if for instance a VACUUM truncated the table since the
  266|      |    * location was saved.
  267|      |    */
  268|     0|   if (startloc >= relnblocks)
  269|     0|     startloc = 0;
  270|      |
  271|      | #ifdef TRACE_SYNCSCAN
  272|      |   if (trace_syncscan)
  273|      |     elog(LOG,
  274|      |        "SYNC_SCAN: start \"%s\" (size %u) at %u",
  275|      |        RelationGetRelationName(rel), relnblocks, startloc);
  276|      | #endif
  277|      |
  278|     0|   return startloc;
  279|     0| }
  280|      |
  281|      | /*
  282|      |  * ss_report_location --- update the current scan location
  283|      |  *
  284|      |  * Writes an entry into the shared Sync Scan state of the form
  285|      |  * (relfilelocator, blocknumber), overwriting any existing entry for the
  286|      |  * same relfilelocator.
  287|      |  */
  288|      | void
  289|      | ss_report_location(Relation rel, BlockNumber location)
  290|     0| {
  291|      | #ifdef TRACE_SYNCSCAN
  292|      |   if (trace_syncscan)
  293|      |   {
  294|      |     if ((location % 1024) == 0)
  295|      |       elog(LOG,
  296|      |          "SYNC_SCAN: scanning \"%s\" at %u",
  297|      |          RelationGetRelationName(rel), location);
  298|      |   }
  299|      | #endif
  300|      |
  301|      |   /*
  302|      |    * To reduce lock contention, only report scan progress every N pages. For
  303|      |    * the same reason, don't block if the lock isn't immediately available.
  304|      |    * Missing a few updates isn't critical, it just means that a new scan
  305|      |    * that wants to join the pack will start a little bit behind the head of
  306|      |    * the scan.  Hopefully the pages are still in OS cache and the scan
  307|      |    * catches up quickly.
  308|      |    */
  309|     0|   if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
  310|     0|   {
  311|     0|     if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
  312|     0|     {
  313|     0|       (void) ss_search(rel->rd_locator, location, true);
  314|     0|       LWLockRelease(SyncScanLock);
  315|     0|     }
  316|      | #ifdef TRACE_SYNCSCAN
  317|      |     else if (trace_syncscan)
  318|      |       elog(LOG,
  319|      |          "SYNC_SCAN: missed update for \"%s\" at %u",
  320|      |          RelationGetRelationName(rel), location);
  321|      | #endif
  322|     0|   }
  323|     0| }