/src/postgres/src/backend/access/common/syncscan.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * syncscan.c |
4 | | * scan synchronization support |
5 | | * |
6 | | * When multiple backends run a sequential scan on the same table, we try |
7 | | * to keep them synchronized to reduce the overall I/O needed. The goal is |
8 | | * to read each page into shared buffer cache only once, and let all backends |
9 | | * that take part in the shared scan process the page before it falls out of |
10 | | * the cache. |
11 | | * |
12 | | * Since the "leader" in a pack of backends doing a seqscan will have to wait |
13 | | * for I/O, while the "followers" don't, there is a strong self-synchronizing |
14 | | * effect once we can get the backends examining approximately the same part |
15 | | * of the table at the same time. Hence all that is really needed is to get |
16 | | * a new backend beginning a seqscan to begin it close to where other backends |
17 | | * are reading. We can scan the table circularly, from block X up to the |
18 | | * end and then from block 0 to X-1, to ensure we visit all rows while still |
19 | | * participating in the common scan. |
20 | | * |
21 | | * To accomplish that, we keep track of the scan position of each table, and |
22 | | * start new scans close to where the previous scan(s) are. We don't try to |
23 | | * do any extra synchronization to keep the scans together afterwards; some |
24 | | * scans might progress much more slowly than others, for example if the |
25 | | * results need to be transferred to the client over a slow network, and we |
26 | | * don't want such queries to slow down others. |
27 | | * |
28 | | * There can realistically only be a few large sequential scans on different |
29 | | * tables in progress at any time. Therefore we just keep the scan positions |
30 | | * in a small LRU list which we scan every time we need to look up or update a |
31 | | * scan position. The whole mechanism is only applied for tables exceeding |
32 | | * a threshold size (but that is not the concern of this module). |
33 | | * |
34 | | * INTERFACE ROUTINES |
35 | | * ss_get_location - return current scan location of a relation |
36 | | * ss_report_location - update current scan location |
37 | | * |
38 | | * |
39 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
40 | | * Portions Copyright (c) 1994, Regents of the University of California |
41 | | * |
42 | | * IDENTIFICATION |
43 | | * src/backend/access/common/syncscan.c |
44 | | * |
45 | | *------------------------------------------------------------------------- |
46 | | */ |
47 | | #include "postgres.h" |
48 | | |
49 | | #include "access/syncscan.h" |
50 | | #include "miscadmin.h" |
51 | | #include "storage/lwlock.h" |
52 | | #include "storage/shmem.h" |
53 | | #include "utils/rel.h" |
54 | | |
55 | | |
56 | | /* GUC variables */ |
57 | | #ifdef TRACE_SYNCSCAN |
58 | | bool trace_syncscan = false; |
59 | | #endif |
60 | | |
61 | | |
62 | | /* |
63 | | * Size of the LRU list. |
64 | | * |
65 | | * Note: the code assumes that SYNC_SCAN_NELEM > 1. |
66 | | * |
67 | | * XXX: What's a good value? It should be large enough to hold the |
68 | | * maximum number of large tables scanned simultaneously. But a larger value |
69 | | * means more traversing of the LRU list when starting a new scan. |
70 | | */ |
71 | 0 | #define SYNC_SCAN_NELEM 20 |
72 | | |
73 | | /* |
74 | | * Interval between reports of the location of the current scan, in pages. |
75 | | * |
76 | | * Note: This should be smaller than the ring size (see buffer/freelist.c) |
77 | | * we use for bulk reads. Otherwise a scan joining other scans might start |
78 | | * from a page that's no longer in the buffer cache. This is a bit fuzzy; |
79 | | * there's no guarantee that the new scan will read the page before it leaves |
80 | | * the buffer cache anyway, and on the other hand the page is most likely |
81 | | * still in the OS cache. |
82 | | */ |
83 | 0 | #define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ) |
84 | | |
85 | | |
/*
 * The scan locations structure is essentially a doubly-linked LRU with head
 * and tail pointer, but designed to hold a fixed maximum number of elements in
 * fixed-size shared memory.
 */
typedef struct ss_scan_location_t
{
	RelFileLocator relfilelocator;	/* identity of a relation */
	BlockNumber location;		/* last-reported location in the relation */
} ss_scan_location_t;

typedef struct ss_lru_item_t
{
	struct ss_lru_item_t *prev; /* previous (more recently used) item, or
								 * NULL if this item is the list head */
	struct ss_lru_item_t *next; /* next (less recently used) item, or NULL if
								 * this item is the list tail */
	ss_scan_location_t location;	/* the saved scan position itself */
} ss_lru_item_t;

typedef struct ss_scan_locations_t
{
	ss_lru_item_t *head;		/* most recently used entry */
	ss_lru_item_t *tail;		/* least recently used entry; first to be
								 * recycled when a new relation is seen */
	ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
} ss_scan_locations_t;
110 | | |
111 | | #define SizeOfScanLocations(N) \ |
112 | 0 | (offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t)) |
113 | | |
114 | | /* Pointer to struct in shared memory */ |
115 | | static ss_scan_locations_t *scan_locations; |
116 | | |
117 | | /* prototypes for internal functions */ |
118 | | static BlockNumber ss_search(RelFileLocator relfilelocator, |
119 | | BlockNumber location, bool set); |
120 | | |
121 | | |
122 | | /* |
123 | | * SyncScanShmemSize --- report amount of shared memory space needed |
124 | | */ |
125 | | Size |
126 | | SyncScanShmemSize(void) |
127 | 0 | { |
128 | 0 | return SizeOfScanLocations(SYNC_SCAN_NELEM); |
129 | 0 | } |
130 | | |
131 | | /* |
132 | | * SyncScanShmemInit --- initialize this module's shared memory |
133 | | */ |
134 | | void |
135 | | SyncScanShmemInit(void) |
136 | 0 | { |
137 | 0 | int i; |
138 | 0 | bool found; |
139 | |
|
140 | 0 | scan_locations = (ss_scan_locations_t *) |
141 | 0 | ShmemInitStruct("Sync Scan Locations List", |
142 | 0 | SizeOfScanLocations(SYNC_SCAN_NELEM), |
143 | 0 | &found); |
144 | |
|
145 | 0 | if (!IsUnderPostmaster) |
146 | 0 | { |
147 | | /* Initialize shared memory area */ |
148 | 0 | Assert(!found); |
149 | |
|
150 | 0 | scan_locations->head = &scan_locations->items[0]; |
151 | 0 | scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1]; |
152 | |
|
153 | 0 | for (i = 0; i < SYNC_SCAN_NELEM; i++) |
154 | 0 | { |
155 | 0 | ss_lru_item_t *item = &scan_locations->items[i]; |
156 | | |
157 | | /* |
158 | | * Initialize all slots with invalid values. As scans are started, |
159 | | * these invalid entries will fall off the LRU list and get |
160 | | * replaced with real entries. |
161 | | */ |
162 | 0 | item->location.relfilelocator.spcOid = InvalidOid; |
163 | 0 | item->location.relfilelocator.dbOid = InvalidOid; |
164 | 0 | item->location.relfilelocator.relNumber = InvalidRelFileNumber; |
165 | 0 | item->location.location = InvalidBlockNumber; |
166 | |
|
167 | 0 | item->prev = (i > 0) ? |
168 | 0 | (&scan_locations->items[i - 1]) : NULL; |
169 | 0 | item->next = (i < SYNC_SCAN_NELEM - 1) ? |
170 | 0 | (&scan_locations->items[i + 1]) : NULL; |
171 | 0 | } |
172 | 0 | } |
173 | 0 | else |
174 | 0 | Assert(found); |
175 | 0 | } |
176 | | |
/*
 * ss_search --- search the scan_locations structure for an entry with the
 *		given relfilelocator.
 *
 * If "set" is true, the location is updated to the given location.  If no
 * entry for the given relfilelocator is found, it will be created at the head
 * of the list with the given location, even if "set" is false.
 *
 * In any case, the location after possible update is returned.
 *
 * Caller is responsible for having acquired suitable lock on the shared
 * data structure.
 */
static BlockNumber
ss_search(RelFileLocator relfilelocator, BlockNumber location, bool set)
{
	ss_lru_item_t *item;

	/*
	 * Walk the LRU list from most- to least-recently-used.  The loop always
	 * terminates: the list is non-empty (SYNC_SCAN_NELEM entries, linked at
	 * init time) and we stop at the item whose "next" is NULL even when no
	 * match is found.
	 */
	item = scan_locations->head;
	for (;;)
	{
		bool		match;

		match = RelFileLocatorEquals(item->location.relfilelocator,
									 relfilelocator);

		if (match || item->next == NULL)
		{
			/*
			 * If we reached the end of list and no match was found, take over
			 * the last entry
			 */
			if (!match)
			{
				item->location.relfilelocator = relfilelocator;
				item->location.location = location;
			}
			else if (set)
				item->location.location = location;

			/* Move the entry to the front of the LRU list */
			if (item != scan_locations->head)
			{
				/*
				 * unlink; item->prev is necessarily non-NULL here since item
				 * is not the head
				 */
				if (item == scan_locations->tail)
					scan_locations->tail = item->prev;
				item->prev->next = item->next;
				if (item->next)
					item->next->prev = item->prev;

				/* link at the head */
				item->prev = NULL;
				item->next = scan_locations->head;
				scan_locations->head->prev = item;
				scan_locations->head = item;
			}

			return item->location.location;
		}

		item = item->next;
	}

	/* not reached */
}
242 | | |
243 | | /* |
244 | | * ss_get_location --- get the optimal starting location for scan |
245 | | * |
246 | | * Returns the last-reported location of a sequential scan on the |
247 | | * relation, or 0 if no valid location is found. |
248 | | * |
249 | | * We expect the caller has just done RelationGetNumberOfBlocks(), and |
250 | | * so that number is passed in rather than computing it again. The result |
251 | | * is guaranteed less than relnblocks (assuming that's > 0). |
252 | | */ |
253 | | BlockNumber |
254 | | ss_get_location(Relation rel, BlockNumber relnblocks) |
255 | 0 | { |
256 | 0 | BlockNumber startloc; |
257 | |
|
258 | 0 | LWLockAcquire(SyncScanLock, LW_EXCLUSIVE); |
259 | 0 | startloc = ss_search(rel->rd_locator, 0, false); |
260 | 0 | LWLockRelease(SyncScanLock); |
261 | | |
262 | | /* |
263 | | * If the location is not a valid block number for this scan, start at 0. |
264 | | * |
265 | | * This can happen if for instance a VACUUM truncated the table since the |
266 | | * location was saved. |
267 | | */ |
268 | 0 | if (startloc >= relnblocks) |
269 | 0 | startloc = 0; |
270 | |
|
271 | | #ifdef TRACE_SYNCSCAN |
272 | | if (trace_syncscan) |
273 | | elog(LOG, |
274 | | "SYNC_SCAN: start \"%s\" (size %u) at %u", |
275 | | RelationGetRelationName(rel), relnblocks, startloc); |
276 | | #endif |
277 | |
|
278 | 0 | return startloc; |
279 | 0 | } |
280 | | |
281 | | /* |
282 | | * ss_report_location --- update the current scan location |
283 | | * |
284 | | * Writes an entry into the shared Sync Scan state of the form |
285 | | * (relfilelocator, blocknumber), overwriting any existing entry for the |
286 | | * same relfilelocator. |
287 | | */ |
288 | | void |
289 | | ss_report_location(Relation rel, BlockNumber location) |
290 | 0 | { |
291 | | #ifdef TRACE_SYNCSCAN |
292 | | if (trace_syncscan) |
293 | | { |
294 | | if ((location % 1024) == 0) |
295 | | elog(LOG, |
296 | | "SYNC_SCAN: scanning \"%s\" at %u", |
297 | | RelationGetRelationName(rel), location); |
298 | | } |
299 | | #endif |
300 | | |
301 | | /* |
302 | | * To reduce lock contention, only report scan progress every N pages. For |
303 | | * the same reason, don't block if the lock isn't immediately available. |
304 | | * Missing a few updates isn't critical, it just means that a new scan |
305 | | * that wants to join the pack will start a little bit behind the head of |
306 | | * the scan. Hopefully the pages are still in OS cache and the scan |
307 | | * catches up quickly. |
308 | | */ |
309 | 0 | if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0) |
310 | 0 | { |
311 | 0 | if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE)) |
312 | 0 | { |
313 | 0 | (void) ss_search(rel->rd_locator, location, true); |
314 | 0 | LWLockRelease(SyncScanLock); |
315 | 0 | } |
316 | | #ifdef TRACE_SYNCSCAN |
317 | | else if (trace_syncscan) |
318 | | elog(LOG, |
319 | | "SYNC_SCAN: missed update for \"%s\" at %u", |
320 | | RelationGetRelationName(rel), location); |
321 | | #endif |
322 | 0 | } |
323 | 0 | } |