/src/postgres/src/backend/storage/buffer/bufmgr.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * bufmgr.c |
4 | | * buffer manager interface routines |
5 | | * |
6 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/backend/storage/buffer/bufmgr.c |
12 | | * |
13 | | *------------------------------------------------------------------------- |
14 | | */ |
15 | | /* |
16 | | * Principal entry points: |
17 | | * |
18 | | * ReadBuffer() -- find or create a buffer holding the requested page, |
19 | | * and pin it so that no one can destroy it while this process |
20 | | * is using it. |
21 | | * |
22 | | * StartReadBuffer() -- as above, with separate wait step |
23 | | * StartReadBuffers() -- multiple block version |
24 | | * WaitReadBuffers() -- second step of above |
25 | | * |
26 | | * ReleaseBuffer() -- unpin a buffer |
27 | | * |
28 | | * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty". |
29 | | * The disk write is delayed until buffer replacement or checkpoint. |
30 | | * |
31 | | * See also these files: |
32 | | * freelist.c -- chooses victim for buffer replacement |
33 | | * buf_table.c -- manages the buffer lookup table |
34 | | */ |
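/*
 * Illustrative sketch of the caller-side sequence implied by the entry
 * points above: pin, lock, modify, mark dirty, release.  Hypothetical helper
 * for illustration only; real callers must also WAL-log their changes
 * (omitted here) -- see access/transam/README.
 */
#ifdef NOT_USED
static void
example_touch_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf;
	Page		page;

	buf = ReadBuffer(rel, blkno);			/* find or read in the block, and pin it */
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);	/* content lock needed to modify the page */
	page = BufferGetPage(buf);

	/* ... modify the page contents here ... */
	(void) page;

	MarkBufferDirty(buf);					/* actual write happens at replacement/checkpoint */
	UnlockReleaseBuffer(buf);				/* drop the content lock and the pin */
}
#endif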
35 | | #include "postgres.h" |
36 | | |
37 | | #include <sys/file.h> |
38 | | #include <unistd.h> |
39 | | |
40 | | #include "access/tableam.h" |
41 | | #include "access/xloginsert.h" |
42 | | #include "access/xlogutils.h" |
43 | | #ifdef USE_ASSERT_CHECKING |
44 | | #include "catalog/pg_tablespace_d.h" |
45 | | #endif |
46 | | #include "catalog/storage.h" |
47 | | #include "catalog/storage_xlog.h" |
48 | | #include "executor/instrument.h" |
49 | | #include "lib/binaryheap.h" |
50 | | #include "miscadmin.h" |
51 | | #include "pg_trace.h" |
52 | | #include "pgstat.h" |
53 | | #include "postmaster/bgwriter.h" |
54 | | #include "storage/aio.h" |
55 | | #include "storage/buf_internals.h" |
56 | | #include "storage/bufmgr.h" |
57 | | #include "storage/fd.h" |
58 | | #include "storage/ipc.h" |
59 | | #include "storage/lmgr.h" |
60 | | #include "storage/proc.h" |
61 | | #include "storage/read_stream.h" |
62 | | #include "storage/smgr.h" |
63 | | #include "storage/standby.h" |
64 | | #include "utils/memdebug.h" |
65 | | #include "utils/ps_status.h" |
66 | | #include "utils/rel.h" |
67 | | #include "utils/resowner.h" |
68 | | #include "utils/timestamp.h" |
69 | | |
70 | | |
71 | | /* Note: these two macros only work on shared buffers, not local ones! */ |
72 | 0 | #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) |
73 | 0 | #define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr))) |
74 | | |
75 | | /* Note: this macro only works on local buffers, not shared ones! */ |
76 | | #define LocalBufHdrGetBlock(bufHdr) \ |
77 | | LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)] |
78 | | |
79 | | /* Bits in SyncOneBuffer's return value */ |
80 | 0 | #define BUF_WRITTEN 0x01 |
81 | 0 | #define BUF_REUSABLE 0x02 |
82 | | |
83 | 0 | #define RELS_BSEARCH_THRESHOLD 20 |
84 | | |
85 | | /* |
86 | | * This is the size (in blocks) above which we scan the entire buffer |
87 | | * pool to remove the buffers for all the pages of the relation being |
88 | | * dropped. For relations smaller than this threshold, we find the |
89 | | * buffers by doing lookups in the BufMapping table. |
90 | | */ |
91 | 0 | #define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32) |
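/*
 * Worked example (illustrative): with shared_buffers = 128MB, NBuffers is
 * 16384 8kB buffers, so the threshold is 16384 / 32 = 512 blocks (4MB).  A
 * relation below that size is cleaned out via per-block BufMapping lookups;
 * a larger one triggers a scan of the whole buffer pool instead.
 */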
92 | | |
93 | | typedef struct PrivateRefCountEntry |
94 | | { |
95 | | Buffer buffer; |
96 | | int32 refcount; |
97 | | } PrivateRefCountEntry; |
98 | | |
99 | | /* 64 bytes, about the size of a cache line on common systems */ |
100 | 0 | #define REFCOUNT_ARRAY_ENTRIES 8 |
101 | | |
102 | | /* |
103 | | * Status of buffers to checkpoint for a particular tablespace, used |
104 | | * internally in BufferSync. |
105 | | */ |
106 | | typedef struct CkptTsStatus |
107 | | { |
108 | | /* oid of the tablespace */ |
109 | | Oid tsId; |
110 | | |
111 | | /* |
112 | | * Checkpoint progress for this tablespace. To make progress comparable |
113 | | * between tablespaces the progress is, for each tablespace, measured as a |
114 | | * number between 0 and the total number of to-be-checkpointed pages. Each |
115 | | * page checkpointed in this tablespace increments this space's progress |
116 | | * by progress_slice. |
117 | | */ |
118 | | float8 progress; |
119 | | float8 progress_slice; |
120 | | |
121 | | /* number of to-be-checkpointed pages in this tablespace */ |
122 | | int num_to_scan; |
123 | | /* already processed pages in this tablespace */ |
124 | | int num_scanned; |
125 | | |
126 | | /* current offset in CkptBufferIds for this tablespace */ |
127 | | int index; |
128 | | } CkptTsStatus; |
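/*
 * Worked example (illustrative): if a checkpoint must write 1000 pages in
 * total and 250 of them belong to this tablespace, progress_slice is
 * 1000 / 250 = 4.0; after all 250 pages are written, progress reaches 1000,
 * the same endpoint as every other tablespace.  That is what makes progress
 * comparable across tablespaces of different sizes.
 */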
129 | | |
130 | | /* |
131 | | * Type for array used to sort SMgrRelations |
132 | | * |
133 | | * FlushRelationsAllBuffers shares the same comparator function with |
134 | | * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be |
135 | | * compatible. |
136 | | */ |
137 | | typedef struct SMgrSortArray |
138 | | { |
139 | | RelFileLocator rlocator; /* This must be the first member */ |
140 | | SMgrRelation srel; |
141 | | } SMgrSortArray; |
142 | | |
143 | | /* GUC variables */ |
144 | | bool zero_damaged_pages = false; |
145 | | int bgwriter_lru_maxpages = 100; |
146 | | double bgwriter_lru_multiplier = 2.0; |
147 | | bool track_io_timing = false; |
148 | | |
149 | | /* |
150 | | * How many buffers PrefetchBuffer callers should try to stay ahead of their |
151 | | * ReadBuffer calls by. Zero means "never prefetch". This value is only used |
152 | | * for buffers not belonging to tablespaces that have their |
153 | | * effective_io_concurrency parameter set. |
154 | | */ |
155 | | int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY; |
156 | | |
157 | | /* |
158 | | * Like effective_io_concurrency, but used by maintenance code paths that might |
159 | | * benefit from a higher setting because they work on behalf of many sessions. |
160 | | * Overridden by the tablespace setting of the same name. |
161 | | */ |
162 | | int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY; |
163 | | |
164 | | /* |
165 | | * Limit on how many blocks should be handled in a single I/O operation. |
166 | | * StartReadBuffers() callers should respect it, as should other operations |
167 | | * that call smgr APIs directly. It is computed as the minimum of underlying |
168 | | * GUCs io_combine_limit_guc and io_max_combine_limit. |
169 | | */ |
170 | | int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT; |
171 | | int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT; |
172 | | int io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT; |
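/*
 * Illustrative sketch of the relationship stated above (not the actual
 * assignment site, which lives in the GUC machinery): the effective limit is
 * the smaller of the user-settable GUC and the startup-time cap, i.e.
 *
 *		io_combine_limit = Min(io_combine_limit_guc, io_max_combine_limit);
 */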
173 | | |
174 | | /* |
175 | | * GUC variables about triggering kernel writeback for buffers written; OS |
176 | | * dependent defaults are set via the GUC mechanism. |
177 | | */ |
178 | | int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER; |
179 | | int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER; |
180 | | int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER; |
181 | | |
182 | | /* local state for LockBufferForCleanup */ |
183 | | static BufferDesc *PinCountWaitBuf = NULL; |
184 | | |
185 | | /* |
186 | | * Backend-Private refcount management: |
187 | | * |
188 | | * Each buffer also has a private refcount that keeps track of the number of |
189 | | * times the buffer is pinned in the current process. This is so that the |
190 | | * shared refcount needs to be modified only once if a buffer is pinned more |
191 | | * than once by an individual backend. It's also used to check that no buffers |
192 | | * are still pinned at the end of transactions and when exiting. |
193 | | * |
194 | | * |
195 | | * To avoid - as we used to - requiring an array with NBuffers entries to keep |
196 | | * track of local buffers, we use a small sequentially searched array |
197 | | * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to |
198 | | * keep track of backend local pins. |
199 | | * |
200 | | * As long as no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, |
201 | | * all refcounts are kept track of in the array; after that, new array entries |
202 | | * displace old ones into the hash table. That way a frequently used entry |
203 | | * can't get "stuck" in the hashtable while infrequent ones clog the array. |
204 | | * |
205 | | * Note that in most scenarios the number of pinned buffers will not exceed |
206 | | * REFCOUNT_ARRAY_ENTRIES. |
207 | | * |
208 | | * |
209 | | * To enter a buffer into the refcount tracking mechanism first reserve a free |
210 | | * entry using ReservePrivateRefCountEntry() and then later, if necessary, |
211 | | * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing |
212 | | * memory allocations in NewPrivateRefCountEntry() which can be important |
213 | | * because in some scenarios it's called with a spinlock held... |
214 | | */ |
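/*
 * Illustrative sketch of the reserve-then-fill pattern described above
 * (hypothetical helper; the real callers are the pinning routines later in
 * this file).  The reservation is made before any spinlock is taken, so
 * NewPrivateRefCountEntry() never has to allocate while a buffer header
 * spinlock is held.
 */
#ifdef NOT_USED
static void
example_track_new_pin(BufferDesc *buf)
{
	PrivateRefCountEntry *ref;
	Buffer		b = BufferDescriptorGetBuffer(buf);

	ReservePrivateRefCountEntry();		/* may displace an entry into the hash table */

	/* ... acquire the pin, possibly while holding the buffer header spinlock ... */

	ref = NewPrivateRefCountEntry(b);	/* guaranteed not to allocate */
	ref->refcount++;
}
#endif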
215 | | static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]; |
216 | | static HTAB *PrivateRefCountHash = NULL; |
217 | | static int32 PrivateRefCountOverflowed = 0; |
218 | | static uint32 PrivateRefCountClock = 0; |
219 | | static PrivateRefCountEntry *ReservedRefCountEntry = NULL; |
220 | | |
221 | | static uint32 MaxProportionalPins; |
222 | | |
223 | | static void ReservePrivateRefCountEntry(void); |
224 | | static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer); |
225 | | static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move); |
226 | | static inline int32 GetPrivateRefCount(Buffer buffer); |
227 | | static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref); |
228 | | |
229 | | /* ResourceOwner callbacks to hold in-progress I/Os and buffer pins */ |
230 | | static void ResOwnerReleaseBufferIO(Datum res); |
231 | | static char *ResOwnerPrintBufferIO(Datum res); |
232 | | static void ResOwnerReleaseBufferPin(Datum res); |
233 | | static char *ResOwnerPrintBufferPin(Datum res); |
234 | | |
235 | | const ResourceOwnerDesc buffer_io_resowner_desc = |
236 | | { |
237 | | .name = "buffer io", |
238 | | .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS, |
239 | | .release_priority = RELEASE_PRIO_BUFFER_IOS, |
240 | | .ReleaseResource = ResOwnerReleaseBufferIO, |
241 | | .DebugPrint = ResOwnerPrintBufferIO |
242 | | }; |
243 | | |
244 | | const ResourceOwnerDesc buffer_pin_resowner_desc = |
245 | | { |
246 | | .name = "buffer pin", |
247 | | .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS, |
248 | | .release_priority = RELEASE_PRIO_BUFFER_PINS, |
249 | | .ReleaseResource = ResOwnerReleaseBufferPin, |
250 | | .DebugPrint = ResOwnerPrintBufferPin |
251 | | }; |
252 | | |
253 | | /* |
254 | | * Ensure that the PrivateRefCountArray has sufficient space to store one more |
255 | | * entry. This has to be called before using NewPrivateRefCountEntry() to fill |
256 | | * a new entry - but it's perfectly fine to not use a reserved entry. |
257 | | */ |
258 | | static void |
259 | | ReservePrivateRefCountEntry(void) |
260 | 0 | { |
261 | | /* Already reserved (or freed), nothing to do */ |
262 | 0 | if (ReservedRefCountEntry != NULL) |
263 | 0 | return; |
264 | | |
265 | | /* |
266 | | * First search for a free entry in the array, that'll be sufficient in the |
267 | | * majority of cases. |
268 | | */ |
269 | 0 | { |
270 | 0 | int i; |
271 | |
|
272 | 0 | for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) |
273 | 0 | { |
274 | 0 | PrivateRefCountEntry *res; |
275 | |
|
276 | 0 | res = &PrivateRefCountArray[i]; |
277 | |
|
278 | 0 | if (res->buffer == InvalidBuffer) |
279 | 0 | { |
280 | 0 | ReservedRefCountEntry = res; |
281 | 0 | return; |
282 | 0 | } |
283 | 0 | } |
284 | 0 | } |
285 | | |
286 | | /* |
287 | | * No luck. All array entries are full. Move one array entry into the hash |
288 | | * table. |
289 | | */ |
290 | 0 | { |
291 | | /* |
292 | | * Move entry from the current clock position in the array into the |
293 | | * hashtable. Use that slot. |
294 | | */ |
295 | 0 | PrivateRefCountEntry *hashent; |
296 | 0 | bool found; |
297 | | |
298 | | /* select victim slot */ |
299 | 0 | ReservedRefCountEntry = |
300 | 0 | &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES]; |
301 | | |
302 | | /* Better be used, otherwise we shouldn't get here. */ |
303 | 0 | Assert(ReservedRefCountEntry->buffer != InvalidBuffer); |
304 | | |
305 | | /* enter victim array entry into hashtable */ |
306 | 0 | hashent = hash_search(PrivateRefCountHash, |
307 | 0 | &(ReservedRefCountEntry->buffer), |
308 | 0 | HASH_ENTER, |
309 | 0 | &found); |
310 | 0 | Assert(!found); |
311 | 0 | hashent->refcount = ReservedRefCountEntry->refcount; |
312 | | |
313 | | /* clear the now free array slot */ |
314 | 0 | ReservedRefCountEntry->buffer = InvalidBuffer; |
315 | 0 | ReservedRefCountEntry->refcount = 0; |
316 | |
|
317 | 0 | PrivateRefCountOverflowed++; |
318 | 0 | } |
319 | 0 | } |
320 | | |
321 | | /* |
322 | | * Fill a previously reserved refcount entry. |
323 | | */ |
324 | | static PrivateRefCountEntry * |
325 | | NewPrivateRefCountEntry(Buffer buffer) |
326 | 0 | { |
327 | 0 | PrivateRefCountEntry *res; |
328 | | |
329 | | /* only allowed to be called when a reservation has been made */ |
330 | 0 | Assert(ReservedRefCountEntry != NULL); |
331 | | |
332 | | /* use up the reserved entry */ |
333 | 0 | res = ReservedRefCountEntry; |
334 | 0 | ReservedRefCountEntry = NULL; |
335 | | |
336 | | /* and fill it */ |
337 | 0 | res->buffer = buffer; |
338 | 0 | res->refcount = 0; |
339 | |
|
340 | 0 | return res; |
341 | 0 | } |
342 | | |
343 | | /* |
344 | | * Return the PrivateRefCount entry for the passed buffer. |
345 | | * |
346 | | * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if |
347 | | * do_move is true and the entry resides in the hashtable, the entry is |
348 | | * moved to the array to optimize for frequent access. |
349 | | */ |
350 | | static PrivateRefCountEntry * |
351 | | GetPrivateRefCountEntry(Buffer buffer, bool do_move) |
352 | 0 | { |
353 | 0 | PrivateRefCountEntry *res; |
354 | 0 | int i; |
355 | |
|
356 | 0 | Assert(BufferIsValid(buffer)); |
357 | 0 | Assert(!BufferIsLocal(buffer)); |
358 | | |
359 | | /* |
360 | | * First search for references in the array, that'll be sufficient in the |
361 | | * majority of cases. |
362 | | */ |
363 | 0 | for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) |
364 | 0 | { |
365 | 0 | res = &PrivateRefCountArray[i]; |
366 | |
|
367 | 0 | if (res->buffer == buffer) |
368 | 0 | return res; |
369 | 0 | } |
370 | | |
371 | | /* |
372 | | * By here we know that the buffer, if already pinned, isn't residing in |
373 | | * the array. |
374 | | * |
375 | | * Only look up the buffer in the hashtable if we've previously overflowed |
376 | | * into it. |
377 | | */ |
378 | 0 | if (PrivateRefCountOverflowed == 0) |
379 | 0 | return NULL; |
380 | | |
381 | 0 | res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL); |
382 | |
|
383 | 0 | if (res == NULL) |
384 | 0 | return NULL; |
385 | 0 | else if (!do_move) |
386 | 0 | { |
387 | | /* caller doesn't want us to move the hash entry into the array */ |
388 | 0 | return res; |
389 | 0 | } |
390 | 0 | else |
391 | 0 | { |
392 | | /* move buffer from hashtable into the free array slot */ |
393 | 0 | bool found; |
394 | 0 | PrivateRefCountEntry *free; |
395 | | |
396 | | /* Ensure there's a free array slot */ |
397 | 0 | ReservePrivateRefCountEntry(); |
398 | | |
399 | | /* Use up the reserved slot */ |
400 | 0 | Assert(ReservedRefCountEntry != NULL); |
401 | 0 | free = ReservedRefCountEntry; |
402 | 0 | ReservedRefCountEntry = NULL; |
403 | 0 | Assert(free->buffer == InvalidBuffer); |
404 | | |
405 | | /* and fill it */ |
406 | 0 | free->buffer = buffer; |
407 | 0 | free->refcount = res->refcount; |
408 | | |
409 | | /* delete from hashtable */ |
410 | 0 | hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found); |
411 | 0 | Assert(found); |
412 | 0 | Assert(PrivateRefCountOverflowed > 0); |
413 | 0 | PrivateRefCountOverflowed--; |
414 | |
|
415 | 0 | return free; |
416 | 0 | } |
417 | 0 | } |
418 | | |
419 | | /* |
420 | | * Returns how many times the passed buffer is pinned by this backend. |
421 | | * |
422 | | * Only works for shared memory buffers! |
423 | | */ |
424 | | static inline int32 |
425 | | GetPrivateRefCount(Buffer buffer) |
426 | 0 | { |
427 | 0 | PrivateRefCountEntry *ref; |
428 | |
|
429 | 0 | Assert(BufferIsValid(buffer)); |
430 | 0 | Assert(!BufferIsLocal(buffer)); |
431 | | |
432 | | /* |
433 | | * Not moving the entry - that's ok for the current users, but we might |
434 | | * want to change this one day. |
435 | | */ |
436 | 0 | ref = GetPrivateRefCountEntry(buffer, false); |
437 | |
|
438 | 0 | if (ref == NULL) |
439 | 0 | return 0; |
440 | 0 | return ref->refcount; |
441 | 0 | } |
442 | | |
443 | | /* |
444 | | * Release resources used to track the reference count of a buffer which we no |
445 | | * longer have pinned and don't want to pin again immediately. |
446 | | */ |
447 | | static void |
448 | | ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref) |
449 | 0 | { |
450 | 0 | Assert(ref->refcount == 0); |
451 | |
|
452 | 0 | if (ref >= &PrivateRefCountArray[0] && |
453 | 0 | ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]) |
454 | 0 | { |
455 | 0 | ref->buffer = InvalidBuffer; |
456 | | |
457 | | /* |
458 | | * Mark the just used entry as reserved - in many scenarios that |
459 | | * allows us to avoid ever having to search the array/hash for free |
460 | | * entries. |
461 | | */ |
462 | 0 | ReservedRefCountEntry = ref; |
463 | 0 | } |
464 | 0 | else |
465 | 0 | { |
466 | 0 | bool found; |
467 | 0 | Buffer buffer = ref->buffer; |
468 | |
|
469 | 0 | hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found); |
470 | 0 | Assert(found); |
471 | 0 | Assert(PrivateRefCountOverflowed > 0); |
472 | 0 | PrivateRefCountOverflowed--; |
473 | 0 | } |
474 | 0 | } |
475 | | |
476 | | /* |
477 | | * BufferIsPinned |
478 | | * True iff the buffer is pinned (also checks for valid buffer number). |
479 | | * |
480 | | * NOTE: what we check here is that *this* backend holds a pin on |
481 | | * the buffer. We do not care whether some other backend does. |
482 | | */ |
483 | | #define BufferIsPinned(bufnum) \ |
484 | | ( \ |
485 | | !BufferIsValid(bufnum) ? \ |
486 | | false \ |
487 | | : \ |
488 | | BufferIsLocal(bufnum) ? \ |
489 | | (LocalRefCount[-(bufnum) - 1] > 0) \ |
490 | | : \ |
491 | | (GetPrivateRefCount(bufnum) > 0) \ |
492 | | ) |
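/*
 * Illustrative use, matching how this file employs the macro: operations
 * that require the caller to hold a pin simply assert it, e.g.
 *
 *		Assert(BufferIsPinned(buffer));
 */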
493 | | |
494 | | |
495 | | static Buffer ReadBuffer_common(Relation rel, |
496 | | SMgrRelation smgr, char smgr_persistence, |
497 | | ForkNumber forkNum, BlockNumber blockNum, |
498 | | ReadBufferMode mode, BufferAccessStrategy strategy); |
499 | | static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, |
500 | | ForkNumber fork, |
501 | | BufferAccessStrategy strategy, |
502 | | uint32 flags, |
503 | | uint32 extend_by, |
504 | | BlockNumber extend_upto, |
505 | | Buffer *buffers, |
506 | | uint32 *extended_by); |
507 | | static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, |
508 | | ForkNumber fork, |
509 | | BufferAccessStrategy strategy, |
510 | | uint32 flags, |
511 | | uint32 extend_by, |
512 | | BlockNumber extend_upto, |
513 | | Buffer *buffers, |
514 | | uint32 *extended_by); |
515 | | static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy); |
516 | | static void PinBuffer_Locked(BufferDesc *buf); |
517 | | static void UnpinBuffer(BufferDesc *buf); |
518 | | static void UnpinBufferNoOwner(BufferDesc *buf); |
519 | | static void BufferSync(int flags); |
520 | | static uint32 WaitBufHdrUnlocked(BufferDesc *buf); |
521 | | static int SyncOneBuffer(int buf_id, bool skip_recently_used, |
522 | | WritebackContext *wb_context); |
523 | | static void WaitIO(BufferDesc *buf); |
524 | | static void AbortBufferIO(Buffer buffer); |
525 | | static void shared_buffer_write_error_callback(void *arg); |
526 | | static void local_buffer_write_error_callback(void *arg); |
527 | | static inline BufferDesc *BufferAlloc(SMgrRelation smgr, |
528 | | char relpersistence, |
529 | | ForkNumber forkNum, |
530 | | BlockNumber blockNum, |
531 | | BufferAccessStrategy strategy, |
532 | | bool *foundPtr, IOContext io_context); |
533 | | static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress); |
534 | | static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete); |
535 | | static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context); |
536 | | static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, |
537 | | IOObject io_object, IOContext io_context); |
538 | | static void FindAndDropRelationBuffers(RelFileLocator rlocator, |
539 | | ForkNumber forkNum, |
540 | | BlockNumber nForkBlock, |
541 | | BlockNumber firstDelBlock); |
542 | | static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, |
543 | | RelFileLocator dstlocator, |
544 | | ForkNumber forkNum, bool permanent); |
545 | | static void AtProcExit_Buffers(int code, Datum arg); |
546 | | static void CheckForBufferLeaks(void); |
547 | | #ifdef USE_ASSERT_CHECKING |
548 | | static void AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode, |
549 | | void *unused_context); |
550 | | #endif |
551 | | static int rlocator_comparator(const void *p1, const void *p2); |
552 | | static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb); |
553 | | static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b); |
554 | | static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg); |
555 | | |
556 | | |
557 | | /* |
558 | | * Implementation of PrefetchBuffer() for shared buffers. |
559 | | */ |
560 | | PrefetchBufferResult |
561 | | PrefetchSharedBuffer(SMgrRelation smgr_reln, |
562 | | ForkNumber forkNum, |
563 | | BlockNumber blockNum) |
564 | 0 | { |
565 | 0 | PrefetchBufferResult result = {InvalidBuffer, false}; |
566 | 0 | BufferTag newTag; /* identity of requested block */ |
567 | 0 | uint32 newHash; /* hash value for newTag */ |
568 | 0 | LWLock *newPartitionLock; /* buffer partition lock for it */ |
569 | 0 | int buf_id; |
570 | |
|
571 | 0 | Assert(BlockNumberIsValid(blockNum)); |
572 | | |
573 | | /* create a tag so we can lookup the buffer */ |
574 | 0 | InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator, |
575 | 0 | forkNum, blockNum); |
576 | | |
577 | | /* determine its hash code and partition lock ID */ |
578 | 0 | newHash = BufTableHashCode(&newTag); |
579 | 0 | newPartitionLock = BufMappingPartitionLock(newHash); |
580 | | |
581 | | /* see if the block is in the buffer pool already */ |
582 | 0 | LWLockAcquire(newPartitionLock, LW_SHARED); |
583 | 0 | buf_id = BufTableLookup(&newTag, newHash); |
584 | 0 | LWLockRelease(newPartitionLock); |
585 | | |
586 | | /* If not in buffers, initiate prefetch */ |
587 | 0 | if (buf_id < 0) |
588 | 0 | { |
589 | 0 | #ifdef USE_PREFETCH |
590 | | /* |
591 | | * Try to initiate an asynchronous read. This returns false in |
592 | | * recovery if the relation file doesn't exist. |
593 | | */ |
594 | 0 | if ((io_direct_flags & IO_DIRECT_DATA) == 0 && |
595 | 0 | smgrprefetch(smgr_reln, forkNum, blockNum, 1)) |
596 | 0 | { |
597 | 0 | result.initiated_io = true; |
598 | 0 | } |
599 | 0 | #endif /* USE_PREFETCH */ |
600 | 0 | } |
601 | 0 | else |
602 | 0 | { |
603 | | /* |
604 | | * Report the buffer it was in at that time. The caller may be able |
605 | | * to avoid a buffer table lookup, but it's not pinned and it must be |
606 | | * rechecked! |
607 | | */ |
608 | 0 | result.recent_buffer = buf_id + 1; |
609 | 0 | } |
610 | | |
611 | | /* |
612 | | * If the block *is* in buffers, we do nothing. This is not really ideal: |
613 | | * the block might be just about to be evicted, which would be stupid |
614 | | * since we know we are going to need it soon. But the only easy answer |
615 | | * is to bump the usage_count, which does not seem like a great solution: |
616 | | * when the caller does ultimately touch the block, usage_count would get |
617 | | * bumped again, resulting in too much favoritism for blocks that are |
618 | | * involved in a prefetch sequence. A real fix would involve some |
619 | | * additional per-buffer state, and it's not clear that there's enough of |
620 | | * a problem to justify that. |
621 | | */ |
622 | |
|
623 | 0 | return result; |
624 | 0 | } |
625 | | |
626 | | /* |
627 | | * PrefetchBuffer -- initiate asynchronous read of a block of a relation |
628 | | * |
629 | | * This is named by analogy to ReadBuffer but doesn't actually allocate a |
630 | | * buffer. Instead it tries to ensure that a future ReadBuffer for the given |
631 | | * block will not be delayed by the I/O. Prefetching is optional. |
632 | | * |
633 | | * There are three possible outcomes: |
634 | | * |
635 | | * 1. If the block is already cached, the result includes a valid buffer that |
636 | | * could be used by the caller to avoid the need for a later buffer lookup, but |
637 | | * it's not pinned, so the caller must recheck it. |
638 | | * |
639 | | * 2. If the kernel has been asked to initiate I/O, the initiated_io member is |
640 | | * true. Currently there is no way to know if the data was already cached by |
641 | | * the kernel and therefore didn't really initiate I/O, and no way to know when |
642 | | * the I/O completes other than using synchronous ReadBuffer(). |
643 | | * |
644 | | * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either |
645 | | * USE_PREFETCH is not defined (this build doesn't support prefetching due to |
646 | | * lack of a kernel facility), direct I/O is enabled, or the underlying |
647 | | * relation file wasn't found and we are in recovery. (If the relation file |
648 | | * wasn't found and we are not in recovery, an error is raised). |
649 | | */ |
650 | | PrefetchBufferResult |
651 | | PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum) |
652 | 0 | { |
653 | 0 | Assert(RelationIsValid(reln)); |
654 | 0 | Assert(BlockNumberIsValid(blockNum)); |
655 | |
|
656 | 0 | if (RelationUsesLocalBuffers(reln)) |
657 | 0 | { |
658 | | /* see comments in ReadBufferExtended */ |
659 | 0 | if (RELATION_IS_OTHER_TEMP(reln)) |
660 | 0 | ereport(ERROR, |
661 | 0 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
662 | 0 | errmsg("cannot access temporary tables of other sessions"))); |
663 | | |
664 | | /* pass it off to localbuf.c */ |
665 | 0 | return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum); |
666 | 0 | } |
667 | 0 | else |
668 | 0 | { |
669 | | /* pass it to the shared buffer version */ |
670 | 0 | return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum); |
671 | 0 | } |
672 | 0 | } |
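/*
 * Illustrative sketch of acting on the three outcomes documented above
 * (hypothetical caller).  A block reported as cached can later be re-found
 * cheaply via ReadRecentBuffer(); everything else falls back to a plain
 * ReadBuffer().
 */
#ifdef NOT_USED
static Buffer
example_prefetch_then_read(Relation rel, BlockNumber blkno)
{
	PrefetchBufferResult pf = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);
	Buffer		buf;

	/* ... do other useful work while any prefetch I/O proceeds ... */

	if (BufferIsValid(pf.recent_buffer) &&
		ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno,
						 pf.recent_buffer))
		buf = pf.recent_buffer;			/* outcome 1: still cached, now pinned */
	else
		buf = ReadBuffer(rel, blkno);	/* outcomes 2 and 3: normal read */

	return buf;
}
#endif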
673 | | |
674 | | /* |
675 | | * ReadRecentBuffer -- try to pin a block in a recently observed buffer |
676 | | * |
677 | | * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's |
678 | | * successful. Return true if the buffer is valid and still has the expected |
679 | | * tag. In that case, the buffer is pinned and the usage count is bumped. |
680 | | */ |
681 | | bool |
682 | | ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, |
683 | | Buffer recent_buffer) |
684 | 0 | { |
685 | 0 | BufferDesc *bufHdr; |
686 | 0 | BufferTag tag; |
687 | 0 | uint32 buf_state; |
688 | 0 | bool have_private_ref; |
689 | |
|
690 | 0 | Assert(BufferIsValid(recent_buffer)); |
691 | |
|
692 | 0 | ResourceOwnerEnlarge(CurrentResourceOwner); |
693 | 0 | ReservePrivateRefCountEntry(); |
694 | 0 | InitBufferTag(&tag, &rlocator, forkNum, blockNum); |
695 | |
|
696 | 0 | if (BufferIsLocal(recent_buffer)) |
697 | 0 | { |
698 | 0 | int b = -recent_buffer - 1; |
699 | |
|
700 | 0 | bufHdr = GetLocalBufferDescriptor(b); |
701 | 0 | buf_state = pg_atomic_read_u32(&bufHdr->state); |
702 | | |
703 | | /* Is it still valid and holding the right tag? */ |
704 | 0 | if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag)) |
705 | 0 | { |
706 | 0 | PinLocalBuffer(bufHdr, true); |
707 | |
|
708 | 0 | pgBufferUsage.local_blks_hit++; |
709 | |
|
710 | 0 | return true; |
711 | 0 | } |
712 | 0 | } |
713 | 0 | else |
714 | 0 | { |
715 | 0 | bufHdr = GetBufferDescriptor(recent_buffer - 1); |
716 | 0 | have_private_ref = GetPrivateRefCount(recent_buffer) > 0; |
717 | | |
718 | | /* |
719 | | * Do we already have this buffer pinned with a private reference? If |
720 | | * so, it must be valid and it is safe to check the tag without |
721 | | * locking. If not, we have to lock the header first and then check. |
722 | | */ |
723 | 0 | if (have_private_ref) |
724 | 0 | buf_state = pg_atomic_read_u32(&bufHdr->state); |
725 | 0 | else |
726 | 0 | buf_state = LockBufHdr(bufHdr); |
727 | |
|
728 | 0 | if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag)) |
729 | 0 | { |
730 | | /* |
731 | | * It's now safe to pin the buffer. We can't pin first and ask |
732 | | * questions later, because it might confuse code paths like |
733 | | * InvalidateBuffer() if we pinned a random non-matching buffer. |
734 | | */ |
735 | 0 | if (have_private_ref) |
736 | 0 | PinBuffer(bufHdr, NULL); /* bump pin count */ |
737 | 0 | else |
738 | 0 | PinBuffer_Locked(bufHdr); /* pin for first time */ |
739 | |
|
740 | 0 | pgBufferUsage.shared_blks_hit++; |
741 | |
|
742 | 0 | return true; |
743 | 0 | } |
744 | | |
745 | | /* If we locked the header above, now unlock. */ |
746 | 0 | if (!have_private_ref) |
747 | 0 | UnlockBufHdr(bufHdr, buf_state); |
748 | 0 | } |
749 | | |
750 | 0 | return false; |
751 | 0 | } |
752 | | |
753 | | /* |
754 | | * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main |
755 | | * fork with RBM_NORMAL mode and default strategy. |
756 | | */ |
757 | | Buffer |
758 | | ReadBuffer(Relation reln, BlockNumber blockNum) |
759 | 0 | { |
760 | 0 | return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL); |
761 | 0 | } |
762 | | |
763 | | /* |
764 | | * ReadBufferExtended -- returns a buffer containing the requested |
765 | | * block of the requested relation. If the blknum |
766 | | * requested is P_NEW, extend the relation file and |
767 | | * allocate a new block. (Caller is responsible for |
768 | | * ensuring that only one backend tries to extend a |
769 | | * relation at the same time!) |
770 | | * |
771 | | * Returns: the buffer number for the buffer containing |
772 | | * the block read. The returned buffer has been pinned. |
773 | | * Does not return on error --- elog's instead. |
774 | | * |
775 | | * Assume when this function is called, that reln has been opened already. |
776 | | * |
777 | | * In RBM_NORMAL mode, the page is read from disk, and the page header is |
778 | | * validated. An error is thrown if the page header is not valid. (But |
779 | | * note that an all-zero page is considered "valid"; see |
780 | | * PageIsVerified().) |
781 | | * |
782 | | * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not |
783 | | * valid, the page is zeroed instead of throwing an error. This is intended |
784 | | * for non-critical data, where the caller is prepared to repair errors. |
785 | | * |
786 | | * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's |
787 | | * filled with zeros instead of reading it from disk. Useful when the caller |
788 | | * is going to fill the page from scratch, since this saves I/O and avoids |
789 | | * unnecessary failure if the page-on-disk has corrupt page headers. |
790 | | * The page is returned locked to ensure that the caller has a chance to |
791 | | * initialize the page before it's made visible to others. |
792 | | * Caution: do not use this mode to read a page that is beyond the relation's |
793 | | * current physical EOF; that is likely to cause problems in md.c when |
794 | | * the page is modified and written out. P_NEW is OK, though. |
795 | | * |
796 | | * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires |
797 | | * a cleanup-strength lock on the page. |
798 | | * |
799 | | * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here. |
800 | | * |
801 | | * If strategy is not NULL, a nondefault buffer access strategy is used. |
802 | | * See buffer/README for details. |
803 | | */ |
804 | | inline Buffer |
805 | | ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, |
806 | | ReadBufferMode mode, BufferAccessStrategy strategy) |
807 | 0 | { |
808 | 0 | Buffer buf; |
809 | | |
810 | | /* |
811 | | * Reject attempts to read non-local temporary relations; we would be |
812 | | * likely to get wrong data since we have no visibility into the owning |
813 | | * session's local buffers. |
814 | | */ |
815 | 0 | if (RELATION_IS_OTHER_TEMP(reln)) |
816 | 0 | ereport(ERROR, |
817 | 0 | (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), |
818 | 0 | errmsg("cannot access temporary tables of other sessions"))); |
819 | | |
820 | | /* |
821 | | * Read the buffer, and update pgstat counters to reflect a cache hit or |
822 | | * miss. |
823 | | */ |
824 | 0 | buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0, |
825 | 0 | forkNum, blockNum, mode, strategy); |
826 | |
|
827 | 0 | return buf; |
828 | 0 | } |
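/*
 * Illustrative sketch of two of the modes documented above (hypothetical
 * helper, WAL logging omitted).  RBM_ZERO_ON_ERROR tolerates a corrupt page
 * header by handing back a zeroed page; RBM_ZERO_AND_LOCK avoids reading a
 * page the caller intends to rebuild from scratch.
 */
#ifdef NOT_USED
static void
example_read_modes(Relation rel, BlockNumber damaged_blkno, BlockNumber rebuild_blkno)
{
	Buffer		buf;

	/* Non-critical data: get a zeroed page instead of an error on a bad header. */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, damaged_blkno,
							 RBM_ZERO_ON_ERROR, NULL);
	ReleaseBuffer(buf);

	/*
	 * Page will be rebuilt from scratch, so don't read it from disk; per
	 * RBM_ZERO_AND_LOCK it comes back exclusively locked, and zero-filled
	 * unless it was already cached.
	 */
	buf = ReadBufferExtended(rel, MAIN_FORKNUM, rebuild_blkno,
							 RBM_ZERO_AND_LOCK, NULL);
	PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);
	MarkBufferDirty(buf);
	UnlockReleaseBuffer(buf);
}
#endif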
829 | | |
830 | | |
831 | | /* |
832 | | * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require |
833 | | * a relcache entry for the relation. |
834 | | * |
835 | | * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and |
836 | | * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function |
837 | | * cannot be used for temporary relations (and making that work might be |
838 | | * difficult, unless we only want to read temporary relations for our own |
839 | | * ProcNumber). |
840 | | */ |
841 | | Buffer |
842 | | ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, |
843 | | BlockNumber blockNum, ReadBufferMode mode, |
844 | | BufferAccessStrategy strategy, bool permanent) |
845 | 0 | { |
846 | 0 | SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER); |
847 | |
|
848 | 0 | return ReadBuffer_common(NULL, smgr, |
849 | 0 | permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED, |
850 | 0 | forkNum, blockNum, |
851 | 0 | mode, strategy); |
852 | 0 | } |
853 | | |
854 | | /* |
855 | | * Convenience wrapper around ExtendBufferedRelBy() extending by one block. |
856 | | */ |
857 | | Buffer |
858 | | ExtendBufferedRel(BufferManagerRelation bmr, |
859 | | ForkNumber forkNum, |
860 | | BufferAccessStrategy strategy, |
861 | | uint32 flags) |
862 | 0 | { |
863 | 0 | Buffer buf; |
864 | 0 | uint32 extend_by = 1; |
865 | |
|
866 | 0 | ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by, |
867 | 0 | &buf, &extend_by); |
868 | |
|
869 | 0 | return buf; |
870 | 0 | } |
871 | | |
872 | | /* |
873 | | * Extend relation by multiple blocks. |
874 | | * |
875 | | * Tries to extend the relation by extend_by blocks. Depending on the |
876 | | * availability of resources the relation may end up being extended by a |
877 | | * smaller number of pages (unless an error is thrown, always by at least one |
878 | | * page). *extended_by is updated to the number of pages the relation has been |
879 | | * extended by. |
880 | | * |
881 | | * buffers needs to be an array that is at least extend_by long. Upon |
882 | | * completion, the first extend_by array elements will point to a pinned |
883 | | * buffer. |
884 | | * |
885 | | * If EB_LOCK_FIRST is part of flags, the first returned buffer is |
886 | | * locked. This is useful for callers that want a buffer that is guaranteed to |
887 | | * be empty. |
888 | | */ |
889 | | BlockNumber |
890 | | ExtendBufferedRelBy(BufferManagerRelation bmr, |
891 | | ForkNumber fork, |
892 | | BufferAccessStrategy strategy, |
893 | | uint32 flags, |
894 | | uint32 extend_by, |
895 | | Buffer *buffers, |
896 | | uint32 *extended_by) |
897 | 0 | { |
898 | 0 | Assert((bmr.rel != NULL) != (bmr.smgr != NULL)); |
899 | 0 | Assert(bmr.smgr == NULL || bmr.relpersistence != 0); |
900 | 0 | Assert(extend_by > 0); |
901 | |
|
902 | 0 | if (bmr.smgr == NULL) |
903 | 0 | { |
904 | 0 | bmr.smgr = RelationGetSmgr(bmr.rel); |
905 | 0 | bmr.relpersistence = bmr.rel->rd_rel->relpersistence; |
906 | 0 | } |
907 | |
|
908 | 0 | return ExtendBufferedRelCommon(bmr, fork, strategy, flags, |
909 | 0 | extend_by, InvalidBlockNumber, |
910 | 0 | buffers, extended_by); |
911 | 0 | } |
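/*
 * Illustrative sketch of bulk extension as described above (hypothetical
 * helper).  The relation is asked to grow by up to four blocks; only the
 * first buffer is kept (EB_LOCK_FIRST returns it exclusively locked), and
 * the rest are released again.
 */
#ifdef NOT_USED
static Buffer
example_extend_by_four(Relation rel)
{
	Buffer		buffers[4];
	uint32		extended_by = 0;
	BlockNumber first_block;

	first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
									  NULL /* default strategy */ ,
									  EB_LOCK_FIRST,
									  lengthof(buffers),
									  buffers, &extended_by);

	/* buffers[0] is pinned and locked; it holds block number first_block */
	(void) first_block;
	for (uint32 i = 1; i < extended_by; i++)
		ReleaseBuffer(buffers[i]);

	return buffers[0];
}
#endif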
912 | | |
913 | | /* |
914 | | * Extend the relation so it is at least extend_to blocks large, return buffer |
915 | | * (extend_to - 1). |
916 | | * |
917 | | * This is useful for callers that want to write a specific page, regardless |
918 | | * of the current size of the relation (e.g. useful for visibilitymap and for |
919 | | * crash recovery). |
920 | | */ |
921 | | Buffer |
922 | | ExtendBufferedRelTo(BufferManagerRelation bmr, |
923 | | ForkNumber fork, |
924 | | BufferAccessStrategy strategy, |
925 | | uint32 flags, |
926 | | BlockNumber extend_to, |
927 | | ReadBufferMode mode) |
928 | 0 | { |
929 | 0 | BlockNumber current_size; |
930 | 0 | uint32 extended_by = 0; |
931 | 0 | Buffer buffer = InvalidBuffer; |
932 | 0 | Buffer buffers[64]; |
933 | |
|
934 | 0 | Assert((bmr.rel != NULL) != (bmr.smgr != NULL)); |
935 | 0 | Assert(bmr.smgr == NULL || bmr.relpersistence != 0); |
936 | 0 | Assert(extend_to != InvalidBlockNumber && extend_to > 0); |
937 | |
|
938 | 0 | if (bmr.smgr == NULL) |
939 | 0 | { |
940 | 0 | bmr.smgr = RelationGetSmgr(bmr.rel); |
941 | 0 | bmr.relpersistence = bmr.rel->rd_rel->relpersistence; |
942 | 0 | } |
943 | | |
944 | | /* |
945 | | * If desired, create the file if it doesn't exist. If |
946 | | * smgr_cached_nblocks[fork] is positive then it must exist; no need for |
947 | | * an smgrexists call. |
948 | | */ |
949 | 0 | if ((flags & EB_CREATE_FORK_IF_NEEDED) && |
950 | 0 | (bmr.smgr->smgr_cached_nblocks[fork] == 0 || |
951 | 0 | bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) && |
952 | 0 | !smgrexists(bmr.smgr, fork)) |
953 | 0 | { |
954 | 0 | LockRelationForExtension(bmr.rel, ExclusiveLock); |
955 | | |
956 | | /* recheck, fork might have been created concurrently */ |
957 | 0 | if (!smgrexists(bmr.smgr, fork)) |
958 | 0 | smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY); |
959 | |
|
960 | 0 | UnlockRelationForExtension(bmr.rel, ExclusiveLock); |
961 | 0 | } |
962 | | |
963 | | /* |
964 | | * If requested, invalidate size cache, so that smgrnblocks asks the |
965 | | * kernel. |
966 | | */ |
967 | 0 | if (flags & EB_CLEAR_SIZE_CACHE) |
968 | 0 | bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber; |
969 | | |
970 | | /* |
971 | | * Estimate how many pages we'll need to extend by. This avoids acquiring |
972 | | * unnecessarily many victim buffers. |
973 | | */ |
974 | 0 | current_size = smgrnblocks(bmr.smgr, fork); |
975 | | |
976 | | /* |
977 | | * Since no-one else can be looking at the page contents yet, there is no |
978 | | * difference between an exclusive lock and a cleanup-strength lock. Note |
979 | | * that we pass the original mode to ReadBuffer_common() below, when |
980 | | * falling back to reading the buffer due to a concurrent relation extension. |
981 | | */ |
982 | 0 | if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) |
983 | 0 | flags |= EB_LOCK_TARGET; |
984 | |
|
985 | 0 | while (current_size < extend_to) |
986 | 0 | { |
987 | 0 | uint32 num_pages = lengthof(buffers); |
988 | 0 | BlockNumber first_block; |
989 | |
|
990 | 0 | if ((uint64) current_size + num_pages > extend_to) |
991 | 0 | num_pages = extend_to - current_size; |
992 | |
|
993 | 0 | first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags, |
994 | 0 | num_pages, extend_to, |
995 | 0 | buffers, &extended_by); |
996 | |
|
997 | 0 | current_size = first_block + extended_by; |
998 | 0 | Assert(num_pages != 0 || current_size >= extend_to); |
999 | |
|
1000 | 0 | for (uint32 i = 0; i < extended_by; i++) |
1001 | 0 | { |
1002 | 0 | if (first_block + i != extend_to - 1) |
1003 | 0 | ReleaseBuffer(buffers[i]); |
1004 | 0 | else |
1005 | 0 | buffer = buffers[i]; |
1006 | 0 | } |
1007 | 0 | } |
1008 | | |
1009 | | /* |
1010 | | * It's possible that another backend concurrently extended the relation. |
1011 | | * In that case read the buffer. |
1012 | | * |
1013 | | * XXX: Should we control this via a flag? |
1014 | | */ |
1015 | 0 | if (buffer == InvalidBuffer) |
1016 | 0 | { |
1017 | 0 | Assert(extended_by == 0); |
1018 | 0 | buffer = ReadBuffer_common(bmr.rel, bmr.smgr, bmr.relpersistence, |
1019 | 0 | fork, extend_to - 1, mode, strategy); |
1020 | 0 | } |
1021 | |
|
1022 | 0 | return buffer; |
1023 | 0 | } |
1024 | | |
1025 | | /* |
1026 | | * Lock and optionally zero a buffer, as part of the implementation of |
1027 | | * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. The buffer must be already |
1028 | | * pinned. If the buffer is not already valid, it is zeroed and made valid. |
1029 | | */ |
1030 | | static void |
1031 | | ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid) |
1032 | 0 | { |
1033 | 0 | BufferDesc *bufHdr; |
1034 | 0 | bool need_to_zero; |
1035 | 0 | bool isLocalBuf = BufferIsLocal(buffer); |
1036 | |
|
1037 | 0 | Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK); |
1038 | |
|
1039 | 0 | if (already_valid) |
1040 | 0 | { |
1041 | | /* |
1042 | | * If the caller already knew the buffer was valid, we can skip some |
1043 | | * header interaction. The caller just wants to lock the buffer. |
1044 | | */ |
1045 | 0 | need_to_zero = false; |
1046 | 0 | } |
1047 | 0 | else if (isLocalBuf) |
1048 | 0 | { |
1049 | | /* Simple case for non-shared buffers. */ |
1050 | 0 | bufHdr = GetLocalBufferDescriptor(-buffer - 1); |
1051 | 0 | need_to_zero = StartLocalBufferIO(bufHdr, true, false); |
1052 | 0 | } |
1053 | 0 | else |
1054 | 0 | { |
1055 | | /* |
1056 | | * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set |
1057 | | * concurrently. Even though we aren't doing I/O, that ensures that |
1058 | | * we don't zero a page that someone else has pinned. An exclusive |
1059 | | * content lock wouldn't be enough, because readers are allowed to |
1060 | | * drop the content lock after determining that a tuple is visible |
1061 | | * (see buffer access rules in README). |
1062 | | */ |
1063 | 0 | bufHdr = GetBufferDescriptor(buffer - 1); |
1064 | 0 | need_to_zero = StartBufferIO(bufHdr, true, false); |
1065 | 0 | } |
1066 | |
|
1067 | 0 | if (need_to_zero) |
1068 | 0 | { |
1069 | 0 | memset(BufferGetPage(buffer), 0, BLCKSZ); |
1070 | | |
1071 | | /* |
1072 | | * Grab the buffer content lock before marking the page as valid, to |
1073 | | * make sure that no other backend sees the zeroed page before the |
1074 | | * caller has had a chance to initialize it. |
1075 | | * |
1076 | | * Since no-one else can be looking at the page contents yet, there is |
1077 | | * no difference between an exclusive lock and a cleanup-strength |
1078 | | * lock. (Note that we cannot use LockBuffer() or |
1079 | | * LockBufferForCleanup() here, because they assert that the buffer is |
1080 | | * already valid.) |
1081 | | */ |
1082 | 0 | if (!isLocalBuf) |
1083 | 0 | LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE); |
1084 | | |
1085 | | /* Set BM_VALID, terminate IO, and wake up any waiters */ |
1086 | 0 | if (isLocalBuf) |
1087 | 0 | TerminateLocalBufferIO(bufHdr, false, BM_VALID, false); |
1088 | 0 | else |
1089 | 0 | TerminateBufferIO(bufHdr, false, BM_VALID, true, false); |
1090 | 0 | } |
1091 | 0 | else if (!isLocalBuf) |
1092 | 0 | { |
1093 | | /* |
1094 | | * The buffer is valid, so we can't zero it. The caller still expects |
1095 | | * the page to be locked on return. |
1096 | | */ |
1097 | 0 | if (mode == RBM_ZERO_AND_LOCK) |
1098 | 0 | LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
1099 | 0 | else |
1100 | 0 | LockBufferForCleanup(buffer); |
1101 | 0 | } |
1102 | 0 | } |
1103 | | |
1104 | | /* |
1105 | | * Pin a buffer for a given block. *foundPtr is set to true if the block was |
1106 | | * already present, or false if more work is required to either read it in or |
1107 | | * zero it. |
1108 | | */ |
1109 | | static pg_attribute_always_inline Buffer |
1110 | | PinBufferForBlock(Relation rel, |
1111 | | SMgrRelation smgr, |
1112 | | char persistence, |
1113 | | ForkNumber forkNum, |
1114 | | BlockNumber blockNum, |
1115 | | BufferAccessStrategy strategy, |
1116 | | bool *foundPtr) |
1117 | 0 | { |
1118 | 0 | BufferDesc *bufHdr; |
1119 | 0 | IOContext io_context; |
1120 | 0 | IOObject io_object; |
1121 | |
|
1122 | 0 | Assert(blockNum != P_NEW); |
1123 | | |
1124 | | /* Persistence should be set before */ |
1125 | 0 | Assert((persistence == RELPERSISTENCE_TEMP || |
1126 | 0 | persistence == RELPERSISTENCE_PERMANENT || |
1127 | 0 | persistence == RELPERSISTENCE_UNLOGGED)); |
1128 | |
|
1129 | 0 | if (persistence == RELPERSISTENCE_TEMP) |
1130 | 0 | { |
1131 | 0 | io_context = IOCONTEXT_NORMAL; |
1132 | 0 | io_object = IOOBJECT_TEMP_RELATION; |
1133 | 0 | } |
1134 | 0 | else |
1135 | 0 | { |
1136 | 0 | io_context = IOContextForStrategy(strategy); |
1137 | 0 | io_object = IOOBJECT_RELATION; |
1138 | 0 | } |
1139 | |
|
1140 | 0 | TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum, |
1141 | 0 | smgr->smgr_rlocator.locator.spcOid, |
1142 | 0 | smgr->smgr_rlocator.locator.dbOid, |
1143 | 0 | smgr->smgr_rlocator.locator.relNumber, |
1144 | 0 | smgr->smgr_rlocator.backend); |
1145 | |
|
1146 | 0 | if (persistence == RELPERSISTENCE_TEMP) |
1147 | 0 | { |
1148 | 0 | bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr); |
1149 | 0 | if (*foundPtr) |
1150 | 0 | pgBufferUsage.local_blks_hit++; |
1151 | 0 | } |
1152 | 0 | else |
1153 | 0 | { |
1154 | 0 | bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum, |
1155 | 0 | strategy, foundPtr, io_context); |
1156 | 0 | if (*foundPtr) |
1157 | 0 | pgBufferUsage.shared_blks_hit++; |
1158 | 0 | } |
1159 | 0 | if (rel) |
1160 | 0 | { |
1161 | | /* |
1162 | | * While pgBufferUsage's "read" counter isn't bumped unless we reach |
1163 | | * WaitReadBuffers() (so, not for hits, and not for buffers that are |
1164 | | * zeroed instead), the per-relation stats always count them. |
1165 | | */ |
1166 | 0 | pgstat_count_buffer_read(rel); |
1167 | 0 | if (*foundPtr) |
1168 | 0 | pgstat_count_buffer_hit(rel); |
1169 | 0 | } |
1170 | 0 | if (*foundPtr) |
1171 | 0 | { |
1172 | 0 | pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0); |
1173 | 0 | if (VacuumCostActive) |
1174 | 0 | VacuumCostBalance += VacuumCostPageHit; |
1175 | |
|
1176 | 0 | TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum, |
1177 | 0 | smgr->smgr_rlocator.locator.spcOid, |
1178 | 0 | smgr->smgr_rlocator.locator.dbOid, |
1179 | 0 | smgr->smgr_rlocator.locator.relNumber, |
1180 | 0 | smgr->smgr_rlocator.backend, |
1181 | 0 | true); |
1182 | 0 | } |
1183 | |
|
1184 | 0 | return BufferDescriptorGetBuffer(bufHdr); |
1185 | 0 | } |
1186 | | |
1187 | | /* |
1188 | | * ReadBuffer_common -- common logic for all ReadBuffer variants |
1189 | | * |
1190 | | * smgr is required, rel is optional unless using P_NEW. |
1191 | | */ |
1192 | | static pg_attribute_always_inline Buffer |
1193 | | ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, |
1194 | | ForkNumber forkNum, |
1195 | | BlockNumber blockNum, ReadBufferMode mode, |
1196 | | BufferAccessStrategy strategy) |
1197 | 0 | { |
1198 | 0 | ReadBuffersOperation operation; |
1199 | 0 | Buffer buffer; |
1200 | 0 | int flags; |
1201 | 0 | char persistence; |
1202 | | |
1203 | | /* |
1204 | | * Backward compatibility path, most code should use ExtendBufferedRel() |
1205 | | * instead, as acquiring the extension lock inside ExtendBufferedRel() |
1206 | | * scales a lot better. |
1207 | | */ |
1208 | 0 | if (unlikely(blockNum == P_NEW)) |
1209 | 0 | { |
1210 | 0 | uint32 flags = EB_SKIP_EXTENSION_LOCK; |
1211 | | |
1212 | | /* |
1213 | | * Since no-one else can be looking at the page contents yet, there is |
1214 | | * no difference between an exclusive lock and a cleanup-strength |
1215 | | * lock. |
1216 | | */ |
1217 | 0 | if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) |
1218 | 0 | flags |= EB_LOCK_FIRST; |
1219 | |
|
1220 | 0 | return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags); |
1221 | 0 | } |
1222 | | |
1223 | 0 | if (rel) |
1224 | 0 | persistence = rel->rd_rel->relpersistence; |
1225 | 0 | else |
1226 | 0 | persistence = smgr_persistence; |
1227 | |
|
1228 | 0 | if (unlikely(mode == RBM_ZERO_AND_CLEANUP_LOCK || |
1229 | 0 | mode == RBM_ZERO_AND_LOCK)) |
1230 | 0 | { |
1231 | 0 | bool found; |
1232 | |
|
1233 | 0 | buffer = PinBufferForBlock(rel, smgr, persistence, |
1234 | 0 | forkNum, blockNum, strategy, &found); |
1235 | 0 | ZeroAndLockBuffer(buffer, mode, found); |
1236 | 0 | return buffer; |
1237 | 0 | } |
1238 | | |
1239 | | /* |
1240 | | * Signal that we are going to immediately wait. If we're immediately |
1241 | | * waiting, there is no benefit in actually executing the IO |
1242 | | * asynchronously; it would just add dispatch overhead. |
1243 | | */ |
1244 | 0 | flags = READ_BUFFERS_SYNCHRONOUSLY; |
1245 | 0 | if (mode == RBM_ZERO_ON_ERROR) |
1246 | 0 | flags |= READ_BUFFERS_ZERO_ON_ERROR; |
1247 | 0 | operation.smgr = smgr; |
1248 | 0 | operation.rel = rel; |
1249 | 0 | operation.persistence = persistence; |
1250 | 0 | operation.forknum = forkNum; |
1251 | 0 | operation.strategy = strategy; |
1252 | 0 | if (StartReadBuffer(&operation, |
1253 | 0 | &buffer, |
1254 | 0 | blockNum, |
1255 | 0 | flags)) |
1256 | 0 | WaitReadBuffers(&operation); |
1257 | |
|
1258 | 0 | return buffer; |
1259 | 0 | } |
1260 | | |
1261 | | static pg_attribute_always_inline bool |
1262 | | StartReadBuffersImpl(ReadBuffersOperation *operation, |
1263 | | Buffer *buffers, |
1264 | | BlockNumber blockNum, |
1265 | | int *nblocks, |
1266 | | int flags, |
1267 | | bool allow_forwarding) |
1268 | 0 | { |
1269 | 0 | int actual_nblocks = *nblocks; |
1270 | 0 | int maxcombine = 0; |
1271 | 0 | bool did_start_io; |
1272 | |
|
1273 | 0 | Assert(*nblocks == 1 || allow_forwarding); |
1274 | 0 | Assert(*nblocks > 0); |
1275 | 0 | Assert(*nblocks <= MAX_IO_COMBINE_LIMIT); |
1276 | |
|
1277 | 0 | for (int i = 0; i < actual_nblocks; ++i) |
1278 | 0 | { |
1279 | 0 | bool found; |
1280 | |
|
1281 | 0 | if (allow_forwarding && buffers[i] != InvalidBuffer) |
1282 | 0 | { |
1283 | 0 | BufferDesc *bufHdr; |
1284 | | |
1285 | | /* |
1286 | | * This is a buffer that was pinned by an earlier call to |
1287 | | * StartReadBuffers(), but couldn't be handled in one operation at |
1288 | | * that time. The operation was split, and the caller has passed |
1289 | | * an already pinned buffer back to us to handle the rest of the |
1290 | | * operation. It must continue at the expected block number. |
1291 | | */ |
1292 | 0 | Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i); |
1293 | | |
1294 | | /* |
1295 | | * It might be an already valid buffer (a hit) that followed the |
1296 | | * final contiguous block of an earlier I/O (a miss) marking the |
1297 | | * end of it, or a buffer that some other backend has since made |
1298 | | * valid by performing the I/O for us, in which case we can handle |
1299 | | * it as a hit now. It is safe to check for a BM_VALID flag with |
1300 | | * a relaxed load, because we got a fresh view of it while pinning |
1301 | | * it in the previous call. |
1302 | | * |
1303 | | * On the other hand if we don't see BM_VALID yet, it must be an |
1304 | | * I/O that was split by the previous call and we need to try to |
1305 | | * start a new I/O from this block. We're also racing against any |
1306 | | * other backend that might start the I/O or even manage to mark |
1307 | | * it BM_VALID after this check, but StartBufferIO() will handle |
1308 | | * those cases. |
1309 | | */ |
1310 | 0 | if (BufferIsLocal(buffers[i])) |
1311 | 0 | bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1); |
1312 | 0 | else |
1313 | 0 | bufHdr = GetBufferDescriptor(buffers[i] - 1); |
1314 | 0 | Assert(pg_atomic_read_u32(&bufHdr->state) & BM_TAG_VALID); |
1315 | 0 | found = pg_atomic_read_u32(&bufHdr->state) & BM_VALID; |
1316 | 0 | } |
1317 | 0 | else |
1318 | 0 | { |
1319 | 0 | buffers[i] = PinBufferForBlock(operation->rel, |
1320 | 0 | operation->smgr, |
1321 | 0 | operation->persistence, |
1322 | 0 | operation->forknum, |
1323 | 0 | blockNum + i, |
1324 | 0 | operation->strategy, |
1325 | 0 | &found); |
1326 | 0 | } |
1327 | |
|
1328 | 0 | if (found) |
1329 | 0 | { |
1330 | | /* |
1331 | | * We have a hit. If it's the first block in the requested range, |
1332 | | * we can return it immediately and report that WaitReadBuffers() |
1333 | | * does not need to be called. If the initial value of *nblocks |
1334 | | * was larger, the caller will have to call again for the rest. |
1335 | | */ |
1336 | 0 | if (i == 0) |
1337 | 0 | { |
1338 | 0 | *nblocks = 1; |
1339 | |
|
1340 | | #ifdef USE_ASSERT_CHECKING |
1341 | | |
1342 | | /* |
1343 | | * Initialize enough of ReadBuffersOperation to make |
1344 | | * CheckReadBuffersOperation() work. Outside of assertions |
1345 | | * that's not necessary when no IO is issued. |
1346 | | */ |
1347 | | operation->buffers = buffers; |
1348 | | operation->blocknum = blockNum; |
1349 | | operation->nblocks = 1; |
1350 | | operation->nblocks_done = 1; |
1351 | | CheckReadBuffersOperation(operation, true); |
1352 | | #endif |
1353 | 0 | return false; |
1354 | 0 | } |
1355 | | |
1356 | | /* |
1357 | | * Otherwise we already have an I/O to perform, but this block |
1358 | | * can't be included as it is already valid. Split the I/O here. |
1359 | | * There may or may not be more blocks requiring I/O after this |
1360 | | * one, we haven't checked, but they can't be contiguous with this |
1361 | | * one in the way. We'll leave this buffer pinned, forwarding it |
1362 | | * to the next call, avoiding the need to unpin it here and re-pin |
1363 | | * it in the next call. |
1364 | | */ |
1365 | 0 | actual_nblocks = i; |
1366 | 0 | break; |
1367 | 0 | } |
1368 | 0 | else |
1369 | 0 | { |
1370 | | /* |
1371 | | * Check how many blocks we can cover with the same IO. The smgr |
1372 | | * implementation might e.g. be limited due to a segment boundary. |
1373 | | */ |
1374 | 0 | if (i == 0 && actual_nblocks > 1) |
1375 | 0 | { |
1376 | 0 | maxcombine = smgrmaxcombine(operation->smgr, |
1377 | 0 | operation->forknum, |
1378 | 0 | blockNum); |
1379 | 0 | if (unlikely(maxcombine < actual_nblocks)) |
1380 | 0 | { |
1381 | 0 | elog(DEBUG2, "limiting nblocks at %u from %u to %u", |
1382 | 0 | blockNum, actual_nblocks, maxcombine); |
1383 | 0 | actual_nblocks = maxcombine; |
1384 | 0 | } |
1385 | 0 | } |
1386 | 0 | } |
1387 | 0 | } |
1388 | 0 | *nblocks = actual_nblocks; |
1389 | | |
1390 | | /* Populate information needed for I/O. */ |
1391 | 0 | operation->buffers = buffers; |
1392 | 0 | operation->blocknum = blockNum; |
1393 | 0 | operation->flags = flags; |
1394 | 0 | operation->nblocks = actual_nblocks; |
1395 | 0 | operation->nblocks_done = 0; |
1396 | 0 | pgaio_wref_clear(&operation->io_wref); |
1397 | | |
1398 | | /* |
1399 | | * When using AIO, start the IO in the background. If not, issue prefetch |
1400 | | * requests if desired by the caller. |
1401 | | * |
1402 | | * The reason we have a dedicated path for IOMETHOD_SYNC here is to |
1403 | | * de-risk the introduction of AIO somewhat. It's a large architectural |
1404 | | * change, with lots of chances for unanticipated performance effects. |
1405 | | * |
1406 | | * Use of IOMETHOD_SYNC already leads to not actually performing IO |
1407 | | * asynchronously, but without the check here we'd execute IO earlier than |
1408 | | * we used to. Eventually this IOMETHOD_SYNC specific path should go away. |
1409 | | */ |
1410 | 0 | if (io_method != IOMETHOD_SYNC) |
1411 | 0 | { |
1412 | | /* |
1413 | | * Try to start IO asynchronously. It's possible that no IO needs to |
1414 | | * be started, if another backend already performed the IO. |
1415 | | * |
1416 | | * Note that if an IO is started, it might not cover the entire |
1417 | | * requested range, e.g. because an intermediary block has been read |
1418 | | * in by another backend. In that case any "trailing" buffers we |
1419 | | * already pinned above will be "forwarded" by read_stream.c to the |
1420 | | * next call to StartReadBuffers(). |
1421 | | * |
1422 | | * This is signalled to the caller by decrementing *nblocks *and* |
1423 | | * reducing operation->nblocks. The latter is done here, but not in |
1424 | | * WaitReadBuffers(), because there we can't "shorten" the |
1425 | | * overall read size anymore; we need to retry until the read is done |
1426 | | * in its entirety or has failed. |
1427 | | */ |
1428 | 0 | did_start_io = AsyncReadBuffers(operation, nblocks); |
1429 | |
|
1430 | 0 | operation->nblocks = *nblocks; |
1431 | 0 | } |
1432 | 0 | else |
1433 | 0 | { |
1434 | 0 | operation->flags |= READ_BUFFERS_SYNCHRONOUSLY; |
1435 | |
|
1436 | 0 | if (flags & READ_BUFFERS_ISSUE_ADVICE) |
1437 | 0 | { |
1438 | | /* |
1439 | | * In theory we should only do this if PinBufferForBlock() had to |
1440 | | * allocate new buffers above. That way, if two calls to |
1441 | | * StartReadBuffers() were made for the same blocks before |
1442 | | * WaitReadBuffers(), only the first would issue the advice. |
1443 | | * That'd be a better simulation of true asynchronous I/O, which |
1444 | | * would only start the I/O once, but isn't done here for |
1445 | | * simplicity. |
1446 | | */ |
1447 | 0 | smgrprefetch(operation->smgr, |
1448 | 0 | operation->forknum, |
1449 | 0 | blockNum, |
1450 | 0 | actual_nblocks); |
1451 | 0 | } |
1452 | | |
1453 | | /* |
1454 | | * Indicate that WaitReadBuffers() should be called. WaitReadBuffers() |
1455 | | * will initiate the necessary IO. |
1456 | | */ |
1457 | 0 | did_start_io = true; |
1458 | 0 | } |
1459 | |
|
1460 | 0 | CheckReadBuffersOperation(operation, !did_start_io); |
1461 | |
|
1462 | 0 | return did_start_io; |
1463 | 0 | } |
1464 | | |
1465 | | /* |
1466 | | * Begin reading a range of blocks beginning at blockNum and extending for |
1467 | | * *nblocks. *nblocks and the buffers array are in/out parameters. On entry, |
1468 | | * the buffers elements covered by *nblocks must hold either InvalidBuffer or |
1469 | | * buffers forwarded by an earlier call to StartReadBuffers() that was split |
1470 | | * and is now being continued. On return, *nblocks holds the number of blocks |
1471 | | * accepted by this operation. If it is less than the original number then |
1472 | | * this operation has been split, but buffer elements up to the original |
1473 | | * requested size may hold forwarded buffers to be used for a continuing |
1474 | | * operation. The caller must either start a new I/O beginning at the block |
1475 | | * immediately following the blocks accepted by this call and pass those |
1476 | | * buffers back in, or release them if it chooses not to. It shouldn't make |
1477 | | * any other use of or assumptions about forwarded buffers. |
1478 | | * |
1479 | | * If false is returned, no I/O is necessary and the buffers covered by |
1480 | | * *nblocks on exit are valid and ready to be accessed. If true is returned, |
1481 | | * an I/O has been started, and WaitReadBuffers() must be called with the same |
1482 | | * operation object before the buffers covered by *nblocks on exit can be |
1483 | | * accessed. Along with the operation object, the caller-supplied array of |
1484 | | * buffers must remain valid until WaitReadBuffers() is called, and any |
1485 | | * forwarded buffers must also be preserved for a continuing call unless |
1486 | | * they are explicitly released. |
1487 | | */ |
1488 | | bool |
1489 | | StartReadBuffers(ReadBuffersOperation *operation, |
1490 | | Buffer *buffers, |
1491 | | BlockNumber blockNum, |
1492 | | int *nblocks, |
1493 | | int flags) |
1494 | 0 | { |
1495 | 0 | return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags, |
1496 | 0 | true /* expect forwarded buffers */ ); |
1497 | 0 | } |
1498 | | |
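/*
 * Editor's note: the following is an illustrative sketch, not part of
 * bufmgr.c. It shows one way a caller could drive the contract documented
 * above for StartReadBuffers()/WaitReadBuffers(), including passing
 * forwarded buffers back in after a split. The helper name, the use of
 * MAIN_FORKNUM, flags == 0, a NULL strategy, and the assumption that
 * count <= MAX_IO_COMBINE_LIMIT are all choices made for the example only;
 * real callers normally go through read_stream.c or ReadBuffer().
 */
#ifdef NOT_USED
static void
example_read_block_range(Relation rel, BlockNumber first, int count)
{
	Buffer		bufs[MAX_IO_COMBINE_LIMIT];
	BlockNumber next = first;
	int			remaining = count;	/* assumed <= MAX_IO_COMBINE_LIMIT */

	/* no forwarded buffers exist before the first call */
	for (int i = 0; i < remaining; i++)
		bufs[i] = InvalidBuffer;

	while (remaining > 0)
	{
		ReadBuffersOperation op;
		int			nblocks = remaining;

		op.smgr = RelationGetSmgr(rel);
		op.rel = rel;
		op.persistence = rel->rd_rel->relpersistence;
		op.forknum = MAIN_FORKNUM;
		op.strategy = NULL;

		/* returns true iff WaitReadBuffers() must be called */
		if (StartReadBuffers(&op, bufs, next, &nblocks, 0))
			WaitReadBuffers(&op);

		/* the first nblocks buffers are now valid; use and release them */
		for (int i = 0; i < nblocks; i++)
			ReleaseBuffer(bufs[i]);

		/*
		 * If the call was split, forwarded (still pinned) buffers for the
		 * remaining blocks may sit in bufs[nblocks .. remaining - 1]; move
		 * them to the front so they are passed back in on the next call.
		 */
		for (int i = nblocks; i < remaining; i++)
			bufs[i - nblocks] = bufs[i];

		next += nblocks;
		remaining -= nblocks;
	}
}
#endif							/* NOT_USED */
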
1499 | | /* |
1500 | | * Single-block version of StartReadBuffers(). This might save a few |
1501 | | * instructions when called from another translation unit, because it is |
1502 | | * specialized for nblocks == 1. |
1503 | | * |
1504 | | * This version does not support "forwarded" buffers: they cannot be created |
1505 | | * by reading only one block and *buffer is ignored on entry. |
1506 | | */ |
1507 | | bool |
1508 | | StartReadBuffer(ReadBuffersOperation *operation, |
1509 | | Buffer *buffer, |
1510 | | BlockNumber blocknum, |
1511 | | int flags) |
1512 | 0 | { |
1513 | 0 | int nblocks = 1; |
1514 | 0 | bool result; |
1515 | |
|
1516 | 0 | result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags, |
1517 | 0 | false /* single block, no forwarding */ ); |
1518 | 0 | Assert(nblocks == 1); /* single block can't be short */ |
1519 | |
|
1520 | 0 | return result; |
1521 | 0 | } |
1522 | | |
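/*
 * Editor's note: illustrative sketch only, not part of bufmgr.c. It shows
 * the minimal two-step use of StartReadBuffer()/WaitReadBuffers() for a
 * single block, roughly what a plain synchronous read boils down to. The
 * helper name and the use of MAIN_FORKNUM, flags == 0 and a NULL strategy
 * are assumptions made for the example.
 */
#ifdef NOT_USED
static Buffer
example_read_one_block(Relation rel, BlockNumber blocknum)
{
	ReadBuffersOperation op;
	Buffer		buffer;

	op.smgr = RelationGetSmgr(rel);
	op.rel = rel;
	op.persistence = rel->rd_rel->relpersistence;
	op.forknum = MAIN_FORKNUM;
	op.strategy = NULL;

	/* *buffer is ignored on entry; there is no forwarding for one block */
	if (StartReadBuffer(&op, &buffer, blocknum, 0))
		WaitReadBuffers(&op);

	return buffer;				/* pinned; caller must ReleaseBuffer() it */
}
#endif							/* NOT_USED */
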
1523 | | /* |
1524 | | * Perform sanity checks on the ReadBuffersOperation. |
1525 | | */ |
1526 | | static void |
1527 | | CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete) |
1528 | 0 | { |
1529 | | #ifdef USE_ASSERT_CHECKING |
1530 | | Assert(operation->nblocks_done <= operation->nblocks); |
1531 | | Assert(!is_complete || operation->nblocks == operation->nblocks_done); |
1532 | | |
1533 | | for (int i = 0; i < operation->nblocks; i++) |
1534 | | { |
1535 | | Buffer buffer = operation->buffers[i]; |
1536 | | BufferDesc *buf_hdr = BufferIsLocal(buffer) ? |
1537 | | GetLocalBufferDescriptor(-buffer - 1) : |
1538 | | GetBufferDescriptor(buffer - 1); |
1539 | | |
1540 | | Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i); |
1541 | | Assert(pg_atomic_read_u32(&buf_hdr->state) & BM_TAG_VALID); |
1542 | | |
1543 | | if (i < operation->nblocks_done) |
1544 | | Assert(pg_atomic_read_u32(&buf_hdr->state) & BM_VALID); |
1545 | | } |
1546 | | #endif |
1547 | 0 | } |
1548 | | |
1549 | | /* helper for ReadBuffersCanStartIO(), to avoid repetition */ |
1550 | | static inline bool |
1551 | | ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait) |
1552 | 0 | { |
1553 | 0 | if (BufferIsLocal(buffer)) |
1554 | 0 | return StartLocalBufferIO(GetLocalBufferDescriptor(-buffer - 1), |
1555 | 0 | true, nowait); |
1556 | 0 | else |
1557 | 0 | return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait); |
1558 | 0 | } |
1559 | | |
1560 | | /* |
1561 | | * Helper for AsyncReadBuffers that tries to get the buffer ready for IO. |
1562 | | */ |
1563 | | static inline bool |
1564 | | ReadBuffersCanStartIO(Buffer buffer, bool nowait) |
1565 | 0 | { |
1566 | | /* |
1567 | | * If this backend currently has staged IO, we need to submit the pending |
1568 | | * IO before waiting for the right to issue IO, to avoid the potential for |
1569 | | * deadlocks (and, more commonly, unnecessary delays for other backends). |
1570 | | */ |
1571 | 0 | if (!nowait && pgaio_have_staged()) |
1572 | 0 | { |
1573 | 0 | if (ReadBuffersCanStartIOOnce(buffer, true)) |
1574 | 0 | return true; |
1575 | | |
1576 | | /* |
1577 | | * Unfortunately StartBufferIO() returning false doesn't let us |
1578 | | * distinguish between the buffer already being valid and IO already |
1579 | | * being in progress. Since IO already being in progress is quite |
1580 | | * rare, this approach seems fine. |
1581 | | */ |
1582 | 0 | pgaio_submit_staged(); |
1583 | 0 | } |
1584 | | |
1585 | 0 | return ReadBuffersCanStartIOOnce(buffer, nowait); |
1586 | 0 | } |
1587 | | |
1588 | | /* |
1589 | | * Helper for WaitReadBuffers() that processes the results of a readv |
1590 | | * operation, raising an error if necessary. |
1591 | | */ |
1592 | | static void |
1593 | | ProcessReadBuffersResult(ReadBuffersOperation *operation) |
1594 | 0 | { |
1595 | 0 | PgAioReturn *aio_ret = &operation->io_return; |
1596 | 0 | PgAioResultStatus rs = aio_ret->result.status; |
1597 | 0 | int newly_read_blocks = 0; |
1598 | |
|
1599 | 0 | Assert(pgaio_wref_valid(&operation->io_wref)); |
1600 | 0 | Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN); |
1601 | | |
1602 | | /* |
1603 | | * SMGR reports the number of blocks successfully read as the result of |
1604 | | * the IO operation. Thus we can simply add that to ->nblocks_done. |
1605 | | */ |
1606 | |
|
1607 | 0 | if (likely(rs != PGAIO_RS_ERROR)) |
1608 | 0 | newly_read_blocks = aio_ret->result.result; |
1609 | |
|
1610 | 0 | if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING) |
1611 | 0 | pgaio_result_report(aio_ret->result, &aio_ret->target_data, |
1612 | 0 | rs == PGAIO_RS_ERROR ? ERROR : WARNING); |
1613 | 0 | else if (aio_ret->result.status == PGAIO_RS_PARTIAL) |
1614 | 0 | { |
1615 | | /* |
1616 | | * We'll retry, so we just emit a debug message to the server log (or |
1617 | | * not even that in production configurations). |
1618 | | */ |
1619 | 0 | pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1); |
1620 | 0 | elog(DEBUG3, "partial read, will retry"); |
1621 | 0 | } |
1622 | | |
1623 | 0 | Assert(newly_read_blocks > 0); |
1624 | 0 | Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT); |
1625 | |
|
1626 | 0 | operation->nblocks_done += newly_read_blocks; |
1627 | |
|
1628 | 0 | Assert(operation->nblocks_done <= operation->nblocks); |
1629 | 0 | } |
1630 | | |
1631 | | void |
1632 | | WaitReadBuffers(ReadBuffersOperation *operation) |
1633 | 0 | { |
1634 | 0 | PgAioReturn *aio_ret = &operation->io_return; |
1635 | 0 | IOContext io_context; |
1636 | 0 | IOObject io_object; |
1637 | |
|
1638 | 0 | if (operation->persistence == RELPERSISTENCE_TEMP) |
1639 | 0 | { |
1640 | 0 | io_context = IOCONTEXT_NORMAL; |
1641 | 0 | io_object = IOOBJECT_TEMP_RELATION; |
1642 | 0 | } |
1643 | 0 | else |
1644 | 0 | { |
1645 | 0 | io_context = IOContextForStrategy(operation->strategy); |
1646 | 0 | io_object = IOOBJECT_RELATION; |
1647 | 0 | } |
1648 | | |
1649 | | /* |
1650 | | * If we get here without an IO operation having been issued, the |
1651 | | * io_method == IOMETHOD_SYNC path must have been used. Otherwise the |
1652 | | * caller should not have called WaitReadBuffers(). |
1653 | | * |
1654 | | * In the case of IOMETHOD_SYNC, we start - as we used to before the |
1655 | | * introduction of AIO - the IO in WaitReadBuffers(). This is done as part |
1656 | | * of the retry logic below; no extra code is required. |
1657 | | * |
1658 | | * This path is expected to eventually go away. |
1659 | | */ |
1660 | 0 | if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC) |
1661 | 0 | elog(ERROR, "waiting for read operation that didn't read"); |
1662 | | |
1663 | | /* |
1664 | | * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're |
1665 | | * done. We may need multiple retries, not just because we could get |
1666 | | * multiple partial reads, but also because some of the remaining |
1667 | | * to-be-read buffers may have been read in by other backends, limiting |
1668 | | * the IO size. |
1669 | | */ |
1670 | 0 | while (true) |
1671 | 0 | { |
1672 | 0 | int ignored_nblocks_progress; |
1673 | |
|
1674 | 0 | CheckReadBuffersOperation(operation, false); |
1675 | | |
1676 | | /* |
1677 | | * If there is an IO associated with the operation, we may need to |
1678 | | * wait for it. |
1679 | | */ |
1680 | 0 | if (pgaio_wref_valid(&operation->io_wref)) |
1681 | 0 | { |
1682 | | /* |
1683 | | * Track the time spent waiting for the IO to complete. As |
1684 | | * tracking a wait even if we don't actually need to wait |
1685 | | * |
1686 | | * a) is not cheap, due to the timestamping overhead |
1687 | | * |
1688 | | * b) reports some time as waiting, even if we never waited |
1689 | | * |
1690 | | * we first check if we already know the IO is complete. |
1691 | | */ |
1692 | 0 | if (aio_ret->result.status == PGAIO_RS_UNKNOWN && |
1693 | 0 | !pgaio_wref_check_done(&operation->io_wref)) |
1694 | 0 | { |
1695 | 0 | instr_time io_start = pgstat_prepare_io_time(track_io_timing); |
1696 | |
|
1697 | 0 | pgaio_wref_wait(&operation->io_wref); |
1698 | | |
1699 | | /* |
1700 | | * The IO operation itself was already counted earlier, in |
1701 | | * AsyncReadBuffers(), this just accounts for the wait time. |
1702 | | */ |
1703 | 0 | pgstat_count_io_op_time(io_object, io_context, IOOP_READ, |
1704 | 0 | io_start, 0, 0); |
1705 | 0 | } |
1706 | 0 | else |
1707 | 0 | { |
1708 | 0 | Assert(pgaio_wref_check_done(&operation->io_wref)); |
1709 | 0 | } |
1710 | | |
1711 | | /* |
1712 | | * We now are sure the IO completed. Check the results. This |
1713 | | * includes reporting on errors if there were any. |
1714 | | */ |
1715 | 0 | ProcessReadBuffersResult(operation); |
1716 | 0 | } |
1717 | | |
1718 | | /* |
1719 | | * Most of the time, the one IO we already started will read in |
1720 | | * everything. But we need to deal with partial reads and buffers not |
1721 | | * needing IO anymore. |
1722 | | */ |
1723 | 0 | if (operation->nblocks_done == operation->nblocks) |
1724 | 0 | break; |
1725 | | |
1726 | 0 | CHECK_FOR_INTERRUPTS(); |
1727 | | |
1728 | | /* |
1729 | | * This may only complete the IO partially, either because some |
1730 | | * buffers were already valid, or because of a partial read. |
1731 | | * |
1732 | | * NB: In contrast to after the AsyncReadBuffers() call in |
1733 | | * StartReadBuffers(), we do *not* reduce |
1734 | | * ReadBuffersOperation->nblocks here; callers expect the full |
1735 | | * operation to be completed at this point (as more operations may |
1736 | | * have been queued). |
1737 | | */ |
1738 | 0 | AsyncReadBuffers(operation, &ignored_nblocks_progress); |
1739 | 0 | } |
1740 | |
|
1741 | 0 | CheckReadBuffersOperation(operation, true); |
1742 | | |
1743 | | /* NB: READ_DONE tracepoint was already executed in completion callback */ |
1744 | 0 | } |
1745 | | |
1746 | | /* |
1747 | | * Initiate IO for the ReadBuffersOperation |
1748 | | * |
1749 | | * This function only starts a single IO at a time. The size of the IO may be |
1750 | | * limited to below the to-be-read blocks, if one of the buffers has |
1751 | | * concurrently been read in. If the first to-be-read buffer is already valid, |
1752 | | * no IO will be issued. |
1753 | | * |
1754 | | * To support retries after partial reads, the first operation->nblocks_done |
1755 | | * buffers are skipped. |
1756 | | * |
1757 | | * On return, *nblocks_progress is updated to reflect the number of buffers |
1758 | | * affected by the call. If the first buffer is valid, *nblocks_progress is |
1759 | | * set to 1 and operation->nblocks_done is incremented. |
1760 | | * |
1761 | | * Returns true if IO was initiated, false if no IO was necessary. |
1762 | | */ |
1763 | | static bool |
1764 | | AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress) |
1765 | 0 | { |
1766 | 0 | Buffer *buffers = &operation->buffers[0]; |
1767 | 0 | int flags = operation->flags; |
1768 | 0 | BlockNumber blocknum = operation->blocknum; |
1769 | 0 | ForkNumber forknum = operation->forknum; |
1770 | 0 | char persistence = operation->persistence; |
1771 | 0 | int16 nblocks_done = operation->nblocks_done; |
1772 | 0 | Buffer *io_buffers = &operation->buffers[nblocks_done]; |
1773 | 0 | int io_buffers_len = 0; |
1774 | 0 | PgAioHandle *ioh; |
1775 | 0 | uint32 ioh_flags = 0; |
1776 | 0 | void *io_pages[MAX_IO_COMBINE_LIMIT]; |
1777 | 0 | IOContext io_context; |
1778 | 0 | IOObject io_object; |
1779 | 0 | bool did_start_io; |
1780 | | |
1781 | | /* |
1782 | | * When this IO is executed synchronously, either because the caller will |
1783 | | * immediately block waiting for the IO or because IOMETHOD_SYNC is used, |
1784 | | * the AIO subsystem needs to know. |
1785 | | */ |
1786 | 0 | if (flags & READ_BUFFERS_SYNCHRONOUSLY) |
1787 | 0 | ioh_flags |= PGAIO_HF_SYNCHRONOUS; |
1788 | |
|
1789 | 0 | if (persistence == RELPERSISTENCE_TEMP) |
1790 | 0 | { |
1791 | 0 | io_context = IOCONTEXT_NORMAL; |
1792 | 0 | io_object = IOOBJECT_TEMP_RELATION; |
1793 | 0 | ioh_flags |= PGAIO_HF_REFERENCES_LOCAL; |
1794 | 0 | } |
1795 | 0 | else |
1796 | 0 | { |
1797 | 0 | io_context = IOContextForStrategy(operation->strategy); |
1798 | 0 | io_object = IOOBJECT_RELATION; |
1799 | 0 | } |
1800 | | |
1801 | | /* |
1802 | | * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR |
1803 | | * flag. The reason for that is that, hopefully, zero_damaged_pages isn't |
1804 | | * set globally, but on a per-session basis. The completion callback, |
1805 | | * which may be run in other processes, e.g. in IO workers, may have a |
1806 | | * different value of the zero_damaged_pages GUC. |
1807 | | * |
1808 | | * XXX: We probably should eventually use a different flag for |
1809 | | * zero_damaged_pages, so we can report different log levels / error codes |
1810 | | * for zero_damaged_pages and ZERO_ON_ERROR. |
1811 | | */ |
1812 | 0 | if (zero_damaged_pages) |
1813 | 0 | flags |= READ_BUFFERS_ZERO_ON_ERROR; |
1814 | | |
1815 | | /* |
1816 | | * For the same reason as with zero_damaged_pages we need to use this |
1817 | | * backend's ignore_checksum_failure value. |
1818 | | */ |
1819 | 0 | if (ignore_checksum_failure) |
1820 | 0 | flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES; |
1821 | | |
1822 | | |
1823 | | /* |
1824 | | * To be allowed to report stats in the local completion callback we need |
1825 | | * to prepare to report stats now. This ensures we can safely report the |
1826 | | * checksum failure even in a critical section. |
1827 | | */ |
1828 | 0 | pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid); |
1829 | | |
1830 | | /* |
1831 | | * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire() |
1832 | | * might block, which we don't want after setting IO_IN_PROGRESS. |
1833 | | * |
1834 | | * If we need to wait for IO before we can get a handle, submit |
1835 | | * already-staged IO first, so that other backends don't need to wait. |
1836 | | * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to |
1837 | | * wait for already submitted IO, which doesn't require additional locks, |
1838 | | * but it could still cause undesirable waits. |
1839 | | * |
1840 | | * A secondary benefit is that this would allow us to measure the time in |
1841 | | * pgaio_io_acquire() without causing undue timer overhead in the common, |
1842 | | * non-blocking, case. However, currently the pgstats infrastructure |
1843 | | * doesn't really allow that, as it a) asserts that an operation can't |
1844 | | * have time without operations and b) doesn't have an API to report |
1845 | | * "accumulated" time. |
1846 | | */ |
1847 | 0 | ioh = pgaio_io_acquire_nb(CurrentResourceOwner, &operation->io_return); |
1848 | 0 | if (unlikely(!ioh)) |
1849 | 0 | { |
1850 | 0 | pgaio_submit_staged(); |
1851 | |
|
1852 | 0 | ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return); |
1853 | 0 | } |
1854 | | |
1855 | | /* |
1856 | | * Check if we can start IO on the first to-be-read buffer. |
1857 | | * |
1858 | | * If an I/O is already in progress in another backend, we want to wait |
1859 | | * for the outcome: either done, or something went wrong and we will |
1860 | | * retry. |
1861 | | */ |
1862 | 0 | if (!ReadBuffersCanStartIO(buffers[nblocks_done], false)) |
1863 | 0 | { |
1864 | | /* |
1865 | | * Someone else has already completed this block; we're done. |
1866 | | * |
1867 | | * When IO is necessary, ->nblocks_done is updated in |
1868 | | * ProcessReadBuffersResult(), but that is not called if no IO is |
1869 | | * necessary. Thus update here. |
1870 | | */ |
1871 | 0 | operation->nblocks_done += 1; |
1872 | 0 | *nblocks_progress = 1; |
1873 | |
|
1874 | 0 | pgaio_io_release(ioh); |
1875 | 0 | pgaio_wref_clear(&operation->io_wref); |
1876 | 0 | did_start_io = false; |
1877 | | |
1878 | | /* |
1879 | | * Report and track this as a 'hit' for this backend, even though it |
1880 | | * must have started out as a miss in PinBufferForBlock(). The other |
1881 | | * backend will track this as a 'read'. |
1882 | | */ |
1883 | 0 | TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done, |
1884 | 0 | operation->smgr->smgr_rlocator.locator.spcOid, |
1885 | 0 | operation->smgr->smgr_rlocator.locator.dbOid, |
1886 | 0 | operation->smgr->smgr_rlocator.locator.relNumber, |
1887 | 0 | operation->smgr->smgr_rlocator.backend, |
1888 | 0 | true); |
1889 | |
|
1890 | 0 | if (persistence == RELPERSISTENCE_TEMP) |
1891 | 0 | pgBufferUsage.local_blks_hit += 1; |
1892 | 0 | else |
1893 | 0 | pgBufferUsage.shared_blks_hit += 1; |
1894 | |
|
1895 | 0 | if (operation->rel) |
1896 | 0 | pgstat_count_buffer_hit(operation->rel); |
1897 | |
|
1898 | 0 | pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0); |
1899 | |
|
1900 | 0 | if (VacuumCostActive) |
1901 | 0 | VacuumCostBalance += VacuumCostPageHit; |
1902 | 0 | } |
1903 | 0 | else |
1904 | 0 | { |
1905 | 0 | instr_time io_start; |
1906 | | |
1907 | | /* We found a buffer that we need to read in. */ |
1908 | 0 | Assert(io_buffers[0] == buffers[nblocks_done]); |
1909 | 0 | io_pages[0] = BufferGetBlock(buffers[nblocks_done]); |
1910 | 0 | io_buffers_len = 1; |
1911 | | |
1912 | | /* |
1913 | | * How many neighboring-on-disk blocks can we scatter-read into other |
1914 | | * buffers at the same time? In this case we don't wait if we see an |
1915 | | * I/O already in progress. We already set BM_IO_IN_PROGRESS for the |
1916 | | * head block, so we should get on with that I/O as soon as possible. |
1917 | | */ |
1918 | 0 | for (int i = nblocks_done + 1; i < operation->nblocks; i++) |
1919 | 0 | { |
1920 | 0 | if (!ReadBuffersCanStartIO(buffers[i], true)) |
1921 | 0 | break; |
1922 | | /* Must be consecutive block numbers. */ |
1923 | 0 | Assert(BufferGetBlockNumber(buffers[i - 1]) == |
1924 | 0 | BufferGetBlockNumber(buffers[i]) - 1); |
1925 | 0 | Assert(io_buffers[io_buffers_len] == buffers[i]); |
1926 | |
|
1927 | 0 | io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]); |
1928 | 0 | } |
1929 | | |
1930 | | /* get a reference to wait for in WaitReadBuffers() */ |
1931 | 0 | pgaio_io_get_wref(ioh, &operation->io_wref); |
1932 | | |
1933 | | /* provide the list of buffers to the completion callbacks */ |
1934 | 0 | pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len); |
1935 | |
|
1936 | 0 | pgaio_io_register_callbacks(ioh, |
1937 | 0 | persistence == RELPERSISTENCE_TEMP ? |
1938 | 0 | PGAIO_HCB_LOCAL_BUFFER_READV : |
1939 | 0 | PGAIO_HCB_SHARED_BUFFER_READV, |
1940 | 0 | flags); |
1941 | |
|
1942 | 0 | pgaio_io_set_flag(ioh, ioh_flags); |
1943 | | |
1944 | | /* --- |
1945 | | * Even though we're trying to issue IO asynchronously, track the time |
1946 | | * in smgrstartreadv(): |
1947 | | * - if io_method == IOMETHOD_SYNC, we will always perform the IO |
1948 | | * immediately |
1949 | | * - the io method might not support the IO (e.g. worker IO for a temp |
1950 | | * table) |
1951 | | * --- |
1952 | | */ |
1953 | 0 | io_start = pgstat_prepare_io_time(track_io_timing); |
1954 | 0 | smgrstartreadv(ioh, operation->smgr, forknum, |
1955 | 0 | blocknum + nblocks_done, |
1956 | 0 | io_pages, io_buffers_len); |
1957 | 0 | pgstat_count_io_op_time(io_object, io_context, IOOP_READ, |
1958 | 0 | io_start, 1, io_buffers_len * BLCKSZ); |
1959 | |
|
1960 | 0 | if (persistence == RELPERSISTENCE_TEMP) |
1961 | 0 | pgBufferUsage.local_blks_read += io_buffers_len; |
1962 | 0 | else |
1963 | 0 | pgBufferUsage.shared_blks_read += io_buffers_len; |
1964 | | |
1965 | | /* |
1966 | | * Track vacuum cost when issuing IO, not after waiting for it. |
1967 | | * Otherwise we could end up issuing a lot of IO in a short timespan, |
1968 | | * despite a low cost limit. |
1969 | | */ |
1970 | 0 | if (VacuumCostActive) |
1971 | 0 | VacuumCostBalance += VacuumCostPageMiss * io_buffers_len; |
1972 | |
|
1973 | 0 | *nblocks_progress = io_buffers_len; |
1974 | 0 | did_start_io = true; |
1975 | 0 | } |
1976 | |
|
1977 | 0 | return did_start_io; |
1978 | 0 | } |
1979 | | |
1980 | | /* |
1981 | | * BufferAlloc -- subroutine for PinBufferForBlock. Handles lookup of a shared |
1982 | | * buffer. If no buffer exists already, selects a replacement victim and |
1983 | | * evicts the old page, but does NOT read in new page. |
1984 | | * evicts the old page, but does NOT read in the new page. |
1985 | | * "strategy" can be a buffer replacement strategy object, or NULL for |
1986 | | * the default strategy. The selected buffer's usage_count is advanced when |
1987 | | * using the default strategy, but otherwise possibly not (see PinBuffer). |
1988 | | * |
1989 | | * The returned buffer is pinned and is already marked as holding the |
1990 | | * desired page. If it already did have the desired page, *foundPtr is |
1991 | | * set true. Otherwise, *foundPtr is set false. |
1992 | | * |
1993 | | * io_context is passed in by the caller to avoid calling |
1994 | | * IOContextForStrategy() when there is a shared buffers hit and no IO |
1995 | | * statistics need be captured. |
1996 | | * |
1997 | | * No locks are held either at entry or exit. |
1998 | | */ |
1999 | | static pg_attribute_always_inline BufferDesc * |
2000 | | BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, |
2001 | | BlockNumber blockNum, |
2002 | | BufferAccessStrategy strategy, |
2003 | | bool *foundPtr, IOContext io_context) |
2004 | 0 | { |
2005 | 0 | BufferTag newTag; /* identity of requested block */ |
2006 | 0 | uint32 newHash; /* hash value for newTag */ |
2007 | 0 | LWLock *newPartitionLock; /* buffer partition lock for it */ |
2008 | 0 | int existing_buf_id; |
2009 | 0 | Buffer victim_buffer; |
2010 | 0 | BufferDesc *victim_buf_hdr; |
2011 | 0 | uint32 victim_buf_state; |
2012 | | |
2013 | | /* Make sure we will have room to remember the buffer pin */ |
2014 | 0 | ResourceOwnerEnlarge(CurrentResourceOwner); |
2015 | 0 | ReservePrivateRefCountEntry(); |
2016 | | |
2017 | | /* create a tag so we can lookup the buffer */ |
2018 | 0 | InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum); |
2019 | | |
2020 | | /* determine its hash code and partition lock ID */ |
2021 | 0 | newHash = BufTableHashCode(&newTag); |
2022 | 0 | newPartitionLock = BufMappingPartitionLock(newHash); |
2023 | | |
2024 | | /* see if the block is in the buffer pool already */ |
2025 | 0 | LWLockAcquire(newPartitionLock, LW_SHARED); |
2026 | 0 | existing_buf_id = BufTableLookup(&newTag, newHash); |
2027 | 0 | if (existing_buf_id >= 0) |
2028 | 0 | { |
2029 | 0 | BufferDesc *buf; |
2030 | 0 | bool valid; |
2031 | | |
2032 | | /* |
2033 | | * Found it. Now, pin the buffer so no one can steal it from the |
2034 | | * buffer pool, and check to see if the correct data has been loaded |
2035 | | * into the buffer. |
2036 | | */ |
2037 | 0 | buf = GetBufferDescriptor(existing_buf_id); |
2038 | |
|
2039 | 0 | valid = PinBuffer(buf, strategy); |
2040 | | |
2041 | | /* Can release the mapping lock as soon as we've pinned it */ |
2042 | 0 | LWLockRelease(newPartitionLock); |
2043 | |
|
2044 | 0 | *foundPtr = true; |
2045 | |
|
2046 | 0 | if (!valid) |
2047 | 0 | { |
2048 | | /* |
2049 | | * We can only get here if (a) someone else is still reading in |
2050 | | * the page, (b) a previous read attempt failed, or (c) someone |
2051 | | * called StartReadBuffers() but not yet WaitReadBuffers(). |
2052 | | */ |
2053 | 0 | *foundPtr = false; |
2054 | 0 | } |
2055 | |
|
2056 | 0 | return buf; |
2057 | 0 | } |
2058 | | |
2059 | | /* |
2060 | | * Didn't find it in the buffer pool. We'll have to initialize a new |
2061 | | * buffer. Remember to unlock the mapping lock while doing the work. |
2062 | | */ |
2063 | 0 | LWLockRelease(newPartitionLock); |
2064 | | |
2065 | | /* |
2066 | | * Acquire a victim buffer. Somebody else might try to do the same, we |
2067 | | * don't hold any conflicting locks. If so we'll have to undo our work |
2068 | | * later. |
2069 | | */ |
2070 | 0 | victim_buffer = GetVictimBuffer(strategy, io_context); |
2071 | 0 | victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1); |
2072 | | |
2073 | | /* |
2074 | | * Try to make a hashtable entry for the buffer under its new tag. If |
2075 | | * somebody else inserted another buffer for the tag, we'll release the |
2076 | | * victim buffer we acquired and use the already inserted one. |
2077 | | */ |
2078 | 0 | LWLockAcquire(newPartitionLock, LW_EXCLUSIVE); |
2079 | 0 | existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id); |
2080 | 0 | if (existing_buf_id >= 0) |
2081 | 0 | { |
2082 | 0 | BufferDesc *existing_buf_hdr; |
2083 | 0 | bool valid; |
2084 | | |
2085 | | /* |
2086 | | * Got a collision. Someone has already done what we were about to do. |
2087 | | * We'll just handle this as if it were found in the buffer pool in |
2088 | | * the first place. First, give up the buffer we were planning to |
2089 | | * use. |
2090 | | * |
2091 | | * We could do this after releasing the partition lock, but then we'd |
2092 | | * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry() |
2093 | | * before acquiring the lock, for the rare case of such a collision. |
2094 | | */ |
2095 | 0 | UnpinBuffer(victim_buf_hdr); |
2096 | | |
2097 | | /* |
2098 | | * The victim buffer we acquired previously is clean and unused, let |
2099 | | * it be found again quickly |
2100 | | */ |
2101 | 0 | StrategyFreeBuffer(victim_buf_hdr); |
2102 | | |
2103 | | /* remaining code should match code at top of routine */ |
2104 | |
|
2105 | 0 | existing_buf_hdr = GetBufferDescriptor(existing_buf_id); |
2106 | |
|
2107 | 0 | valid = PinBuffer(existing_buf_hdr, strategy); |
2108 | | |
2109 | | /* Can release the mapping lock as soon as we've pinned it */ |
2110 | 0 | LWLockRelease(newPartitionLock); |
2111 | |
|
2112 | 0 | *foundPtr = true; |
2113 | |
|
2114 | 0 | if (!valid) |
2115 | 0 | { |
2116 | | /* |
2117 | | * We can only get here if (a) someone else is still reading in |
2118 | | * the page, (b) a previous read attempt failed, or (c) someone |
2119 | | * called StartReadBuffers() but not yet WaitReadBuffers(). |
2120 | | */ |
2121 | 0 | *foundPtr = false; |
2122 | 0 | } |
2123 | |
|
2124 | 0 | return existing_buf_hdr; |
2125 | 0 | } |
2126 | | |
2127 | | /* |
2128 | | * Need to lock the buffer header too in order to change its tag. |
2129 | | */ |
2130 | 0 | victim_buf_state = LockBufHdr(victim_buf_hdr); |
2131 | | |
2132 | | /* some sanity checks while we hold the buffer header lock */ |
2133 | 0 | Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1); |
2134 | 0 | Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS))); |
2135 | |
|
2136 | 0 | victim_buf_hdr->tag = newTag; |
2137 | | |
2138 | | /* |
2139 | | * Make sure BM_PERMANENT is set for buffers that must be written at every |
2140 | | * checkpoint. Unlogged buffers only need to be written at shutdown |
2141 | | * checkpoints, except for their "init" forks, which need to be treated |
2142 | | * just like permanent relations. |
2143 | | */ |
2144 | 0 | victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE; |
2145 | 0 | if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM) |
2146 | 0 | victim_buf_state |= BM_PERMANENT; |
2147 | |
|
2148 | 0 | UnlockBufHdr(victim_buf_hdr, victim_buf_state); |
2149 | |
|
2150 | 0 | LWLockRelease(newPartitionLock); |
2151 | | |
2152 | | /* |
2153 | | * Buffer contents are currently invalid. |
2154 | | */ |
2155 | 0 | *foundPtr = false; |
2156 | |
|
2157 | 0 | return victim_buf_hdr; |
2158 | 0 | } |
2159 | | |
2160 | | /* |
2161 | | * InvalidateBuffer -- mark a shared buffer invalid and return it to the |
2162 | | * freelist. |
2163 | | * |
2164 | | * The buffer header spinlock must be held at entry. We drop it before |
2165 | | * returning. (This is sane because the caller must have locked the |
2166 | | * buffer in order to be sure it should be dropped.) |
2167 | | * |
2168 | | * This is used only in contexts such as dropping a relation. We assume |
2169 | | * that no other backend could possibly be interested in using the page, |
2170 | | * so the only reason the buffer might be pinned is if someone else is |
2171 | | * trying to write it out. We have to let them finish before we can |
2172 | | * reclaim the buffer. |
2173 | | * |
2174 | | * The buffer could get reclaimed by someone else while we are waiting |
2175 | | * to acquire the necessary locks; if so, don't mess it up. |
2176 | | */ |
2177 | | static void |
2178 | | InvalidateBuffer(BufferDesc *buf) |
2179 | 0 | { |
2180 | 0 | BufferTag oldTag; |
2181 | 0 | uint32 oldHash; /* hash value for oldTag */ |
2182 | 0 | LWLock *oldPartitionLock; /* buffer partition lock for it */ |
2183 | 0 | uint32 oldFlags; |
2184 | 0 | uint32 buf_state; |
2185 | | |
2186 | | /* Save the original buffer tag before dropping the spinlock */ |
2187 | 0 | oldTag = buf->tag; |
2188 | |
|
2189 | 0 | buf_state = pg_atomic_read_u32(&buf->state); |
2190 | 0 | Assert(buf_state & BM_LOCKED); |
2191 | 0 | UnlockBufHdr(buf, buf_state); |
2192 | | |
2193 | | /* |
2194 | | * Need to compute the old tag's hashcode and partition lock ID. XXX is it |
2195 | | * worth storing the hashcode in BufferDesc so we need not recompute it |
2196 | | * here? Probably not. |
2197 | | */ |
2198 | 0 | oldHash = BufTableHashCode(&oldTag); |
2199 | 0 | oldPartitionLock = BufMappingPartitionLock(oldHash); |
2200 | |
|
2201 | 0 | retry: |
2202 | | |
2203 | | /* |
2204 | | * Acquire exclusive mapping lock in preparation for changing the buffer's |
2205 | | * association. |
2206 | | */ |
2207 | 0 | LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE); |
2208 | | |
2209 | | /* Re-lock the buffer header */ |
2210 | 0 | buf_state = LockBufHdr(buf); |
2211 | | |
2212 | | /* If it's changed while we were waiting for lock, do nothing */ |
2213 | 0 | if (!BufferTagsEqual(&buf->tag, &oldTag)) |
2214 | 0 | { |
2215 | 0 | UnlockBufHdr(buf, buf_state); |
2216 | 0 | LWLockRelease(oldPartitionLock); |
2217 | 0 | return; |
2218 | 0 | } |
2219 | | |
2220 | | /* |
2221 | | * We assume the reason for it to be pinned is that either we were |
2222 | | * asynchronously reading the page in before erroring out or someone else |
2223 | | * is flushing the page out. Wait for the IO to finish. (This could be |
2224 | | * an infinite loop if the refcount is messed up... it would be nice to |
2225 | | * time out after a while, but there seems no way to be sure how many loops |
2226 | | * may be needed. Note that if the other guy has pinned the buffer but |
2227 | | * not yet done StartBufferIO, WaitIO will fall through and we'll |
2228 | | * effectively be busy-looping here.) |
2229 | | */ |
2230 | 0 | if (BUF_STATE_GET_REFCOUNT(buf_state) != 0) |
2231 | 0 | { |
2232 | 0 | UnlockBufHdr(buf, buf_state); |
2233 | 0 | LWLockRelease(oldPartitionLock); |
2234 | | /* safety check: should definitely not be our *own* pin */ |
2235 | 0 | if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0) |
2236 | 0 | elog(ERROR, "buffer is pinned in InvalidateBuffer"); |
2237 | 0 | WaitIO(buf); |
2238 | 0 | goto retry; |
2239 | 0 | } |
2240 | | |
2241 | | /* |
2242 | | * Clear out the buffer's tag and flags. We must do this to ensure that |
2243 | | * linear scans of the buffer array don't think the buffer is valid. |
2244 | | */ |
2245 | 0 | oldFlags = buf_state & BUF_FLAG_MASK; |
2246 | 0 | ClearBufferTag(&buf->tag); |
2247 | 0 | buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK); |
2248 | 0 | UnlockBufHdr(buf, buf_state); |
2249 | | |
2250 | | /* |
2251 | | * Remove the buffer from the lookup hashtable, if it was in there. |
2252 | | */ |
2253 | 0 | if (oldFlags & BM_TAG_VALID) |
2254 | 0 | BufTableDelete(&oldTag, oldHash); |
2255 | | |
2256 | | /* |
2257 | | * Done with mapping lock. |
2258 | | */ |
2259 | 0 | LWLockRelease(oldPartitionLock); |
2260 | | |
2261 | | /* |
2262 | | * Insert the buffer at the head of the list of free buffers. |
2263 | | */ |
2264 | 0 | StrategyFreeBuffer(buf); |
2265 | 0 | } |
2266 | | |
2267 | | /* |
2268 | | * Helper routine for GetVictimBuffer() |
2269 | | * |
2270 | | * Needs to be called on a buffer with a valid tag, pinned, but without the |
2271 | | * buffer header spinlock held. |
2272 | | * |
2273 | | * Returns true if the buffer can be reused, in which case the buffer is only |
2274 | | * pinned by this backend and marked as invalid, false otherwise. |
2275 | | */ |
2276 | | static bool |
2277 | | InvalidateVictimBuffer(BufferDesc *buf_hdr) |
2278 | 0 | { |
2279 | 0 | uint32 buf_state; |
2280 | 0 | uint32 hash; |
2281 | 0 | LWLock *partition_lock; |
2282 | 0 | BufferTag tag; |
2283 | |
|
2284 | 0 | Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1); |
2285 | | |
2286 | | /* have buffer pinned, so it's safe to read tag without lock */ |
2287 | 0 | tag = buf_hdr->tag; |
2288 | |
|
2289 | 0 | hash = BufTableHashCode(&tag); |
2290 | 0 | partition_lock = BufMappingPartitionLock(hash); |
2291 | |
|
2292 | 0 | LWLockAcquire(partition_lock, LW_EXCLUSIVE); |
2293 | | |
2294 | | /* lock the buffer header */ |
2295 | 0 | buf_state = LockBufHdr(buf_hdr); |
2296 | | |
2297 | | /* |
2298 | | * We have the buffer pinned, so nobody else should have been able to |
2299 | | * unset this concurrently. |
2300 | | */ |
2301 | 0 | Assert(buf_state & BM_TAG_VALID); |
2302 | 0 | Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); |
2303 | 0 | Assert(BufferTagsEqual(&buf_hdr->tag, &tag)); |
2304 | | |
2305 | | /* |
2306 | | * If somebody else pinned the buffer since, or even worse, dirtied it, |
2307 | | * give up on this buffer: It's clearly in use. |
2308 | | */ |
2309 | 0 | if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY)) |
2310 | 0 | { |
2311 | 0 | Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); |
2312 | |
|
2313 | 0 | UnlockBufHdr(buf_hdr, buf_state); |
2314 | 0 | LWLockRelease(partition_lock); |
2315 | |
|
2316 | 0 | return false; |
2317 | 0 | } |
2318 | | |
2319 | | /* |
2320 | | * Clear out the buffer's tag and flags and usagecount. This is not |
2321 | | * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before |
2322 | | * doing anything with the buffer. But currently it's beneficial, as the |
2323 | | * cheaper pre-check used by several linear scans of shared buffers |
2324 | | * relies on the tag (see e.g. FlushDatabaseBuffers()). |
2325 | | */ |
2326 | 0 | ClearBufferTag(&buf_hdr->tag); |
2327 | 0 | buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK); |
2328 | 0 | UnlockBufHdr(buf_hdr, buf_state); |
2329 | |
|
2330 | 0 | Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); |
2331 | | |
2332 | | /* finally delete buffer from the buffer mapping table */ |
2333 | 0 | BufTableDelete(&tag, hash); |
2334 | |
|
2335 | 0 | LWLockRelease(partition_lock); |
2336 | |
|
2337 | 0 | Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID))); |
2338 | 0 | Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); |
2339 | 0 | Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0); |
2340 | |
|
2341 | 0 | return true; |
2342 | 0 | } |
2343 | | |
2344 | | static Buffer |
2345 | | GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) |
2346 | 0 | { |
2347 | 0 | BufferDesc *buf_hdr; |
2348 | 0 | Buffer buf; |
2349 | 0 | uint32 buf_state; |
2350 | 0 | bool from_ring; |
2351 | | |
2352 | | /* |
2353 | | * Ensure, while the spinlock's not yet held, that there's a free refcount |
2354 | | * entry, and a resource owner slot for the pin. |
2355 | | */ |
2356 | 0 | ReservePrivateRefCountEntry(); |
2357 | 0 | ResourceOwnerEnlarge(CurrentResourceOwner); |
2358 | | |
2359 | | /* we return here if a prospective victim buffer gets used concurrently */ |
2360 | 0 | again: |
2361 | | |
2362 | | /* |
2363 | | * Select a victim buffer. The buffer is returned with its header |
2364 | | * spinlock still held! |
2365 | | */ |
2366 | 0 | buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring); |
2367 | 0 | buf = BufferDescriptorGetBuffer(buf_hdr); |
2368 | |
|
2369 | 0 | Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0); |
2370 | | |
2371 | | /* Pin the buffer and then release the buffer spinlock */ |
2372 | 0 | PinBuffer_Locked(buf_hdr); |
2373 | | |
2374 | | /* |
2375 | | * We shouldn't have any other pins for this buffer. |
2376 | | */ |
2377 | 0 | CheckBufferIsPinnedOnce(buf); |
2378 | | |
2379 | | /* |
2380 | | * If the buffer was dirty, try to write it out. There is a race |
2381 | | * condition here, in that someone might dirty it after we released the |
2382 | | * buffer header lock above, or even while we are writing it out (since |
2383 | | * our share-lock won't prevent hint-bit updates). We will recheck the |
2384 | | * dirty bit after re-locking the buffer header. |
2385 | | */ |
2386 | 0 | if (buf_state & BM_DIRTY) |
2387 | 0 | { |
2388 | 0 | LWLock *content_lock; |
2389 | |
|
2390 | 0 | Assert(buf_state & BM_TAG_VALID); |
2391 | 0 | Assert(buf_state & BM_VALID); |
2392 | | |
2393 | | /* |
2394 | | * We need a share-lock on the buffer contents to write it out (else |
2395 | | * we might write invalid data, eg because someone else is compacting |
2396 | | * the page contents while we write). We must use a conditional lock |
2397 | | * acquisition here to avoid deadlock. Even though the buffer was not |
2398 | | * pinned (and therefore surely not locked) when StrategyGetBuffer |
2399 | | * returned it, someone else could have pinned and exclusive-locked it |
2400 | | * by the time we get here. If we try to get the lock unconditionally, |
2401 | | * we'd block waiting for them; if they later block waiting for us, |
2402 | | * deadlock ensues. (This has been observed to happen when two |
2403 | | * backends are both trying to split btree index pages, and the second |
2404 | | * one just happens to be trying to split the page the first one got |
2405 | | * from StrategyGetBuffer.) |
2406 | | */ |
2407 | 0 | content_lock = BufferDescriptorGetContentLock(buf_hdr); |
2408 | 0 | if (!LWLockConditionalAcquire(content_lock, LW_SHARED)) |
2409 | 0 | { |
2410 | | /* |
2411 | | * Someone else has locked the buffer, so give it up and loop back |
2412 | | * to get another one. |
2413 | | */ |
2414 | 0 | UnpinBuffer(buf_hdr); |
2415 | 0 | goto again; |
2416 | 0 | } |
2417 | | |
2418 | | /* |
2419 | | * If using a nondefault strategy, and writing the buffer would |
2420 | | * require a WAL flush, let the strategy decide whether to go ahead |
2421 | | * and write/reuse the buffer or to choose another victim. We need a |
2422 | | * lock to inspect the page LSN, so this can't be done inside |
2423 | | * StrategyGetBuffer. |
2424 | | */ |
2425 | 0 | if (strategy != NULL) |
2426 | 0 | { |
2427 | 0 | XLogRecPtr lsn; |
2428 | | |
2429 | | /* Read the LSN while holding buffer header lock */ |
2430 | 0 | buf_state = LockBufHdr(buf_hdr); |
2431 | 0 | lsn = BufferGetLSN(buf_hdr); |
2432 | 0 | UnlockBufHdr(buf_hdr, buf_state); |
2433 | |
|
2434 | 0 | if (XLogNeedsFlush(lsn) |
2435 | 0 | && StrategyRejectBuffer(strategy, buf_hdr, from_ring)) |
2436 | 0 | { |
2437 | 0 | LWLockRelease(content_lock); |
2438 | 0 | UnpinBuffer(buf_hdr); |
2439 | 0 | goto again; |
2440 | 0 | } |
2441 | 0 | } |
2442 | | |
2443 | | /* OK, do the I/O */ |
2444 | 0 | FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context); |
2445 | 0 | LWLockRelease(content_lock); |
2446 | |
|
2447 | 0 | ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context, |
2448 | 0 | &buf_hdr->tag); |
2449 | 0 | } |
2450 | | |
2451 | | |
2452 | 0 | if (buf_state & BM_VALID) |
2453 | 0 | { |
2454 | | /* |
2455 | | * When a BufferAccessStrategy is in use, blocks evicted from shared |
2456 | | * buffers are counted as IOOP_EVICT in the corresponding context |
2457 | | * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a |
2458 | | * strategy in two cases: 1) while initially claiming buffers for the |
2459 | | * strategy ring 2) to replace an existing strategy ring buffer |
2460 | | * because it is pinned or in use and cannot be reused. |
2461 | | * |
2462 | | * Blocks evicted from buffers already in the strategy ring are |
2463 | | * counted as IOOP_REUSE in the corresponding strategy context. |
2464 | | * |
2465 | | * At this point, we can accurately count evictions and reuses, |
2466 | | * because we have successfully claimed the valid buffer. Previously, |
2467 | | * we may have been forced to release the buffer due to concurrent |
2468 | | * pinners or erroring out. |
2469 | | */ |
2470 | 0 | pgstat_count_io_op(IOOBJECT_RELATION, io_context, |
2471 | 0 | from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0); |
2472 | 0 | } |
2473 | | |
2474 | | /* |
2475 | | * If the buffer has an entry in the buffer mapping table, delete it. This |
2476 | | * can fail because another backend could have pinned or dirtied the |
2477 | | * buffer. |
2478 | | */ |
2479 | 0 | if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr)) |
2480 | 0 | { |
2481 | 0 | UnpinBuffer(buf_hdr); |
2482 | 0 | goto again; |
2483 | 0 | } |
2484 | | |
2485 | | /* a final set of sanity checks */ |
2486 | | #ifdef USE_ASSERT_CHECKING |
2487 | | buf_state = pg_atomic_read_u32(&buf_hdr->state); |
2488 | | |
2489 | | Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1); |
2490 | | Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY))); |
2491 | | |
2492 | | CheckBufferIsPinnedOnce(buf); |
2493 | | #endif |
2494 | | |
2495 | 0 | return buf; |
2496 | 0 | } |
2497 | | |
2498 | | /* |
2499 | | * Return the maximum number of buffers that a backend should try to pin at once, |
2500 | | * to avoid exceeding its fair share. This is the highest value that |
2501 | | * GetAdditionalPinLimit() could ever return. Note that it may be zero on a |
2502 | | * system with a very small buffer pool relative to max_connections. |
2503 | | */ |
2504 | | uint32 |
2505 | | GetPinLimit(void) |
2506 | 0 | { |
2507 | 0 | return MaxProportionalPins; |
2508 | 0 | } |
2509 | | |
2510 | | /* |
2511 | | * Return the maximum number of additional buffers that this backend should |
2512 | | * pin if it wants to stay under the per-backend limit, considering the number |
2513 | | * of buffers it has already pinned. Unlike LimitAdditionalPins(), the limit |
2514 | | * returned by this function can be zero. |
2515 | | */ |
2516 | | uint32 |
2517 | | GetAdditionalPinLimit(void) |
2518 | 0 | { |
2519 | 0 | uint32 estimated_pins_held; |
2520 | | |
2521 | | /* |
2522 | | * We get the number of "overflowed" pins for free, but don't know the |
2523 | | * number of pins in PrivateRefCountArray. The cost of calculating that |
2524 | | * exactly doesn't seem worth it, so just assume the max. |
2525 | | */ |
2526 | 0 | estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES; |
2527 | | |
2528 | | /* Is this backend already holding more than its fair share? */ |
2529 | 0 | if (estimated_pins_held > MaxProportionalPins) |
2530 | 0 | return 0; |
2531 | | |
2532 | 0 | return MaxProportionalPins - estimated_pins_held; |
2533 | 0 | } |
2534 | | |
2535 | | /* |
2536 | | * Limit the number of pins a batch operation may additionally acquire, to |
2537 | | * avoid running out of pinnable buffers. |
2538 | | * |
2539 | | * One additional pin is always allowed, on the assumption that the operation |
2540 | | * requires at least one to make progress. |
2541 | | */ |
2542 | | void |
2543 | | LimitAdditionalPins(uint32 *additional_pins) |
2544 | 0 | { |
2545 | 0 | uint32 limit; |
2546 | |
|
2547 | 0 | if (*additional_pins <= 1) |
2548 | 0 | return; |
2549 | | |
2550 | 0 | limit = GetAdditionalPinLimit(); |
2551 | 0 | limit = Max(limit, 1); |
2552 | 0 | if (limit < *additional_pins) |
2553 | 0 | *additional_pins = limit; |
2554 | 0 | } |
2555 | | |
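/*
 * Editor's note: illustrative sketch only, not part of bufmgr.c. It shows
 * how a batch operation could use the pin-limit functions above to size a
 * batch of buffer pins; the helper name and the starting batch size of 64
 * are assumptions made for the example.
 */
#ifdef NOT_USED
static uint32
example_choose_batch_size(void)
{
	uint32		batch_size = 64;	/* what the operation would like */

	/*
	 * Clamp to this backend's remaining fair share. LimitAdditionalPins()
	 * never reduces the value below 1, so the operation can always make
	 * some progress; GetAdditionalPinLimit() could be used instead if a
	 * result of zero needs to be handled explicitly.
	 */
	LimitAdditionalPins(&batch_size);

	return batch_size;
}
#endif							/* NOT_USED */
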
2556 | | /* |
2557 | | * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to |
2558 | | * avoid duplicating the tracing and relpersistence related logic. |
2559 | | */ |
2560 | | static BlockNumber |
2561 | | ExtendBufferedRelCommon(BufferManagerRelation bmr, |
2562 | | ForkNumber fork, |
2563 | | BufferAccessStrategy strategy, |
2564 | | uint32 flags, |
2565 | | uint32 extend_by, |
2566 | | BlockNumber extend_upto, |
2567 | | Buffer *buffers, |
2568 | | uint32 *extended_by) |
2569 | 0 | { |
2570 | 0 | BlockNumber first_block; |
2571 | |
|
2572 | 0 | TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork, |
2573 | 0 | bmr.smgr->smgr_rlocator.locator.spcOid, |
2574 | 0 | bmr.smgr->smgr_rlocator.locator.dbOid, |
2575 | 0 | bmr.smgr->smgr_rlocator.locator.relNumber, |
2576 | 0 | bmr.smgr->smgr_rlocator.backend, |
2577 | 0 | extend_by); |
2578 | |
|
2579 | 0 | if (bmr.relpersistence == RELPERSISTENCE_TEMP) |
2580 | 0 | first_block = ExtendBufferedRelLocal(bmr, fork, flags, |
2581 | 0 | extend_by, extend_upto, |
2582 | 0 | buffers, &extend_by); |
2583 | 0 | else |
2584 | 0 | first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags, |
2585 | 0 | extend_by, extend_upto, |
2586 | 0 | buffers, &extend_by); |
2587 | 0 | *extended_by = extend_by; |
2588 | |
|
2589 | 0 | TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork, |
2590 | 0 | bmr.smgr->smgr_rlocator.locator.spcOid, |
2591 | 0 | bmr.smgr->smgr_rlocator.locator.dbOid, |
2592 | 0 | bmr.smgr->smgr_rlocator.locator.relNumber, |
2593 | 0 | bmr.smgr->smgr_rlocator.backend, |
2594 | 0 | *extended_by, |
2595 | 0 | first_block); |
2596 | |
|
2597 | 0 | return first_block; |
2598 | 0 | } |
2599 | | |
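/*
 * Editor's note: illustrative sketch only, not part of bufmgr.c. It shows
 * how a caller might reach the shared extension code above through the
 * public ExtendBufferedRelBy() wrapper; the wrapper signature and the
 * BMR_REL() macro are assumed to match their declarations in bufmgr.h,
 * and the batch size of 8 is arbitrary.
 */
#ifdef NOT_USED
static BlockNumber
example_extend_by_eight(Relation rel)
{
	Buffer		buffers[8];
	uint32		extended_by = 0;
	BlockNumber first_new_block;

	/*
	 * Extend the main fork by (up to) 8 zero-filled blocks. With
	 * EB_LOCK_FIRST the buffer for the first new block is returned
	 * exclusive-locked; the others are only pinned.
	 */
	first_new_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
										  NULL, EB_LOCK_FIRST,
										  8, buffers, &extended_by);

	for (uint32 i = 0; i < extended_by; i++)
	{
		if (i == 0)
			UnlockReleaseBuffer(buffers[i]);	/* locked by EB_LOCK_FIRST */
		else
			ReleaseBuffer(buffers[i]);
	}

	return first_new_block;
}
#endif							/* NOT_USED */
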
2600 | | /* |
2601 | | * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for |
2602 | | * shared buffers. |
2603 | | */ |
2604 | | static BlockNumber |
2605 | | ExtendBufferedRelShared(BufferManagerRelation bmr, |
2606 | | ForkNumber fork, |
2607 | | BufferAccessStrategy strategy, |
2608 | | uint32 flags, |
2609 | | uint32 extend_by, |
2610 | | BlockNumber extend_upto, |
2611 | | Buffer *buffers, |
2612 | | uint32 *extended_by) |
2613 | 0 | { |
2614 | 0 | BlockNumber first_block; |
2615 | 0 | IOContext io_context = IOContextForStrategy(strategy); |
2616 | 0 | instr_time io_start; |
2617 | |
|
2618 | 0 | LimitAdditionalPins(&extend_by); |
2619 | | |
2620 | | /* |
2621 | | * Acquire victim buffers for extension without holding extension lock. |
2622 | | * Writing out victim buffers is the most expensive part of extending the |
2623 | | * relation, particularly when doing so requires WAL flushes. Zeroing out |
2624 | | * the buffers is also quite expensive, so do that before holding the |
2625 | | * extension lock as well. |
2626 | | * |
2627 | | * These pages are pinned by us and not valid. While we hold the pin they |
2628 | | * can't be acquired as victim buffers by another backend. |
2629 | | */ |
2630 | 0 | for (uint32 i = 0; i < extend_by; i++) |
2631 | 0 | { |
2632 | 0 | Block buf_block; |
2633 | |
|
2634 | 0 | buffers[i] = GetVictimBuffer(strategy, io_context); |
2635 | 0 | buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1)); |
2636 | | |
2637 | | /* new buffers are zero-filled */ |
2638 | 0 | MemSet(buf_block, 0, BLCKSZ); |
2639 | 0 | } |
2640 | | |
2641 | | /* |
2642 | | * Lock relation against concurrent extensions, unless requested not to. |
2643 | | * |
2644 | | * We use the same extension lock for all forks. That's unnecessarily |
2645 | | * restrictive, but currently extensions for forks don't happen often |
2646 | | * enough to make it worth locking more granularly. |
2647 | | * |
2648 | | * Note that another backend might have extended the relation by the time |
2649 | | * we get the lock. |
2650 | | */ |
2651 | 0 | if (!(flags & EB_SKIP_EXTENSION_LOCK)) |
2652 | 0 | LockRelationForExtension(bmr.rel, ExclusiveLock); |
2653 | | |
2654 | | /* |
2655 | | * If requested, invalidate size cache, so that smgrnblocks asks the |
2656 | | * kernel. |
2657 | | */ |
2658 | 0 | if (flags & EB_CLEAR_SIZE_CACHE) |
2659 | 0 | bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber; |
2660 | |
|
2661 | 0 | first_block = smgrnblocks(bmr.smgr, fork); |
2662 | | |
2663 | | /* |
2664 | | * Now that we have the accurate relation size, check if the caller wants |
2665 | | * us to extend only up to a specific size. If there were concurrent |
2666 | | * extensions, we might have acquired too many buffers and need to release |
2667 | | * them. |
2668 | | */ |
2669 | 0 | if (extend_upto != InvalidBlockNumber) |
2670 | 0 | { |
2671 | 0 | uint32 orig_extend_by = extend_by; |
2672 | |
|
2673 | 0 | if (first_block > extend_upto) |
2674 | 0 | extend_by = 0; |
2675 | 0 | else if ((uint64) first_block + extend_by > extend_upto) |
2676 | 0 | extend_by = extend_upto - first_block; |
2677 | |
|
2678 | 0 | for (uint32 i = extend_by; i < orig_extend_by; i++) |
2679 | 0 | { |
2680 | 0 | BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1); |
2681 | | |
2682 | | /* |
2683 | | * The victim buffer we acquired previously is clean and unused, |
2684 | | * let it be found again quickly |
2685 | | */ |
2686 | 0 | StrategyFreeBuffer(buf_hdr); |
2687 | 0 | UnpinBuffer(buf_hdr); |
2688 | 0 | } |
2689 | |
|
2690 | 0 | if (extend_by == 0) |
2691 | 0 | { |
2692 | 0 | if (!(flags & EB_SKIP_EXTENSION_LOCK)) |
2693 | 0 | UnlockRelationForExtension(bmr.rel, ExclusiveLock); |
2694 | 0 | *extended_by = extend_by; |
2695 | 0 | return first_block; |
2696 | 0 | } |
2697 | 0 | } |
2698 | | |
2699 | | /* Fail if relation is already at maximum possible length */ |
2700 | 0 | if ((uint64) first_block + extend_by >= MaxBlockNumber) |
2701 | 0 | ereport(ERROR, |
2702 | 0 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
2703 | 0 | errmsg("cannot extend relation %s beyond %u blocks", |
2704 | 0 | relpath(bmr.smgr->smgr_rlocator, fork).str, |
2705 | 0 | MaxBlockNumber))); |
2706 | | |
2707 | | /* |
2708 | | * Insert buffers into buffer table, mark as IO_IN_PROGRESS. |
2709 | | * |
2710 | | * This needs to happen before we extend the relation, because as soon as |
2711 | | * we do, other backends can start to read in those pages. |
2712 | | */ |
2713 | 0 | for (uint32 i = 0; i < extend_by; i++) |
2714 | 0 | { |
2715 | 0 | Buffer victim_buf = buffers[i]; |
2716 | 0 | BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1); |
2717 | 0 | BufferTag tag; |
2718 | 0 | uint32 hash; |
2719 | 0 | LWLock *partition_lock; |
2720 | 0 | int existing_id; |
2721 | | |
2722 | | /* in case we need to pin an existing buffer below */ |
2723 | 0 | ResourceOwnerEnlarge(CurrentResourceOwner); |
2724 | 0 | ReservePrivateRefCountEntry(); |
2725 | |
|
2726 | 0 | InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i); |
2727 | 0 | hash = BufTableHashCode(&tag); |
2728 | 0 | partition_lock = BufMappingPartitionLock(hash); |
2729 | |
|
2730 | 0 | LWLockAcquire(partition_lock, LW_EXCLUSIVE); |
2731 | |
|
2732 | 0 | existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id); |
2733 | | |
2734 | | /* |
2735 | | * We get here only in the corner case where we are trying to extend |
2736 | | * the relation but we found a pre-existing buffer. This can happen |
2737 | | * because a prior attempt at extending the relation failed, and |
2738 | | * because mdread doesn't complain about reads beyond EOF (when |
2739 | | * zero_damaged_pages is ON) and so a previous attempt to read a block |
2740 | | * beyond EOF could have left a "valid" zero-filled buffer. |
2741 | | * |
2742 | | * This has also been observed when the relation was overwritten by an |
2743 | | * external process. Since the legitimate cases should always have |
2744 | | * left a zero-filled buffer, complain if not PageIsNew. |
2745 | | */ |
2746 | 0 | if (existing_id >= 0) |
2747 | 0 | { |
2748 | 0 | BufferDesc *existing_hdr = GetBufferDescriptor(existing_id); |
2749 | 0 | Block buf_block; |
2750 | 0 | bool valid; |
2751 | | |
2752 | | /* |
2753 | | * Pin the existing buffer before releasing the partition lock, |
2754 | | * preventing it from being evicted. |
2755 | | */ |
2756 | 0 | valid = PinBuffer(existing_hdr, strategy); |
2757 | |
|
2758 | 0 | LWLockRelease(partition_lock); |
2759 | | |
2760 | | /* |
2761 | | * The victim buffer we acquired previously is clean and unused, |
2762 | | * let it be found again quickly |
2763 | | */ |
2764 | 0 | StrategyFreeBuffer(victim_buf_hdr); |
2765 | 0 | UnpinBuffer(victim_buf_hdr); |
2766 | |
|
2767 | 0 | buffers[i] = BufferDescriptorGetBuffer(existing_hdr); |
2768 | 0 | buf_block = BufHdrGetBlock(existing_hdr); |
2769 | |
|
2770 | 0 | if (valid && !PageIsNew((Page) buf_block)) |
2771 | 0 | ereport(ERROR, |
2772 | 0 | (errmsg("unexpected data beyond EOF in block %u of relation %s", |
2773 | 0 | existing_hdr->tag.blockNum, |
2774 | 0 | relpath(bmr.smgr->smgr_rlocator, fork).str))); |
2775 | | |
2776 | | /* |
2777 | | * We *must* do smgr[zero]extend before succeeding, else the page |
2778 | | * will not be reserved by the kernel, and the next P_NEW call |
2779 | | * will decide to return the same page. Clear the BM_VALID bit, |
2780 | | * do StartBufferIO() and proceed. |
2781 | | * |
2782 | | * Loop to handle the very small possibility that someone re-sets |
2783 | | * BM_VALID between our clearing it and StartBufferIO inspecting |
2784 | | * it. |
2785 | | */ |
2786 | 0 | do |
2787 | 0 | { |
2788 | 0 | uint32 buf_state = LockBufHdr(existing_hdr); |
2789 | |
|
2790 | 0 | buf_state &= ~BM_VALID; |
2791 | 0 | UnlockBufHdr(existing_hdr, buf_state); |
2792 | 0 | } while (!StartBufferIO(existing_hdr, true, false)); |
2793 | 0 | } |
2794 | 0 | else |
2795 | 0 | { |
2796 | 0 | uint32 buf_state; |
2797 | |
|
2798 | 0 | buf_state = LockBufHdr(victim_buf_hdr); |
2799 | | |
2800 | | /* some sanity checks while we hold the buffer header lock */ |
2801 | 0 | Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED))); |
2802 | 0 | Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1); |
2803 | |
|
2804 | 0 | victim_buf_hdr->tag = tag; |
2805 | |
|
2806 | 0 | buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE; |
2807 | 0 | if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM) |
2808 | 0 | buf_state |= BM_PERMANENT; |
2809 | |
|
2810 | 0 | UnlockBufHdr(victim_buf_hdr, buf_state); |
2811 | |
|
2812 | 0 | LWLockRelease(partition_lock); |
2813 | | |
2814 | | /* XXX: could combine StartBufferIO()'s locked operations with those above */ |
2815 | 0 | StartBufferIO(victim_buf_hdr, true, false); |
2816 | 0 | } |
2817 | 0 | } |
2818 | | |
2819 | 0 | io_start = pgstat_prepare_io_time(track_io_timing); |
2820 | | |
2821 | | /* |
2822 | | * Note: if smgrzeroextend fails, we will end up with buffers that are |
2823 | | * allocated but not marked BM_VALID. The next relation extension will |
2824 | | * still select the same block number (because the relation didn't get any |
2825 | | * longer on disk) and so future attempts to extend the relation will find |
2826 | | * the same buffers (if they have not been recycled) but come right back |
2827 | | * here to try smgrzeroextend again. |
2828 | | * |
2829 | | * We don't need to set checksum for all-zero pages. |
2830 | | */ |
2831 | 0 | smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false); |
2832 | | |
2833 | | /* |
2834 | | * Release the file-extension lock; it's now OK for someone else to extend |
2835 | | * the relation some more. |
2836 | | * |
2837 | | * We remove IO_IN_PROGRESS after this, as waking up waiting backends can |
2838 | | * take noticeable time. |
2839 | | */ |
2840 | 0 | if (!(flags & EB_SKIP_EXTENSION_LOCK)) |
2841 | 0 | UnlockRelationForExtension(bmr.rel, ExclusiveLock); |
2842 | |
|
2843 | 0 | pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND, |
2844 | 0 | io_start, 1, extend_by * BLCKSZ); |
2845 | | |
2846 | | /* Set BM_VALID, terminate IO, and wake up any waiters */ |
2847 | 0 | for (uint32 i = 0; i < extend_by; i++) |
2848 | 0 | { |
2849 | 0 | Buffer buf = buffers[i]; |
2850 | 0 | BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1); |
2851 | 0 | bool lock = false; |
2852 | |
|
2853 | 0 | if (flags & EB_LOCK_FIRST && i == 0) |
2854 | 0 | lock = true; |
2855 | 0 | else if (flags & EB_LOCK_TARGET) |
2856 | 0 | { |
2857 | 0 | Assert(extend_upto != InvalidBlockNumber); |
2858 | 0 | if (first_block + i + 1 == extend_upto) |
2859 | 0 | lock = true; |
2860 | 0 | } |
2861 | |
|
2862 | 0 | if (lock) |
2863 | 0 | LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE); |
2864 | |
|
2865 | 0 | TerminateBufferIO(buf_hdr, false, BM_VALID, true, false); |
2866 | 0 | } |
2867 | |
|
2868 | 0 | pgBufferUsage.shared_blks_written += extend_by; |
2869 | |
|
2870 | 0 | *extended_by = extend_by; |
2871 | |
|
2872 | 0 | return first_block; |
2873 | 0 | } |
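/*
 * Illustrative sketch, not part of bufmgr.c: how a hypothetical caller might
 * drive the bulk-extension path above through the public API.  The
 * ExtendBufferedRelBy() signature, the BMR_REL() initializer and the
 * EB_LOCK_FIRST flag are assumed from bufmgr.h; treat this as a sketch of the
 * calling convention, not the definitive one.
 */
static BlockNumber
example_extend_by_four(Relation rel)
{
	Buffer		bufs[4];
	uint32		extended_by = 0;
	BlockNumber first_new_block;

	/* request four new zero-filled blocks; the first buffer comes back locked */
	first_new_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
										  NULL /* no strategy */ ,
										  EB_LOCK_FIRST,
										  4, bufs, &extended_by);

	/* initialize and dirty the pages as needed, then let go of the pins */
	for (uint32 i = 0; i < extended_by; i++)
	{
		if (i == 0)
			UnlockReleaseBuffer(bufs[i]);	/* locked because of EB_LOCK_FIRST */
		else
			ReleaseBuffer(bufs[i]);
	}

	return first_new_block;
}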
2874 | | |
2875 | | /* |
2876 | | * BufferIsExclusiveLocked |
2877 | | * |
2878 | | * Checks if buffer is exclusive-locked. |
2879 | | * |
2880 | | * Buffer must be pinned. |
2881 | | */ |
2882 | | bool |
2883 | | BufferIsExclusiveLocked(Buffer buffer) |
2884 | 0 | { |
2885 | 0 | BufferDesc *bufHdr; |
2886 | |
|
2887 | 0 | Assert(BufferIsPinned(buffer)); |
2888 | |
|
2889 | 0 | if (BufferIsLocal(buffer)) |
2890 | 0 | { |
2891 | | /* Content locks are not maintained for local buffers. */ |
2892 | 0 | return true; |
2893 | 0 | } |
2894 | 0 | else |
2895 | 0 | { |
2896 | 0 | bufHdr = GetBufferDescriptor(buffer - 1); |
2897 | 0 | return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr), |
2898 | 0 | LW_EXCLUSIVE); |
2899 | 0 | } |
2900 | 0 | } |
2901 | | |
2902 | | /* |
2903 | | * BufferIsDirty |
2904 | | * |
2905 | | * Checks if buffer is already dirty. |
2906 | | * |
2907 | | * Buffer must be pinned and exclusive-locked. (Without an exclusive lock, |
2908 | | * the result may be stale before it's returned.) |
2909 | | */ |
2910 | | bool |
2911 | | BufferIsDirty(Buffer buffer) |
2912 | 0 | { |
2913 | 0 | BufferDesc *bufHdr; |
2914 | |
|
2915 | 0 | Assert(BufferIsPinned(buffer)); |
2916 | |
|
2917 | 0 | if (BufferIsLocal(buffer)) |
2918 | 0 | { |
2919 | 0 | int bufid = -buffer - 1; |
2920 | |
|
2921 | 0 | bufHdr = GetLocalBufferDescriptor(bufid); |
2922 | | /* Content locks are not maintained for local buffers. */ |
2923 | 0 | } |
2924 | 0 | else |
2925 | 0 | { |
2926 | 0 | bufHdr = GetBufferDescriptor(buffer - 1); |
2927 | 0 | Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr), |
2928 | 0 | LW_EXCLUSIVE)); |
2929 | 0 | } |
2930 | |
|
2931 | 0 | return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY; |
2932 | 0 | } |
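/*
 * Illustrative sketch, not part of bufmgr.c: a hypothetical caller using the
 * two checks above to enforce its locking protocol before touching a page.
 */
static void
example_ensure_dirty(Buffer buffer)
{
	/* the caller must already hold the content lock exclusively */
	Assert(BufferIsExclusiveLocked(buffer));

	/* only pay for MarkBufferDirty() when the page isn't dirty yet */
	if (!BufferIsDirty(buffer))
		MarkBufferDirty(buffer);
}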
2933 | | |
2934 | | /* |
2935 | | * MarkBufferDirty |
2936 | | * |
2937 | | * Marks buffer contents as dirty (actual write happens later). |
2938 | | * |
2939 | | * Buffer must be pinned and exclusive-locked. (If caller does not hold |
2940 | | * exclusive lock, then somebody could be in process of writing the buffer, |
2941 | | * leading to risk of bad data written to disk.) |
2942 | | */ |
2943 | | void |
2944 | | MarkBufferDirty(Buffer buffer) |
2945 | 0 | { |
2946 | 0 | BufferDesc *bufHdr; |
2947 | 0 | uint32 buf_state; |
2948 | 0 | uint32 old_buf_state; |
2949 | |
|
2950 | 0 | if (!BufferIsValid(buffer)) |
2951 | 0 | elog(ERROR, "bad buffer ID: %d", buffer); |
2952 | | |
2953 | 0 | if (BufferIsLocal(buffer)) |
2954 | 0 | { |
2955 | 0 | MarkLocalBufferDirty(buffer); |
2956 | 0 | return; |
2957 | 0 | } |
2958 | | |
2959 | 0 | bufHdr = GetBufferDescriptor(buffer - 1); |
2960 | |
|
2961 | 0 | Assert(BufferIsPinned(buffer)); |
2962 | 0 | Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr), |
2963 | 0 | LW_EXCLUSIVE)); |
2964 | |
|
2965 | 0 | old_buf_state = pg_atomic_read_u32(&bufHdr->state); |
2966 | 0 | for (;;) |
2967 | 0 | { |
2968 | 0 | if (old_buf_state & BM_LOCKED) |
2969 | 0 | old_buf_state = WaitBufHdrUnlocked(bufHdr); |
2970 | |
|
2971 | 0 | buf_state = old_buf_state; |
2972 | |
|
2973 | 0 | Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); |
2974 | 0 | buf_state |= BM_DIRTY | BM_JUST_DIRTIED; |
2975 | |
|
2976 | 0 | if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state, |
2977 | 0 | buf_state)) |
2978 | 0 | break; |
2979 | 0 | } |
2980 | | |
2981 | | /* |
2982 | | * If the buffer was not dirty already, do vacuum accounting. |
2983 | | */ |
2984 | 0 | if (!(old_buf_state & BM_DIRTY)) |
2985 | 0 | { |
2986 | 0 | pgBufferUsage.shared_blks_dirtied++; |
2987 | 0 | if (VacuumCostActive) |
2988 | 0 | VacuumCostBalance += VacuumCostPageDirty; |
2989 | 0 | } |
2990 | 0 | } |
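/*
 * Illustrative sketch, not part of bufmgr.c: the usual WAL-logged update
 * sequence around MarkBufferDirty(), following the protocol described in
 * access/transam/README.  RM_FOO_ID and XLOG_FOO_OP are hypothetical
 * placeholders for the caller's own resource manager and record type.
 */
static void
example_logged_update(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);
	Page		page;
	XLogRecPtr	lsn;

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	page = BufferGetPage(buf);

	START_CRIT_SECTION();

	/* ... apply the change to "page" here ... */
	MarkBufferDirty(buf);

	XLogBeginInsert();
	XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
	lsn = XLogInsert(RM_FOO_ID, XLOG_FOO_OP);
	PageSetLSN(page, lsn);

	END_CRIT_SECTION();

	UnlockReleaseBuffer(buf);
}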
2991 | | |
2992 | | /* |
2993 | | * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer() |
2994 | | * |
2995 | | * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock |
2996 | | * compared to calling the two routines separately. Now it's mainly just |
2997 | | * a convenience function. However, if the passed buffer is valid and |
2998 | | * already contains the desired block, we just return it as-is; and that |
2999 | | * does save considerable work compared to a full release and reacquire. |
3000 | | * |
3001 | | * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old |
3002 | | * buffer actually needs to be released. This case is the same as ReadBuffer, |
3003 | | * but can save some tests in the caller. |
3004 | | */ |
3005 | | Buffer |
3006 | | ReleaseAndReadBuffer(Buffer buffer, |
3007 | | Relation relation, |
3008 | | BlockNumber blockNum) |
3009 | 0 | { |
3010 | 0 | ForkNumber forkNum = MAIN_FORKNUM; |
3011 | 0 | BufferDesc *bufHdr; |
3012 | |
|
3013 | 0 | if (BufferIsValid(buffer)) |
3014 | 0 | { |
3015 | 0 | Assert(BufferIsPinned(buffer)); |
3016 | 0 | if (BufferIsLocal(buffer)) |
3017 | 0 | { |
3018 | 0 | bufHdr = GetLocalBufferDescriptor(-buffer - 1); |
3019 | 0 | if (bufHdr->tag.blockNum == blockNum && |
3020 | 0 | BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) && |
3021 | 0 | BufTagGetForkNum(&bufHdr->tag) == forkNum) |
3022 | 0 | return buffer; |
3023 | 0 | UnpinLocalBuffer(buffer); |
3024 | 0 | } |
3025 | 0 | else |
3026 | 0 | { |
3027 | 0 | bufHdr = GetBufferDescriptor(buffer - 1); |
3028 | | /* we have pin, so it's ok to examine tag without spinlock */ |
3029 | 0 | if (bufHdr->tag.blockNum == blockNum && |
3030 | 0 | BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) && |
3031 | 0 | BufTagGetForkNum(&bufHdr->tag) == forkNum) |
3032 | 0 | return buffer; |
3033 | 0 | UnpinBuffer(bufHdr); |
3034 | 0 | } |
3035 | 0 | } |
3036 | | |
3037 | 0 | return ReadBuffer(relation, blockNum); |
3038 | 0 | } |
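/*
 * Illustrative sketch, not part of bufmgr.c: the pattern ReleaseAndReadBuffer()
 * is meant for -- walking a sequence of blocks while holding at most one pin,
 * and paying nothing when consecutive lookups hit the same block.
 */
static void
example_walk_blocks(Relation rel, BlockNumber *blocks, int nblocks)
{
	Buffer		buf = InvalidBuffer;

	for (int i = 0; i < nblocks; i++)
	{
		buf = ReleaseAndReadBuffer(buf, rel, blocks[i]);
		/* ... inspect BufferGetPage(buf) under an appropriate content lock ... */
	}

	if (BufferIsValid(buf))
		ReleaseBuffer(buf);
}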
3039 | | |
3040 | | /* |
3041 | | * PinBuffer -- make buffer unavailable for replacement. |
3042 | | * |
3043 | | * For the default access strategy, the buffer's usage_count is incremented |
3044 | | * when we first pin it; for other strategies we just make sure the usage_count |
3045 | | * isn't zero. (The idea of the latter is that we don't want synchronized |
3046 | | * heap scans to inflate the count, but we need it to not be zero to discourage |
3047 | | * other backends from stealing buffers from our ring. As long as we cycle |
3048 | | * through the ring faster than the global clock-sweep cycles, buffers in |
3049 | | * our ring won't be chosen as victims for replacement by other backends.) |
3050 | | * |
3051 | | * This should be applied only to shared buffers, never local ones. |
3052 | | * |
3053 | | * Since buffers are pinned/unpinned very frequently, pin buffers without |
3054 | | * taking the buffer header lock; instead update the state variable in a loop |
3055 | | * of CAS operations. Hopefully it's just a single CAS. |
3056 | | * |
3057 | | * Note that ResourceOwnerEnlarge() and ReservePrivateRefCountEntry() |
3058 | | * must have been done already. |
3059 | | * |
3060 | | * Returns true if buffer is BM_VALID, else false. This provision allows |
3061 | | * some callers to avoid an extra spinlock cycle. |
3062 | | */ |
3063 | | static bool |
3064 | | PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) |
3065 | 0 | { |
3066 | 0 | Buffer b = BufferDescriptorGetBuffer(buf); |
3067 | 0 | bool result; |
3068 | 0 | PrivateRefCountEntry *ref; |
3069 | |
|
3070 | 0 | Assert(!BufferIsLocal(b)); |
3071 | 0 | Assert(ReservedRefCountEntry != NULL); |
3072 | |
|
3073 | 0 | ref = GetPrivateRefCountEntry(b, true); |
3074 | |
|
3075 | 0 | if (ref == NULL) |
3076 | 0 | { |
3077 | 0 | uint32 buf_state; |
3078 | 0 | uint32 old_buf_state; |
3079 | |
|
3080 | 0 | ref = NewPrivateRefCountEntry(b); |
3081 | |
|
3082 | 0 | old_buf_state = pg_atomic_read_u32(&buf->state); |
3083 | 0 | for (;;) |
3084 | 0 | { |
3085 | 0 | if (old_buf_state & BM_LOCKED) |
3086 | 0 | old_buf_state = WaitBufHdrUnlocked(buf); |
3087 | |
|
3088 | 0 | buf_state = old_buf_state; |
3089 | | |
3090 | | /* increase refcount */ |
3091 | 0 | buf_state += BUF_REFCOUNT_ONE; |
3092 | |
|
3093 | 0 | if (strategy == NULL) |
3094 | 0 | { |
3095 | | /* Default case: increase usagecount unless already max. */ |
3096 | 0 | if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT) |
3097 | 0 | buf_state += BUF_USAGECOUNT_ONE; |
3098 | 0 | } |
3099 | 0 | else |
3100 | 0 | { |
3101 | | /* |
3102 | | * Ring buffers shouldn't evict others from the pool. Thus we |
3103 | | * don't let the usagecount exceed 1. |
3104 | | */ |
3105 | 0 | if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0) |
3106 | 0 | buf_state += BUF_USAGECOUNT_ONE; |
3107 | 0 | } |
3108 | |
|
3109 | 0 | if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, |
3110 | 0 | buf_state)) |
3111 | 0 | { |
3112 | 0 | result = (buf_state & BM_VALID) != 0; |
3113 | | |
3114 | | /* |
3115 | | * Assume that we acquired a buffer pin for the purposes of |
3116 | | * Valgrind buffer client checks (even in !result case) to |
3117 | | * keep things simple. Buffers that are unsafe to access are |
3118 | | * not generally guaranteed to be marked undefined or |
3119 | | * non-accessible in any case. |
3120 | | */ |
3121 | 0 | VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ); |
3122 | 0 | break; |
3123 | 0 | } |
3124 | 0 | } |
3125 | 0 | } |
3126 | 0 | else |
3127 | 0 | { |
3128 | | /* |
3129 | | * If we previously pinned the buffer, it is likely to be valid, but |
3130 | | * it may not be if StartReadBuffers() was called and |
3131 | | * WaitReadBuffers() hasn't been called yet. We'll check by loading |
3132 | | * the flags without locking. This is racy, but it's OK to return |
3133 | | * false spuriously: when WaitReadBuffers() calls StartBufferIO(), |
3134 | | * it'll see that it's now valid. |
3135 | | * |
3136 | | * Note: We deliberately avoid a Valgrind client request here. |
3137 | | * Individual access methods can optionally superimpose buffer page |
3138 | | * client requests on top of our client requests to enforce that |
3139 | | * buffers are only accessed while locked (and pinned). It's possible |
3140 | | * that the buffer page is legitimately non-accessible here. We |
3141 | | * cannot meddle with that. |
3142 | | */ |
3143 | 0 | result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0; |
3144 | 0 | } |
3145 | |
|
3146 | 0 | ref->refcount++; |
3147 | 0 | Assert(ref->refcount > 0); |
3148 | 0 | ResourceOwnerRememberBuffer(CurrentResourceOwner, b); |
3149 | 0 | return result; |
3150 | 0 | } |
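/*
 * Worked example, not part of bufmgr.c: the packed-state arithmetic the CAS
 * loop above relies on, assuming the layout described in buf_internals.h
 * (refcount in the low bits, usage count above it, flag bits on top).
 */
static void
example_pin_state_arithmetic(void)
{
	uint32		state = 0;

	/* two pins plus one usage-count bump, exactly what PinBuffer() adds */
	state += BUF_REFCOUNT_ONE;
	state += BUF_REFCOUNT_ONE;
	if (BUF_STATE_GET_USAGECOUNT(state) < BM_MAX_USAGE_COUNT)
		state += BUF_USAGECOUNT_ONE;

	Assert(BUF_STATE_GET_REFCOUNT(state) == 2);
	Assert(BUF_STATE_GET_USAGECOUNT(state) == 1);
	Assert((state & BM_VALID) == 0);	/* no flag bits were disturbed */
}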
3151 | | |
3152 | | /* |
3153 | | * PinBuffer_Locked -- as above, but caller already locked the buffer header. |
3154 | | * The spinlock is released before return. |
3155 | | * |
3156 | | * As this function is called with the spinlock held, the caller must have |
3157 | | * previously called ReservePrivateRefCountEntry() and |
3158 | | * ResourceOwnerEnlarge(CurrentResourceOwner). |
3159 | | * |
3160 | | * Currently, no callers of this function want to modify the buffer's |
3161 | | * usage_count at all, so there's no need for a strategy parameter. |
3162 | | * Also we don't bother with a BM_VALID test (the caller could check that for |
3163 | | * itself). |
3164 | | * |
3165 | | * Also all callers only ever use this function when it's known that the |
3166 | | * buffer can't have a preexisting pin by this backend. That allows us to skip |
3167 | | * searching the private refcount array & hash, which is a boon, because the |
3168 | | * spinlock is still held. |
3169 | | * |
3170 | | * Note: use of this routine is frequently mandatory, not just an optimization |
3171 | | * to save a spin lock/unlock cycle, because we need to pin a buffer before |
3172 | | * its state can change under us. |
3173 | | */ |
3174 | | static void |
3175 | | PinBuffer_Locked(BufferDesc *buf) |
3176 | 0 | { |
3177 | 0 | Buffer b; |
3178 | 0 | PrivateRefCountEntry *ref; |
3179 | 0 | uint32 buf_state; |
3180 | | |
3181 | | /* |
3182 | | * As explained, we don't expect any preexisting pins. That allows us to |
3183 | | * manipulate the PrivateRefCount after releasing the spinlock. |
3184 | | */ |
3185 | 0 | Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL); |
3186 | | |
3187 | | /* |
3188 | | * Buffer can't have a preexisting pin, so mark its page as defined to |
3189 | | * Valgrind (this is similar to the PinBuffer() case where the backend |
3190 | | * doesn't already have a buffer pin) |
3191 | | */ |
3192 | 0 | VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ); |
3193 | | |
3194 | | /* |
3195 | | * Since we hold the buffer spinlock, we can update the buffer state and |
3196 | | * release the lock in one operation. |
3197 | | */ |
3198 | 0 | buf_state = pg_atomic_read_u32(&buf->state); |
3199 | 0 | Assert(buf_state & BM_LOCKED); |
3200 | 0 | buf_state += BUF_REFCOUNT_ONE; |
3201 | 0 | UnlockBufHdr(buf, buf_state); |
3202 | |
|
3203 | 0 | b = BufferDescriptorGetBuffer(buf); |
3204 | |
|
3205 | 0 | ref = NewPrivateRefCountEntry(b); |
3206 | 0 | ref->refcount++; |
3207 | |
|
3208 | 0 | ResourceOwnerRememberBuffer(CurrentResourceOwner, b); |
3209 | 0 | } |
3210 | | |
3211 | | /* |
3212 | | * Support for waking up another backend that is waiting for the cleanup lock |
3213 | | * to be released using BM_PIN_COUNT_WAITER. |
3214 | | * |
3215 | | * See LockBufferForCleanup(). |
3216 | | * |
3217 | | * Expected to be called just after releasing a buffer pin (in a BufferDesc, |
3218 | | * not just reducing the backend-local pincount for the buffer). |
3219 | | */ |
3220 | | static void |
3221 | | WakePinCountWaiter(BufferDesc *buf) |
3222 | 0 | { |
3223 | | /* |
3224 | | * Acquire the buffer header lock, re-check that there's a waiter. Another |
3225 | | * backend could have unpinned this buffer, and already woken up the |
3226 | | * waiter. |
3227 | | * |
3228 | | * There's no danger of the buffer being replaced after we unpinned it |
3229 | | * above, as it's pinned by the waiter. The waiter removes |
3230 | | * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this |
3231 | | * backend waking it up. |
3232 | | */ |
3233 | 0 | uint32 buf_state = LockBufHdr(buf); |
3234 | |
|
3235 | 0 | if ((buf_state & BM_PIN_COUNT_WAITER) && |
3236 | 0 | BUF_STATE_GET_REFCOUNT(buf_state) == 1) |
3237 | 0 | { |
3238 | | /* we just released the last pin other than the waiter's */ |
3239 | 0 | int wait_backend_pgprocno = buf->wait_backend_pgprocno; |
3240 | |
|
3241 | 0 | buf_state &= ~BM_PIN_COUNT_WAITER; |
3242 | 0 | UnlockBufHdr(buf, buf_state); |
3243 | 0 | ProcSendSignal(wait_backend_pgprocno); |
3244 | 0 | } |
3245 | 0 | else |
3246 | 0 | UnlockBufHdr(buf, buf_state); |
3247 | 0 | } |
3248 | | |
3249 | | /* |
3250 | | * UnpinBuffer -- make buffer available for replacement. |
3251 | | * |
3252 | | * This should be applied only to shared buffers, never local ones. This |
3253 | | * always adjusts CurrentResourceOwner. |
3254 | | */ |
3255 | | static void |
3256 | | UnpinBuffer(BufferDesc *buf) |
3257 | 0 | { |
3258 | 0 | Buffer b = BufferDescriptorGetBuffer(buf); |
3259 | |
|
3260 | 0 | ResourceOwnerForgetBuffer(CurrentResourceOwner, b); |
3261 | 0 | UnpinBufferNoOwner(buf); |
3262 | 0 | } |
3263 | | |
3264 | | static void |
3265 | | UnpinBufferNoOwner(BufferDesc *buf) |
3266 | 0 | { |
3267 | 0 | PrivateRefCountEntry *ref; |
3268 | 0 | Buffer b = BufferDescriptorGetBuffer(buf); |
3269 | |
|
3270 | 0 | Assert(!BufferIsLocal(b)); |
3271 | | |
3272 | | /* not moving as we're likely deleting it soon anyway */ |
3273 | 0 | ref = GetPrivateRefCountEntry(b, false); |
3274 | 0 | Assert(ref != NULL); |
3275 | 0 | Assert(ref->refcount > 0); |
3276 | 0 | ref->refcount--; |
3277 | 0 | if (ref->refcount == 0) |
3278 | 0 | { |
3279 | 0 | uint32 buf_state; |
3280 | 0 | uint32 old_buf_state; |
3281 | | |
3282 | | /* |
3283 | | * Mark buffer non-accessible to Valgrind. |
3284 | | * |
3285 | | * Note that the buffer may have already been marked non-accessible |
3286 | | * within access method code that enforces that buffers are only |
3287 | | * accessed while a buffer lock is held. |
3288 | | */ |
3289 | 0 | VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ); |
3290 | | |
3291 | | /* I'd better not still hold the buffer content lock */ |
3292 | 0 | Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf))); |
3293 | | |
3294 | | /* |
3295 | | * Decrement the shared reference count. |
3296 | | * |
3297 | | * Since a buffer spinlock holder can update the status with a plain write, |
3298 | | * it's not safe to use an atomic decrement here; thus use a CAS loop. |
3299 | | */ |
3300 | 0 | old_buf_state = pg_atomic_read_u32(&buf->state); |
3301 | 0 | for (;;) |
3302 | 0 | { |
3303 | 0 | if (old_buf_state & BM_LOCKED) |
3304 | 0 | old_buf_state = WaitBufHdrUnlocked(buf); |
3305 | |
|
3306 | 0 | buf_state = old_buf_state; |
3307 | |
|
3308 | 0 | buf_state -= BUF_REFCOUNT_ONE; |
3309 | |
|
3310 | 0 | if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, |
3311 | 0 | buf_state)) |
3312 | 0 | break; |
3313 | 0 | } |
3314 | | |
3315 | | /* Support LockBufferForCleanup() */ |
3316 | 0 | if (buf_state & BM_PIN_COUNT_WAITER) |
3317 | 0 | WakePinCountWaiter(buf); |
3318 | |
|
3319 | 0 | ForgetPrivateRefCountEntry(ref); |
3320 | 0 | } |
3321 | 0 | } |
3322 | | |
3323 | 0 | #define ST_SORT sort_checkpoint_bufferids |
3324 | 0 | #define ST_ELEMENT_TYPE CkptSortItem |
3325 | 0 | #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b) |
3326 | | #define ST_SCOPE static |
3327 | | #define ST_DEFINE |
3328 | | #include "lib/sort_template.h" |
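/*
 * For readers unfamiliar with lib/sort_template.h: with the parameters set
 * above, the include expands (approximately -- see sort_template.h for the
 * exact set of generated helpers) into a type-specialized sort routine
 *
 *     static void sort_checkpoint_bufferids(CkptSortItem *begin, size_t n);
 *
 * which BufferSync() invokes below as
 * sort_checkpoint_bufferids(CkptBufferIds, num_to_scan).
 */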
3329 | | |
3330 | | /* |
3331 | | * BufferSync -- Write out all dirty buffers in the pool. |
3332 | | * |
3333 | | * This is called at checkpoint time to write out all dirty shared buffers. |
3334 | | * The checkpoint request flags should be passed in. If CHECKPOINT_FAST is |
3335 | | * set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN, |
3336 | | * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_UNLOGGED is set, we write |
3337 | | * even unlogged buffers, which are otherwise skipped. The remaining flags |
3338 | | * currently have no effect here. |
3339 | | */ |
3340 | | static void |
3341 | | BufferSync(int flags) |
3342 | 0 | { |
3343 | 0 | uint32 buf_state; |
3344 | 0 | int buf_id; |
3345 | 0 | int num_to_scan; |
3346 | 0 | int num_spaces; |
3347 | 0 | int num_processed; |
3348 | 0 | int num_written; |
3349 | 0 | CkptTsStatus *per_ts_stat = NULL; |
3350 | 0 | Oid last_tsid; |
3351 | 0 | binaryheap *ts_heap; |
3352 | 0 | int i; |
3353 | 0 | int mask = BM_DIRTY; |
3354 | 0 | WritebackContext wb_context; |
3355 | | |
3356 | | /* |
3357 | | * Unless this is a shutdown checkpoint or we have been explicitly told, |
3358 | | * we write only permanent, dirty buffers. But at shutdown or end of |
3359 | | * recovery, we write all dirty buffers. |
3360 | | */ |
3361 | 0 | if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | |
3362 | 0 | CHECKPOINT_FLUSH_UNLOGGED)))) |
3363 | 0 | mask |= BM_PERMANENT; |
3364 | | |
3365 | | /* |
3366 | | * Loop over all buffers, and mark the ones that need to be written with |
3367 | | * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we |
3368 | | * can estimate how much work needs to be done. |
3369 | | * |
3370 | | * This allows us to write only those pages that were dirty when the |
3371 | | * checkpoint began, and not those that get dirtied while it proceeds. |
3372 | | * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us |
3373 | | * later in this function, or by normal backends or the bgwriter cleaning |
3374 | | * scan, the flag is cleared. Any buffer dirtied after this point won't |
3375 | | * have the flag set. |
3376 | | * |
3377 | | * Note that if we fail to write some buffer, we may leave buffers with |
3378 | | * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would |
3379 | | * certainly need to be written for the next checkpoint attempt, too. |
3380 | | */ |
3381 | 0 | num_to_scan = 0; |
3382 | 0 | for (buf_id = 0; buf_id < NBuffers; buf_id++) |
3383 | 0 | { |
3384 | 0 | BufferDesc *bufHdr = GetBufferDescriptor(buf_id); |
3385 | | |
3386 | | /* |
3387 | | * Header spinlock is enough to examine BM_DIRTY, see comment in |
3388 | | * SyncOneBuffer. |
3389 | | */ |
3390 | 0 | buf_state = LockBufHdr(bufHdr); |
3391 | |
|
3392 | 0 | if ((buf_state & mask) == mask) |
3393 | 0 | { |
3394 | 0 | CkptSortItem *item; |
3395 | |
|
3396 | 0 | buf_state |= BM_CHECKPOINT_NEEDED; |
3397 | |
|
3398 | 0 | item = &CkptBufferIds[num_to_scan++]; |
3399 | 0 | item->buf_id = buf_id; |
3400 | 0 | item->tsId = bufHdr->tag.spcOid; |
3401 | 0 | item->relNumber = BufTagGetRelNumber(&bufHdr->tag); |
3402 | 0 | item->forkNum = BufTagGetForkNum(&bufHdr->tag); |
3403 | 0 | item->blockNum = bufHdr->tag.blockNum; |
3404 | 0 | } |
3405 | |
|
3406 | 0 | UnlockBufHdr(bufHdr, buf_state); |
3407 | | |
3408 | | /* Check for barrier events in case NBuffers is large. */ |
3409 | 0 | if (ProcSignalBarrierPending) |
3410 | 0 | ProcessProcSignalBarrier(); |
3411 | 0 | } |
3412 | |
|
3413 | 0 | if (num_to_scan == 0) |
3414 | 0 | return; /* nothing to do */ |
3415 | | |
3416 | 0 | WritebackContextInit(&wb_context, &checkpoint_flush_after); |
3417 | |
|
3418 | 0 | TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan); |
3419 | | |
3420 | | /* |
3421 | | * Sort buffers that need to be written to reduce the likelihood of random |
3422 | | * IO. The sorting is also important for the implementation of balancing |
3423 | | * writes between tablespaces. Without balancing writes we'd potentially |
3424 | | * end up writing to the tablespaces one-by-one, possibly overloading the |
3425 | | * underlying system. |
3426 | | */ |
3427 | 0 | sort_checkpoint_bufferids(CkptBufferIds, num_to_scan); |
3428 | |
|
3429 | 0 | num_spaces = 0; |
3430 | | |
3431 | | /* |
3432 | | * Allocate progress status for each tablespace with buffers that need to |
3433 | | * be flushed. This requires the to-be-flushed array to be sorted. |
3434 | | */ |
3435 | 0 | last_tsid = InvalidOid; |
3436 | 0 | for (i = 0; i < num_to_scan; i++) |
3437 | 0 | { |
3438 | 0 | CkptTsStatus *s; |
3439 | 0 | Oid cur_tsid; |
3440 | |
|
3441 | 0 | cur_tsid = CkptBufferIds[i].tsId; |
3442 | | |
3443 | | /* |
3444 | | * Grow array of per-tablespace status structs, every time a new |
3445 | | * tablespace is found. |
3446 | | */ |
3447 | 0 | if (last_tsid == InvalidOid || last_tsid != cur_tsid) |
3448 | 0 | { |
3449 | 0 | Size sz; |
3450 | |
|
3451 | 0 | num_spaces++; |
3452 | | |
3453 | | /* |
3454 | | * Not worth adding grow-by-power-of-2 logic here - even with a |
3455 | | * few hundred tablespaces this should be fine. |
3456 | | */ |
3457 | 0 | sz = sizeof(CkptTsStatus) * num_spaces; |
3458 | |
|
3459 | 0 | if (per_ts_stat == NULL) |
3460 | 0 | per_ts_stat = (CkptTsStatus *) palloc(sz); |
3461 | 0 | else |
3462 | 0 | per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz); |
3463 | |
|
3464 | 0 | s = &per_ts_stat[num_spaces - 1]; |
3465 | 0 | memset(s, 0, sizeof(*s)); |
3466 | 0 | s->tsId = cur_tsid; |
3467 | | |
3468 | | /* |
3469 | | * The first buffer in this tablespace. As CkptBufferIds is sorted |
3470 | | * by tablespace all (s->num_to_scan) buffers in this tablespace |
3471 | | * will follow afterwards. |
3472 | | */ |
3473 | 0 | s->index = i; |
3474 | | |
3475 | | /* |
3476 | | * progress_slice will be determined once we know how many buffers |
3477 | | * are in each tablespace, i.e. after this loop. |
3478 | | */ |
3479 | |
|
3480 | 0 | last_tsid = cur_tsid; |
3481 | 0 | } |
3482 | 0 | else |
3483 | 0 | { |
3484 | 0 | s = &per_ts_stat[num_spaces - 1]; |
3485 | 0 | } |
3486 | |
|
3487 | 0 | s->num_to_scan++; |
3488 | | |
3489 | | /* Check for barrier events. */ |
3490 | 0 | if (ProcSignalBarrierPending) |
3491 | 0 | ProcessProcSignalBarrier(); |
3492 | 0 | } |
3493 | |
|
3494 | 0 | Assert(num_spaces > 0); |
3495 | | |
3496 | | /* |
3497 | | * Build a min-heap over the write-progress in the individual tablespaces, |
3498 | | * and compute how large a portion of the total progress a single |
3499 | | * processed buffer is. |
3500 | | */ |
3501 | 0 | ts_heap = binaryheap_allocate(num_spaces, |
3502 | 0 | ts_ckpt_progress_comparator, |
3503 | 0 | NULL); |
3504 | |
|
3505 | 0 | for (i = 0; i < num_spaces; i++) |
3506 | 0 | { |
3507 | 0 | CkptTsStatus *ts_stat = &per_ts_stat[i]; |
3508 | |
|
3509 | 0 | ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan; |
3510 | |
|
3511 | 0 | binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat)); |
3512 | 0 | } |
3513 | |
|
3514 | 0 | binaryheap_build(ts_heap); |
3515 | | |
3516 | | /* |
3517 | | * Iterate through to-be-checkpointed buffers and write the ones (still) |
3518 | | * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between |
3519 | | * tablespaces; otherwise the sorting would lead to only one tablespace |
3520 | | * receiving writes at a time, making inefficient use of the hardware. |
3521 | | */ |
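/*
 * Worked example of the balancing above (illustrative numbers only): suppose
 * num_to_scan = 10000 dirty buffers, 9000 of them in tablespace A and 1000 in
 * tablespace B.  Then progress_slice is 10000/9000 ~= 1.11 for A and
 * 10000/1000 = 10 for B.  Each processed buffer advances its tablespace's
 * progress by its slice, and the min-heap always hands out the tablespace
 * with the least progress, so roughly nine buffers of A are written for every
 * buffer of B -- both tablespaces finish at about the same time instead of B
 * being flushed only after A is done.
 */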
3522 | 0 | num_processed = 0; |
3523 | 0 | num_written = 0; |
3524 | 0 | while (!binaryheap_empty(ts_heap)) |
3525 | 0 | { |
3526 | 0 | BufferDesc *bufHdr = NULL; |
3527 | 0 | CkptTsStatus *ts_stat = (CkptTsStatus *) |
3528 | 0 | DatumGetPointer(binaryheap_first(ts_heap)); |
3529 | |
|
3530 | 0 | buf_id = CkptBufferIds[ts_stat->index].buf_id; |
3531 | 0 | Assert(buf_id != -1); |
3532 | |
|
3533 | 0 | bufHdr = GetBufferDescriptor(buf_id); |
3534 | |
|
3535 | 0 | num_processed++; |
3536 | | |
3537 | | /* |
3538 | | * We don't need to acquire the lock here, because we're only looking |
3539 | | * at a single bit. It's possible that someone else writes the buffer |
3540 | | * and clears the flag right after we check, but that doesn't matter |
3541 | | * since SyncOneBuffer will then do nothing. However, there is a |
3542 | | * further race condition: it's conceivable that between the time we |
3543 | | * examine the bit here and the time SyncOneBuffer acquires the lock, |
3544 | | * someone else not only wrote the buffer but replaced it with another |
3545 | | * page and dirtied it. In that improbable case, SyncOneBuffer will |
3546 | | * write the buffer though we didn't need to. It doesn't seem worth |
3547 | | * guarding against this, though. |
3548 | | */ |
3549 | 0 | if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED) |
3550 | 0 | { |
3551 | 0 | if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN) |
3552 | 0 | { |
3553 | 0 | TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id); |
3554 | 0 | PendingCheckpointerStats.buffers_written++; |
3555 | 0 | num_written++; |
3556 | 0 | } |
3557 | 0 | } |
3558 | | |
3559 | | /* |
3560 | | * Measure progress independently of actually having to flush the buffer |
3561 | | * - otherwise the writes become unbalanced. |
3562 | | */ |
3563 | 0 | ts_stat->progress += ts_stat->progress_slice; |
3564 | 0 | ts_stat->num_scanned++; |
3565 | 0 | ts_stat->index++; |
3566 | | |
3567 | | /* Have all the buffers from the tablespace been processed? */ |
3568 | 0 | if (ts_stat->num_scanned == ts_stat->num_to_scan) |
3569 | 0 | { |
3570 | 0 | binaryheap_remove_first(ts_heap); |
3571 | 0 | } |
3572 | 0 | else |
3573 | 0 | { |
3574 | | /* update heap with the new progress */ |
3575 | 0 | binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat)); |
3576 | 0 | } |
3577 | | |
3578 | | /* |
3579 | | * Sleep to throttle our I/O rate. |
3580 | | * |
3581 | | * (This will check for barrier events even if it doesn't sleep.) |
3582 | | */ |
3583 | 0 | CheckpointWriteDelay(flags, (double) num_processed / num_to_scan); |
3584 | 0 | } |
3585 | | |
3586 | | /* |
3587 | | * Issue all pending flushes. Only checkpointer calls BufferSync(), so |
3588 | | * IOContext will always be IOCONTEXT_NORMAL. |
3589 | | */ |
3590 | 0 | IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL); |
3591 | |
|
3592 | 0 | pfree(per_ts_stat); |
3593 | 0 | per_ts_stat = NULL; |
3594 | 0 | binaryheap_free(ts_heap); |
3595 | | |
3596 | | /* |
3597 | | * Update checkpoint statistics. As noted above, this doesn't include |
3598 | | * buffers written by other backends or bgwriter scan. |
3599 | | */ |
3600 | 0 | CheckpointStats.ckpt_bufs_written += num_written; |
3601 | |
|
3602 | 0 | TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan); |
3603 | 0 | } |
3604 | | |
3605 | | /* |
3606 | | * BgBufferSync -- Write out some dirty buffers in the pool. |
3607 | | * |
3608 | | * This is called periodically by the background writer process. |
3609 | | * |
3610 | | * Returns true if it's appropriate for the bgwriter process to go into |
3611 | | * low-power hibernation mode. (This happens if the strategy clock sweep |
3612 | | * has been "lapped" and no buffer allocations have occurred recently, |
3613 | | * or if the bgwriter has been effectively disabled by setting |
3614 | | * bgwriter_lru_maxpages to 0.) |
3615 | | */ |
3616 | | bool |
3617 | | BgBufferSync(WritebackContext *wb_context) |
3618 | 0 | { |
3619 | | /* info obtained from freelist.c */ |
3620 | 0 | int strategy_buf_id; |
3621 | 0 | uint32 strategy_passes; |
3622 | 0 | uint32 recent_alloc; |
3623 | | |
3624 | | /* |
3625 | | * Information saved between calls so we can determine the strategy |
3626 | | * point's advance rate and avoid scanning already-cleaned buffers. |
3627 | | */ |
3628 | 0 | static bool saved_info_valid = false; |
3629 | 0 | static int prev_strategy_buf_id; |
3630 | 0 | static uint32 prev_strategy_passes; |
3631 | 0 | static int next_to_clean; |
3632 | 0 | static uint32 next_passes; |
3633 | | |
3634 | | /* Moving averages of allocation rate and clean-buffer density */ |
3635 | 0 | static float smoothed_alloc = 0; |
3636 | 0 | static float smoothed_density = 10.0; |
3637 | | |
3638 | | /* Potentially these could be tunables, but for now, not */ |
3639 | 0 | float smoothing_samples = 16; |
3640 | 0 | float scan_whole_pool_milliseconds = 120000.0; |
3641 | | |
3642 | | /* Used to compute how far we scan ahead */ |
3643 | 0 | long strategy_delta; |
3644 | 0 | int bufs_to_lap; |
3645 | 0 | int bufs_ahead; |
3646 | 0 | float scans_per_alloc; |
3647 | 0 | int reusable_buffers_est; |
3648 | 0 | int upcoming_alloc_est; |
3649 | 0 | int min_scan_buffers; |
3650 | | |
3651 | | /* Variables for the scanning loop proper */ |
3652 | 0 | int num_to_scan; |
3653 | 0 | int num_written; |
3654 | 0 | int reusable_buffers; |
3655 | | |
3656 | | /* Variables for final smoothed_density update */ |
3657 | 0 | long new_strategy_delta; |
3658 | 0 | uint32 new_recent_alloc; |
3659 | | |
3660 | | /* |
3661 | | * Find out where the freelist clock sweep currently is, and how many |
3662 | | * buffer allocations have happened since our last call. |
3663 | | */ |
3664 | 0 | strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc); |
3665 | | |
3666 | | /* Report buffer alloc counts to pgstat */ |
3667 | 0 | PendingBgWriterStats.buf_alloc += recent_alloc; |
3668 | | |
3669 | | /* |
3670 | | * If we're not running the LRU scan, just stop after doing the stats |
3671 | | * stuff. We mark the saved state invalid so that we can recover sanely |
3672 | | * if LRU scan is turned back on later. |
3673 | | */ |
3674 | 0 | if (bgwriter_lru_maxpages <= 0) |
3675 | 0 | { |
3676 | 0 | saved_info_valid = false; |
3677 | 0 | return true; |
3678 | 0 | } |
3679 | | |
3680 | | /* |
3681 | | * Compute strategy_delta = how many buffers have been scanned by the |
3682 | | * clock sweep since last time. If first time through, assume none. Then |
3683 | | * see if we are still ahead of the clock sweep, and if so, how many |
3684 | | * buffers we could scan before we'd catch up with it and "lap" it. Note: |
3685 | | * the weird-looking coding of the xxx_passes comparisons is to avoid bogus |
3686 | | * behavior when the pass counts wrap around. |
3687 | | */ |
3688 | 0 | if (saved_info_valid) |
3689 | 0 | { |
3690 | 0 | int32 passes_delta = strategy_passes - prev_strategy_passes; |
3691 | |
|
3692 | 0 | strategy_delta = strategy_buf_id - prev_strategy_buf_id; |
3693 | 0 | strategy_delta += (long) passes_delta * NBuffers; |
3694 | |
|
3695 | 0 | Assert(strategy_delta >= 0); |
3696 | |
|
3697 | 0 | if ((int32) (next_passes - strategy_passes) > 0) |
3698 | 0 | { |
3699 | | /* we're one pass ahead of the strategy point */ |
3700 | 0 | bufs_to_lap = strategy_buf_id - next_to_clean; |
3701 | | #ifdef BGW_DEBUG |
3702 | | elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d", |
3703 | | next_passes, next_to_clean, |
3704 | | strategy_passes, strategy_buf_id, |
3705 | | strategy_delta, bufs_to_lap); |
3706 | | #endif |
3707 | 0 | } |
3708 | 0 | else if (next_passes == strategy_passes && |
3709 | 0 | next_to_clean >= strategy_buf_id) |
3710 | 0 | { |
3711 | | /* on same pass, but ahead or at least not behind */ |
3712 | 0 | bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id); |
3713 | | #ifdef BGW_DEBUG |
3714 | | elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d", |
3715 | | next_passes, next_to_clean, |
3716 | | strategy_passes, strategy_buf_id, |
3717 | | strategy_delta, bufs_to_lap); |
3718 | | #endif |
3719 | 0 | } |
3720 | 0 | else |
3721 | 0 | { |
3722 | | /* |
3723 | | * We're behind, so skip forward to the strategy point and start |
3724 | | * cleaning from there. |
3725 | | */ |
3726 | | #ifdef BGW_DEBUG |
3727 | | elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld", |
3728 | | next_passes, next_to_clean, |
3729 | | strategy_passes, strategy_buf_id, |
3730 | | strategy_delta); |
3731 | | #endif |
3732 | 0 | next_to_clean = strategy_buf_id; |
3733 | 0 | next_passes = strategy_passes; |
3734 | 0 | bufs_to_lap = NBuffers; |
3735 | 0 | } |
3736 | 0 | } |
3737 | 0 | else |
3738 | 0 | { |
3739 | | /* |
3740 | | * Initializing at startup or after LRU scanning had been off. Always |
3741 | | * start at the strategy point. |
3742 | | */ |
3743 | | #ifdef BGW_DEBUG |
3744 | | elog(DEBUG2, "bgwriter initializing: strategy %u-%u", |
3745 | | strategy_passes, strategy_buf_id); |
3746 | | #endif |
3747 | 0 | strategy_delta = 0; |
3748 | 0 | next_to_clean = strategy_buf_id; |
3749 | 0 | next_passes = strategy_passes; |
3750 | 0 | bufs_to_lap = NBuffers; |
3751 | 0 | } |
3752 | | |
3753 | | /* Update saved info for next time */ |
3754 | 0 | prev_strategy_buf_id = strategy_buf_id; |
3755 | 0 | prev_strategy_passes = strategy_passes; |
3756 | 0 | saved_info_valid = true; |
3757 | | |
3758 | | /* |
3759 | | * Compute how many buffers had to be scanned for each new allocation, ie, |
3760 | | * 1/density of reusable buffers, and track a moving average of that. |
3761 | | * |
3762 | | * If the strategy point didn't move, we don't update the density estimate. |
3763 | | */ |
3764 | 0 | if (strategy_delta > 0 && recent_alloc > 0) |
3765 | 0 | { |
3766 | 0 | scans_per_alloc = (float) strategy_delta / (float) recent_alloc; |
3767 | 0 | smoothed_density += (scans_per_alloc - smoothed_density) / |
3768 | 0 | smoothing_samples; |
3769 | 0 | } |
3770 | | |
3771 | | /* |
3772 | | * Estimate how many reusable buffers there are between the current |
3773 | | * strategy point and where we've scanned ahead to, based on the smoothed |
3774 | | * density estimate. |
3775 | | */ |
3776 | 0 | bufs_ahead = NBuffers - bufs_to_lap; |
3777 | 0 | reusable_buffers_est = (float) bufs_ahead / smoothed_density; |
3778 | | |
3779 | | /* |
3780 | | * Track a moving average of recent buffer allocations. Here, rather than |
3781 | | * a true average we want a fast-attack, slow-decline behavior: we |
3782 | | * immediately follow any increase. |
3783 | | */ |
3784 | 0 | if (smoothed_alloc <= (float) recent_alloc) |
3785 | 0 | smoothed_alloc = recent_alloc; |
3786 | 0 | else |
3787 | 0 | smoothed_alloc += ((float) recent_alloc - smoothed_alloc) / |
3788 | 0 | smoothing_samples; |
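/*
 * Worked example (illustrative numbers only): with smoothing_samples = 16 and
 * a current smoothed_alloc of 100, a burst of recent_alloc = 250 snaps the
 * average straight up to 250 ("fast attack"), while a later quiet cycle with
 * recent_alloc = 20 only eases it down to 250 + (20 - 250)/16 ~= 235.6
 * ("slow decline").
 */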
3789 | | |
3790 | | /* Scale the estimate by a GUC to allow more aggressive tuning. */ |
3791 | 0 | upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier); |
3792 | | |
3793 | | /* |
3794 | | * If recent_alloc remains at zero for many cycles, smoothed_alloc will |
3795 | | * eventually underflow to zero, and the underflows produce annoying |
3796 | | * kernel warnings on some platforms. Once upcoming_alloc_est has gone to |
3797 | | * zero, there's no point in tracking smaller and smaller values of |
3798 | | * smoothed_alloc, so just reset it to exactly zero to avoid this |
3799 | | * syndrome. It will pop back up as soon as recent_alloc increases. |
3800 | | */ |
3801 | 0 | if (upcoming_alloc_est == 0) |
3802 | 0 | smoothed_alloc = 0; |
3803 | | |
3804 | | /* |
3805 | | * Even in cases where there's been little or no buffer allocation |
3806 | | * activity, we want to make a small amount of progress through the buffer |
3807 | | * cache so that as many reusable buffers as possible are clean after an |
3808 | | * idle period. |
3809 | | * |
3810 | | * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times |
3811 | | * the BGW will be called during the scan_whole_pool time; slice the |
3812 | | * buffer pool into that many sections. |
3813 | | */ |
3814 | 0 | min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay)); |
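/*
 * Worked example (assumed, typical settings): with shared_buffers = 128MB
 * (NBuffers = 16384) and the default bgwriter_delay of 200ms, the pool is
 * sliced into 120000/200 = 600 rounds, so min_scan_buffers comes out to
 * 16384/600 ~= 27 buffers per round even when nothing is being allocated.
 */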
3815 | |
|
3816 | 0 | if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est)) |
3817 | 0 | { |
3818 | | #ifdef BGW_DEBUG |
3819 | | elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d", |
3820 | | upcoming_alloc_est, min_scan_buffers, reusable_buffers_est); |
3821 | | #endif |
3822 | 0 | upcoming_alloc_est = min_scan_buffers + reusable_buffers_est; |
3823 | 0 | } |
3824 | | |
3825 | | /* |
3826 | | * Now write out dirty reusable buffers, working forward from the |
3827 | | * next_to_clean point, until we have lapped the strategy scan, or cleaned |
3828 | | * enough buffers to match our estimate of the next cycle's allocation |
3829 | | * requirements, or hit the bgwriter_lru_maxpages limit. |
3830 | | */ |
3831 | |
|
3832 | 0 | num_to_scan = bufs_to_lap; |
3833 | 0 | num_written = 0; |
3834 | 0 | reusable_buffers = reusable_buffers_est; |
3835 | | |
3836 | | /* Execute the LRU scan */ |
3837 | 0 | while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est) |
3838 | 0 | { |
3839 | 0 | int sync_state = SyncOneBuffer(next_to_clean, true, |
3840 | 0 | wb_context); |
3841 | |
|
3842 | 0 | if (++next_to_clean >= NBuffers) |
3843 | 0 | { |
3844 | 0 | next_to_clean = 0; |
3845 | 0 | next_passes++; |
3846 | 0 | } |
3847 | 0 | num_to_scan--; |
3848 | |
|
3849 | 0 | if (sync_state & BUF_WRITTEN) |
3850 | 0 | { |
3851 | 0 | reusable_buffers++; |
3852 | 0 | if (++num_written >= bgwriter_lru_maxpages) |
3853 | 0 | { |
3854 | 0 | PendingBgWriterStats.maxwritten_clean++; |
3855 | 0 | break; |
3856 | 0 | } |
3857 | 0 | } |
3858 | 0 | else if (sync_state & BUF_REUSABLE) |
3859 | 0 | reusable_buffers++; |
3860 | 0 | } |
3861 | |
|
3862 | 0 | PendingBgWriterStats.buf_written_clean += num_written; |
3863 | |
|
3864 | | #ifdef BGW_DEBUG |
3865 | | elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d", |
3866 | | recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead, |
3867 | | smoothed_density, reusable_buffers_est, upcoming_alloc_est, |
3868 | | bufs_to_lap - num_to_scan, |
3869 | | num_written, |
3870 | | reusable_buffers - reusable_buffers_est); |
3871 | | #endif |
3872 | | |
3873 | | /* |
3874 | | * Consider the above scan as being like a new allocation scan. |
3875 | | * Characterize its density and update the smoothed one based on it. This |
3876 | | * effectively halves the moving average period in cases where both the |
3877 | | * strategy and the background writer are doing some useful scanning, |
3878 | | * which is helpful because a long memory isn't as desirable for the |
3879 | | * density estimates. |
3880 | | */ |
3881 | 0 | new_strategy_delta = bufs_to_lap - num_to_scan; |
3882 | 0 | new_recent_alloc = reusable_buffers - reusable_buffers_est; |
3883 | 0 | if (new_strategy_delta > 0 && new_recent_alloc > 0) |
3884 | 0 | { |
3885 | 0 | scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc; |
3886 | 0 | smoothed_density += (scans_per_alloc - smoothed_density) / |
3887 | 0 | smoothing_samples; |
3888 | |
|
3889 | | #ifdef BGW_DEBUG |
3890 | | elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f", |
3891 | | new_recent_alloc, new_strategy_delta, |
3892 | | scans_per_alloc, smoothed_density); |
3893 | | #endif |
3894 | 0 | } |
3895 | | |
3896 | | /* Return true if OK to hibernate */ |
3897 | 0 | return (bufs_to_lap == 0 && recent_alloc == 0); |
3898 | 0 | } |
3899 | | |
3900 | | /* |
3901 | | * SyncOneBuffer -- process a single buffer during syncing. |
3902 | | * |
3903 | | * If skip_recently_used is true, we don't write currently-pinned buffers, nor |
3904 | | * buffers marked recently used, as these are not replacement candidates. |
3905 | | * |
3906 | | * Returns a bitmask containing the following flag bits: |
3907 | | * BUF_WRITTEN: we wrote the buffer. |
3908 | | * BUF_REUSABLE: buffer is available for replacement, ie, it has |
3909 | | * pin count 0 and usage count 0. |
3910 | | * |
3911 | | * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean |
3912 | | * after locking it, but we don't care all that much.) |
3913 | | */ |
3914 | | static int |
3915 | | SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) |
3916 | 0 | { |
3917 | 0 | BufferDesc *bufHdr = GetBufferDescriptor(buf_id); |
3918 | 0 | int result = 0; |
3919 | 0 | uint32 buf_state; |
3920 | 0 | BufferTag tag; |
3921 | | |
3922 | | /* Make sure we can handle the pin */ |
3923 | 0 | ReservePrivateRefCountEntry(); |
3924 | 0 | ResourceOwnerEnlarge(CurrentResourceOwner); |
3925 | | |
3926 | | /* |
3927 | | * Check whether buffer needs writing. |
3928 | | * |
3929 | | * We can make this check without taking the buffer content lock so long |
3930 | | * as we mark pages dirty in access methods *before* logging changes with |
3931 | | * XLogInsert(): if someone marks the buffer dirty just after our check, we |
3932 | | * don't worry, because our checkpoint.redo points before the log record for |
3933 | | * the upcoming changes and so we are not required to write such a dirty buffer. |
3934 | | */ |
3935 | 0 | buf_state = LockBufHdr(bufHdr); |
3936 | |
|
3937 | 0 | if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 && |
3938 | 0 | BUF_STATE_GET_USAGECOUNT(buf_state) == 0) |
3939 | 0 | { |
3940 | 0 | result |= BUF_REUSABLE; |
3941 | 0 | } |
3942 | 0 | else if (skip_recently_used) |
3943 | 0 | { |
3944 | | /* Caller told us not to write recently-used buffers */ |
3945 | 0 | UnlockBufHdr(bufHdr, buf_state); |
3946 | 0 | return result; |
3947 | 0 | } |
3948 | | |
3949 | 0 | if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY)) |
3950 | 0 | { |
3951 | | /* It's clean, so nothing to do */ |
3952 | 0 | UnlockBufHdr(bufHdr, buf_state); |
3953 | 0 | return result; |
3954 | 0 | } |
3955 | | |
3956 | | /* |
3957 | | * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the |
3958 | | * buffer is clean by the time we've locked it.) |
3959 | | */ |
3960 | 0 | PinBuffer_Locked(bufHdr); |
3961 | 0 | LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); |
3962 | |
|
3963 | 0 | FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL); |
3964 | |
|
3965 | 0 | LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); |
3966 | |
|
3967 | 0 | tag = bufHdr->tag; |
3968 | |
|
3969 | 0 | UnpinBuffer(bufHdr); |
3970 | | |
3971 | | /* |
3972 | | * SyncOneBuffer() is only called by checkpointer and bgwriter, so |
3973 | | * IOContext will always be IOCONTEXT_NORMAL. |
3974 | | */ |
3975 | 0 | ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag); |
3976 | |
|
3977 | 0 | return result | BUF_WRITTEN; |
3978 | 0 | } |
3979 | | |
3980 | | /* |
3981 | | * AtEOXact_Buffers - clean up at end of transaction. |
3982 | | * |
3983 | | * As of PostgreSQL 8.0, buffer pins should get released by the |
3984 | | * ResourceOwner mechanism. This routine is just a debugging |
3985 | | * cross-check that no pins remain. |
3986 | | */ |
3987 | | void |
3988 | | AtEOXact_Buffers(bool isCommit) |
3989 | 0 | { |
3990 | 0 | CheckForBufferLeaks(); |
3991 | |
|
3992 | 0 | AtEOXact_LocalBuffers(isCommit); |
3993 | |
|
3994 | 0 | Assert(PrivateRefCountOverflowed == 0); |
3995 | 0 | } |
3996 | | |
3997 | | /* |
3998 | | * Initialize access to shared buffer pool |
3999 | | * |
4000 | | * This is called during backend startup (whether standalone or under the |
4001 | | * postmaster). It sets up for this backend's access to the already-existing |
4002 | | * buffer pool. |
4003 | | */ |
4004 | | void |
4005 | | InitBufferManagerAccess(void) |
4006 | 0 | { |
4007 | 0 | HASHCTL hash_ctl; |
4008 | | |
4009 | | /* |
4010 | | * An advisory limit on the number of pins each backend should hold, based |
4011 | | * on shared_buffers and the maximum number of connections possible. |
4012 | | * That's very pessimistic, but outside toy-sized shared_buffers it should |
4013 | | * allow plenty of pins. LimitAdditionalPins() and |
4014 | | * GetAdditionalPinLimit() can be used to check the remaining balance. |
4015 | | */ |
4016 | 0 | MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS); |
4017 | |
|
4018 | 0 | memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray)); |
4019 | |
|
4020 | 0 | hash_ctl.keysize = sizeof(int32); |
4021 | 0 | hash_ctl.entrysize = sizeof(PrivateRefCountEntry); |
4022 | |
|
4023 | 0 | PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl, |
4024 | 0 | HASH_ELEM | HASH_BLOBS); |
4025 | | |
4026 | | /* |
4027 | | * AtProcExit_Buffers needs LWLock access, and thereby has to be called at |
4028 | | * the corresponding phase of backend shutdown. |
4029 | | */ |
4030 | 0 | Assert(MyProc != NULL); |
4031 | 0 | on_shmem_exit(AtProcExit_Buffers, 0); |
4032 | 0 | } |
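/*
 * Worked example (illustrative values only): with shared_buffers = 128MB
 * (NBuffers = 16384) and roughly 128 possible backends plus auxiliary
 * processes, MaxProportionalPins comes out to 16384/128 = 128 pins per
 * backend -- pessimistic, as the comment in InitBufferManagerAccess() notes,
 * but far more than a normal backend ever holds at once.
 */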
4033 | | |
4034 | | /* |
4035 | | * During backend exit, ensure that we released all shared-buffer locks and |
4036 | | * assert that we have no remaining pins. |
4037 | | */ |
4038 | | static void |
4039 | | AtProcExit_Buffers(int code, Datum arg) |
4040 | 0 | { |
4041 | 0 | UnlockBuffers(); |
4042 | |
|
4043 | 0 | CheckForBufferLeaks(); |
4044 | | |
4045 | | /* localbuf.c needs a chance too */ |
4046 | 0 | AtProcExit_LocalBuffers(); |
4047 | 0 | } |
4048 | | |
4049 | | /* |
4050 | | * CheckForBufferLeaks - ensure this backend holds no buffer pins |
4051 | | * |
4052 | | * As of PostgreSQL 8.0, buffer pins should get released by the |
4053 | | * ResourceOwner mechanism. This routine is just a debugging |
4054 | | * cross-check that no pins remain. |
4055 | | */ |
4056 | | static void |
4057 | | CheckForBufferLeaks(void) |
4058 | 0 | { |
4059 | | #ifdef USE_ASSERT_CHECKING |
4060 | | int RefCountErrors = 0; |
4061 | | PrivateRefCountEntry *res; |
4062 | | int i; |
4063 | | char *s; |
4064 | | |
4065 | | /* check the array */ |
4066 | | for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) |
4067 | | { |
4068 | | res = &PrivateRefCountArray[i]; |
4069 | | |
4070 | | if (res->buffer != InvalidBuffer) |
4071 | | { |
4072 | | s = DebugPrintBufferRefcount(res->buffer); |
4073 | | elog(WARNING, "buffer refcount leak: %s", s); |
4074 | | pfree(s); |
4075 | | |
4076 | | RefCountErrors++; |
4077 | | } |
4078 | | } |
4079 | | |
4080 | | /* if necessary search the hash */ |
4081 | | if (PrivateRefCountOverflowed) |
4082 | | { |
4083 | | HASH_SEQ_STATUS hstat; |
4084 | | |
4085 | | hash_seq_init(&hstat, PrivateRefCountHash); |
4086 | | while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL) |
4087 | | { |
4088 | | s = DebugPrintBufferRefcount(res->buffer); |
4089 | | elog(WARNING, "buffer refcount leak: %s", s); |
4090 | | pfree(s); |
4091 | | RefCountErrors++; |
4092 | | } |
4093 | | } |
4094 | | |
4095 | | Assert(RefCountErrors == 0); |
4096 | | #endif |
4097 | 0 | } |
4098 | | |
4099 | | #ifdef USE_ASSERT_CHECKING |
4100 | | /* |
4101 | | * Check for exclusive-locked catalog buffers. This is the core of |
4102 | | * AssertCouldGetRelation(). |
4103 | | * |
4104 | | * A backend would self-deadlock on LWLocks if the catalog scan read the |
4105 | | * exclusive-locked buffer. The main threat is exclusive-locked buffers of |
4106 | | * catalogs used in relcache, because a catcache search on any catalog may |
4107 | | * build that catalog's relcache entry. We don't have an inventory of |
4108 | | * catalogs relcache uses, so just check buffers of most catalogs. |
4109 | | * |
4110 | | * It's better to minimize waits while holding an exclusive buffer lock, so it |
4111 | | * would be nice to broaden this check not to be catalog-specific. However, |
4112 | | * bttextcmp() accesses pg_collation, and non-core opclasses might similarly |
4113 | | * read tables. That is deadlock-free as long as there's no loop in the |
4114 | | * dependency graph: modifying table A may cause an opclass to read table B, |
4115 | | * but it must not cause a read of table A. |
4116 | | */ |
4117 | | void |
4118 | | AssertBufferLocksPermitCatalogRead(void) |
4119 | | { |
4120 | | ForEachLWLockHeldByMe(AssertNotCatalogBufferLock, NULL); |
4121 | | } |
4122 | | |
4123 | | static void |
4124 | | AssertNotCatalogBufferLock(LWLock *lock, LWLockMode mode, |
4125 | | void *unused_context) |
4126 | | { |
4127 | | BufferDesc *bufHdr; |
4128 | | BufferTag tag; |
4129 | | Oid relid; |
4130 | | |
4131 | | if (mode != LW_EXCLUSIVE) |
4132 | | return; |
4133 | | |
4134 | | if (!((BufferDescPadded *) lock > BufferDescriptors && |
4135 | | (BufferDescPadded *) lock < BufferDescriptors + NBuffers)) |
4136 | | return; /* not a buffer lock */ |
4137 | | |
4138 | | bufHdr = (BufferDesc *) |
4139 | | ((char *) lock - offsetof(BufferDesc, content_lock)); |
4140 | | tag = bufHdr->tag; |
4141 | | |
4142 | | /* |
4143 | | * This relNumber==relid assumption holds until a catalog experiences |
4144 | | * VACUUM FULL or similar. After a command like that, relNumber will be |
4145 | | * in the normal (non-catalog) range, and we lose the ability to detect |
4146 | | * hazardous access to that catalog. Calling RelidByRelfilenumber() would |
4147 | | * close that gap, but RelidByRelfilenumber() might then deadlock with a |
4148 | | * held lock. |
4149 | | */ |
4150 | | relid = tag.relNumber; |
4151 | | |
4152 | | if (IsCatalogTextUniqueIndexOid(relid)) /* see comments at the callee */ |
4153 | | return; |
4154 | | |
4155 | | Assert(!IsCatalogRelationOid(relid)); |
4156 | | } |
4157 | | #endif |
4158 | | |
4159 | | |
4160 | | /* |
4161 | | * Helper routine to issue warnings when a buffer is unexpectedly pinned |
4162 | | */ |
4163 | | char * |
4164 | | DebugPrintBufferRefcount(Buffer buffer) |
4165 | 0 | { |
4166 | 0 | BufferDesc *buf; |
4167 | 0 | int32 loccount; |
4168 | 0 | char *result; |
4169 | 0 | ProcNumber backend; |
4170 | 0 | uint32 buf_state; |
4171 | |
|
4172 | 0 | Assert(BufferIsValid(buffer)); |
4173 | 0 | if (BufferIsLocal(buffer)) |
4174 | 0 | { |
4175 | 0 | buf = GetLocalBufferDescriptor(-buffer - 1); |
4176 | 0 | loccount = LocalRefCount[-buffer - 1]; |
4177 | 0 | backend = MyProcNumber; |
4178 | 0 | } |
4179 | 0 | else |
4180 | 0 | { |
4181 | 0 | buf = GetBufferDescriptor(buffer - 1); |
4182 | 0 | loccount = GetPrivateRefCount(buffer); |
4183 | 0 | backend = INVALID_PROC_NUMBER; |
4184 | 0 | } |
4185 | | |
4186 | | /* theoretically we should lock the bufhdr here */ |
4187 | 0 | buf_state = pg_atomic_read_u32(&buf->state); |
4188 | |
|
4189 | 0 | result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)", |
4190 | 0 | buffer, |
4191 | 0 | relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend, |
4192 | 0 | BufTagGetForkNum(&buf->tag)).str, |
4193 | 0 | buf->tag.blockNum, buf_state & BUF_FLAG_MASK, |
4194 | 0 | BUF_STATE_GET_REFCOUNT(buf_state), loccount); |
4195 | 0 | return result; |
4196 | 0 | } |
4197 | | |
4198 | | /* |
4199 | | * CheckPointBuffers |
4200 | | * |
4201 | | * Flush all dirty blocks in buffer pool to disk at checkpoint time. |
4202 | | * |
4203 | | * Note: temporary relations do not participate in checkpoints, so they don't |
4204 | | * need to be flushed. |
4205 | | */ |
4206 | | void |
4207 | | CheckPointBuffers(int flags) |
4208 | 0 | { |
4209 | 0 | BufferSync(flags); |
4210 | 0 | } |
4211 | | |
4212 | | /* |
4213 | | * BufferGetBlockNumber |
4214 | | * Returns the block number associated with a buffer. |
4215 | | * |
4216 | | * Note: |
4217 | | * Assumes that the buffer is valid and pinned, else the |
4218 | | * value may be obsolete immediately... |
4219 | | */ |
4220 | | BlockNumber |
4221 | | BufferGetBlockNumber(Buffer buffer) |
4222 | 0 | { |
4223 | 0 | BufferDesc *bufHdr; |
4224 | |
|
4225 | 0 | Assert(BufferIsPinned(buffer)); |
4226 | |
|
4227 | 0 | if (BufferIsLocal(buffer)) |
4228 | 0 | bufHdr = GetLocalBufferDescriptor(-buffer - 1); |
4229 | 0 | else |
4230 | 0 | bufHdr = GetBufferDescriptor(buffer - 1); |
4231 | | |
4232 | | /* pinned, so OK to read tag without spinlock */ |
4233 | 0 | return bufHdr->tag.blockNum; |
4234 | 0 | } |
4235 | | |
4236 | | /* |
4237 | | * BufferGetTag |
4238 | | * Returns the relfilelocator, fork number and block number associated with |
4239 | | * a buffer. |
4240 | | */ |
4241 | | void |
4242 | | BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, |
4243 | | BlockNumber *blknum) |
4244 | 0 | { |
4245 | 0 | BufferDesc *bufHdr; |
4246 | | |
4247 | | /* Do the same checks as BufferGetBlockNumber. */ |
4248 | 0 | Assert(BufferIsPinned(buffer)); |
4249 | |
|
4250 | 0 | if (BufferIsLocal(buffer)) |
4251 | 0 | bufHdr = GetLocalBufferDescriptor(-buffer - 1); |
4252 | 0 | else |
4253 | 0 | bufHdr = GetBufferDescriptor(buffer - 1); |
4254 | | |
4255 | | /* pinned, so OK to read tag without spinlock */ |
4256 | 0 | *rlocator = BufTagGetRelFileLocator(&bufHdr->tag); |
4257 | 0 | *forknum = BufTagGetForkNum(&bufHdr->tag); |
4258 | 0 | *blknum = bufHdr->tag.blockNum; |
4259 | 0 | } |
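 | |
 | | /* |
 | |  * Illustrative sketch added in editing (not part of bufmgr.c): a caller that |
 | |  * wants to report which page a pinned buffer holds can decompose the tag as |
 | |  * below.  The function name is hypothetical; 'buffer' must be pinned. |
 | |  */ |
 | | static void |
 | | log_buffer_identity(Buffer buffer) |
 | | { |
 | |     RelFileLocator rlocator; |
 | |     ForkNumber  forknum; |
 | |     BlockNumber blkno; |
 | |
 | |     BufferGetTag(buffer, &rlocator, &forknum, &blkno); |
 | |     elog(DEBUG1, "buffer %d holds block %u of fork %d of relfilenumber %u", |
 | |          buffer, blkno, forknum, rlocator.relNumber); |
 | | } |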
4260 | | |
4261 | | /* |
4262 | | * FlushBuffer |
4263 | | * Physically write out a shared buffer. |
4264 | | * |
4265 | | * NOTE: this actually just passes the buffer contents to the kernel; the |
4266 | | * real write to disk won't happen until the kernel feels like it. This |
4267 | | * is okay from our point of view since we can redo the changes from WAL. |
4268 | | * However, we will need to force the changes to disk via fsync before |
4269 | | * we can checkpoint WAL. |
4270 | | * |
4271 | | * The caller must hold a pin on the buffer and have share-locked the |
4272 | | * buffer contents. (Note: a share-lock does not prevent updates of |
4273 | | * hint bits in the buffer, so the page could change while the write |
4274 | | * is in progress, but we assume that that will not invalidate the data |
4275 | | * written.) |
4276 | | * |
4277 | | * If the caller has an smgr reference for the buffer's relation, pass it |
4278 | | * as the second parameter. If not, pass NULL. |
4279 | | */ |
4280 | | static void |
4281 | | FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, |
4282 | | IOContext io_context) |
4283 | 0 | { |
4284 | 0 | XLogRecPtr recptr; |
4285 | 0 | ErrorContextCallback errcallback; |
4286 | 0 | instr_time io_start; |
4287 | 0 | Block bufBlock; |
4288 | 0 | char *bufToWrite; |
4289 | 0 | uint32 buf_state; |
4290 | | |
4291 | | /* |
4292 | | * Try to start an I/O operation. If StartBufferIO returns false, then |
4293 | | * someone else flushed the buffer before we could, so we need not do |
4294 | | * anything. |
4295 | | */ |
4296 | 0 | if (!StartBufferIO(buf, false, false)) |
4297 | 0 | return; |
4298 | | |
4299 | | /* Setup error traceback support for ereport() */ |
4300 | 0 | errcallback.callback = shared_buffer_write_error_callback; |
4301 | 0 | errcallback.arg = buf; |
4302 | 0 | errcallback.previous = error_context_stack; |
4303 | 0 | error_context_stack = &errcallback; |
4304 | | |
4305 | | /* Find smgr relation for buffer */ |
4306 | 0 | if (reln == NULL) |
4307 | 0 | reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER); |
4308 | |
|
4309 | 0 | TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag), |
4310 | 0 | buf->tag.blockNum, |
4311 | 0 | reln->smgr_rlocator.locator.spcOid, |
4312 | 0 | reln->smgr_rlocator.locator.dbOid, |
4313 | 0 | reln->smgr_rlocator.locator.relNumber); |
4314 | |
|
4315 | 0 | buf_state = LockBufHdr(buf); |
4316 | | |
4317 | | /* |
4318 | | * Run PageGetLSN while holding header lock, since we don't have the |
4319 | | * buffer locked exclusively in all cases. |
4320 | | */ |
4321 | 0 | recptr = BufferGetLSN(buf); |
4322 | | |
4323 | | /* To check if block content changes while flushing. - vadim 01/17/97 */ |
4324 | 0 | buf_state &= ~BM_JUST_DIRTIED; |
4325 | 0 | UnlockBufHdr(buf, buf_state); |
4326 | | |
4327 | | /* |
4328 | | * Force XLOG flush up to buffer's LSN. This implements the basic WAL |
4329 | | * rule that log updates must hit disk before any of the data-file changes |
4330 | | * they describe do. |
4331 | | * |
4332 | | * However, this rule does not apply to unlogged relations, which will be |
4333 | | * lost after a crash anyway. Most unlogged relation pages do not bear |
4334 | | * LSNs since we never emit WAL records for them, and therefore flushing |
4335 | | * up through the buffer LSN would be useless, but harmless. However, |
4336 | | * GiST indexes use LSNs internally to track page-splits, and therefore |
4337 | | * unlogged GiST pages bear "fake" LSNs generated by |
4338 | | * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake |
4339 | | * LSN counter could advance past the WAL insertion point; and if it did |
4340 | | * happen, attempting to flush WAL through that location would fail, with |
4341 | | * disastrous system-wide consequences. To make sure that can't happen, |
4342 | | * skip the flush if the buffer isn't permanent. |
4343 | | */ |
4344 | 0 | if (buf_state & BM_PERMANENT) |
4345 | 0 | XLogFlush(recptr); |
4346 | | |
4347 | | /* |
4348 | | * Now it's safe to write the buffer to disk. Note that no one else should |
4349 | | * have been able to write it while we were busy with log flushing, because |
4350 | | * we got the exclusive right to perform I/O by setting the |
4351 | | * BM_IO_IN_PROGRESS bit. |
4352 | | */ |
4353 | 0 | bufBlock = BufHdrGetBlock(buf); |
4354 | | |
4355 | | /* |
4356 | | * Update page checksum if desired. Since we have only shared lock on the |
4357 | | * buffer, other processes might be updating hint bits in it, so we must |
4358 | | * copy the page to private storage if we do checksumming. |
4359 | | */ |
4360 | 0 | bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum); |
4361 | |
|
4362 | 0 | io_start = pgstat_prepare_io_time(track_io_timing); |
4363 | | |
4364 | | /* |
4365 | | * bufToWrite is either the shared buffer or a copy, as appropriate. |
4366 | | */ |
4367 | 0 | smgrwrite(reln, |
4368 | 0 | BufTagGetForkNum(&buf->tag), |
4369 | 0 | buf->tag.blockNum, |
4370 | 0 | bufToWrite, |
4371 | 0 | false); |
4372 | | |
4373 | | /* |
4374 | | * When a strategy is in use, only flushes of dirty buffers already in the |
4375 | | * strategy ring are counted as strategy writes (IOCONTEXT |
4376 | | * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO |
4377 | | * statistics tracking. |
4378 | | * |
4379 | | * If a shared buffer initially added to the ring must be flushed before |
4380 | | * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE. |
4381 | | * |
4382 | | * A shared buffer may instead be added to the ring later, either because |
4383 | | * the current strategy buffer is pinned or in use, or (for BAS_BULKREAD |
4384 | | * operations only) because all strategy buffers were dirty and rejected. |
4385 | | * If such a buffer requires flushing, this too is counted as an |
4386 | | * IOCONTEXT_NORMAL IOOP_WRITE (from_ring will be false). |
4387 | | * |
4388 | | * When a strategy is not in use, the write can only be a "regular" write |
4389 | | * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE). |
4390 | | */ |
4391 | 0 | pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, |
4392 | 0 | IOOP_WRITE, io_start, 1, BLCKSZ); |
4393 | |
|
4394 | 0 | pgBufferUsage.shared_blks_written++; |
4395 | | |
4396 | | /* |
4397 | | * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and |
4398 | | * end the BM_IO_IN_PROGRESS state. |
4399 | | */ |
4400 | 0 | TerminateBufferIO(buf, true, 0, true, false); |
4401 | |
|
4402 | 0 | TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag), |
4403 | 0 | buf->tag.blockNum, |
4404 | 0 | reln->smgr_rlocator.locator.spcOid, |
4405 | 0 | reln->smgr_rlocator.locator.dbOid, |
4406 | 0 | reln->smgr_rlocator.locator.relNumber); |
4407 | | |
4408 | | /* Pop the error context stack */ |
4409 | 0 | error_context_stack = errcallback.previous; |
4410 | 0 | } |
4411 | | |
4412 | | /* |
4413 | | * RelationGetNumberOfBlocksInFork |
4414 | | * Determines the current number of pages in the specified relation fork. |
4415 | | * |
4416 | | * Note that the accuracy of the result will depend on the details of the |
4417 | | * relation's storage. For builtin AMs it'll be accurate, but for external AMs |
4418 | | * it might not be. |
4419 | | */ |
4420 | | BlockNumber |
4421 | | RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum) |
4422 | 0 | { |
4423 | 0 | if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind)) |
4424 | 0 | { |
4425 | | /* |
4426 | | * Not every table AM uses BLCKSZ-wide fixed-size blocks. Therefore the |
4427 | | * tableam API returns the size in bytes - but for the purpose of this |
4428 | | * routine, we want the number of blocks. So divide by BLCKSZ, rounding |
4429 | | * up. |
4430 | | */ |
4431 | 0 | uint64 szbytes; |
4432 | |
|
4433 | 0 | szbytes = table_relation_size(relation, forkNum); |
4434 | |
|
4435 | 0 | return (szbytes + (BLCKSZ - 1)) / BLCKSZ; |
4436 | 0 | } |
4437 | 0 | else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind)) |
4438 | 0 | { |
4439 | 0 | return smgrnblocks(RelationGetSmgr(relation), forkNum); |
4440 | 0 | } |
4441 | 0 | else |
4442 | 0 | Assert(false); |
4443 | | |
4444 | 0 | return 0; /* keep compiler quiet */ |
4445 | 0 | } |
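 | |
 | | /* |
 | |  * Illustrative sketch added in editing (not part of bufmgr.c): the round-up |
 | |  * division above means e.g. a 1-byte fork still counts as one block, since |
 | |  * (1 + BLCKSZ - 1) / BLCKSZ == 1.  Most callers use the |
 | |  * RelationGetNumberOfBlocks() macro, which is just the MAIN_FORKNUM case; |
 | |  * the wrapper below is hypothetical and only shows the two spellings. |
 | |  */ |
 | | static BlockNumber |
 | | main_fork_size_in_blocks(Relation rel) |
 | | { |
 | |     BlockNumber nblocks = RelationGetNumberOfBlocks(rel); |
 | |
 | |     Assert(nblocks == RelationGetNumberOfBlocksInFork(rel, MAIN_FORKNUM)); |
 | |     return nblocks; |
 | | } |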
4446 | | |
4447 | | /* |
4448 | | * BufferIsPermanent |
4449 | | * Determines whether a buffer will potentially still be around after |
4450 | | * a crash. Caller must hold a buffer pin. |
4451 | | */ |
4452 | | bool |
4453 | | BufferIsPermanent(Buffer buffer) |
4454 | 0 | { |
4455 | 0 | BufferDesc *bufHdr; |
4456 | | |
4457 | | /* Local buffers are used only for temp relations. */ |
4458 | 0 | if (BufferIsLocal(buffer)) |
4459 | 0 | return false; |
4460 | | |
4461 | | /* Make sure we've got a real buffer, and that we hold a pin on it. */ |
4462 | 0 | Assert(BufferIsValid(buffer)); |
4463 | 0 | Assert(BufferIsPinned(buffer)); |
4464 | | |
4465 | | /* |
4466 | | * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we |
4467 | | * need not bother with the buffer header spinlock. Even if someone else |
4468 | | * changes the buffer header state while we're doing this, the state is |
4469 | | * changed atomically, so we'll read the old value or the new value, but |
4470 | | * not random garbage. |
4471 | | */ |
4472 | 0 | bufHdr = GetBufferDescriptor(buffer - 1); |
4473 | 0 | return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0; |
4474 | 0 | } |
4475 | | |
4476 | | /* |
4477 | | * BufferGetLSNAtomic |
4478 | | * Retrieves the LSN of the buffer atomically using a buffer header lock. |
4479 | | * This is necessary for some callers who may not have an exclusive lock |
4480 | | * on the buffer. |
4481 | | */ |
4482 | | XLogRecPtr |
4483 | | BufferGetLSNAtomic(Buffer buffer) |
4484 | 0 | { |
4485 | 0 | char *page = BufferGetPage(buffer); |
4486 | 0 | BufferDesc *bufHdr; |
4487 | 0 | XLogRecPtr lsn; |
4488 | 0 | uint32 buf_state; |
4489 | | |
4490 | | /* |
4491 | | * If we don't need locking for correctness, fastpath out. |
4492 | | */ |
4493 | 0 | if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer)) |
4494 | 0 | return PageGetLSN(page); |
4495 | | |
4496 | | /* Make sure we've got a real buffer, and that we hold a pin on it. */ |
4497 | 0 | Assert(BufferIsValid(buffer)); |
4498 | 0 | Assert(BufferIsPinned(buffer)); |
4499 | |
|
4500 | 0 | bufHdr = GetBufferDescriptor(buffer - 1); |
4501 | 0 | buf_state = LockBufHdr(bufHdr); |
4502 | 0 | lsn = PageGetLSN(page); |
4503 | 0 | UnlockBufHdr(bufHdr, buf_state); |
4504 | |
|
4505 | 0 | return lsn; |
4506 | 0 | } |
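 | |
 | | /* |
 | |  * Illustrative sketch added in editing (not part of bufmgr.c): a scan that |
 | |  * remembers a page's LSN while holding only a pin and share lock can later |
 | |  * detect that the page changed in the meantime by reading the LSN the same |
 | |  * way.  The helper name is hypothetical. |
 | |  */ |
 | | static bool |
 | | page_changed_since(Buffer buffer, XLogRecPtr saved_lsn) |
 | | { |
 | |     /* buffer must still be pinned here */ |
 | |     return BufferGetLSNAtomic(buffer) != saved_lsn; |
 | | } |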
4507 | | |
4508 | | /* --------------------------------------------------------------------- |
4509 | | * DropRelationBuffers |
4510 | | * |
4511 | | * This function removes from the buffer pool all the pages of the |
4512 | | * specified relation forks that have block numbers >= firstDelBlock. |
4513 | | * (In particular, with firstDelBlock = 0, all pages are removed.) |
4514 | | * Dirty pages are simply dropped, without bothering to write them |
4515 | | * out first. Therefore, this is NOT rollback-able, and so should be |
4516 | | * used only with extreme caution! |
4517 | | * |
4518 | | * Currently, this is called only from smgr.c when the underlying file |
4519 | | * is about to be deleted or truncated (firstDelBlock is needed for |
4520 | | * the truncation case). The data in the affected pages would therefore |
4521 | | * be deleted momentarily anyway, and there is no point in writing it. |
4522 | | * It is the responsibility of higher-level code to ensure that the |
4523 | | * deletion or truncation does not lose any data that could be needed |
4524 | | * later. It is also the responsibility of higher-level code to ensure |
4525 | | * that no other process could be trying to load more pages of the |
4526 | | * relation into buffers. |
4527 | | * -------------------------------------------------------------------- |
4528 | | */ |
4529 | | void |
4530 | | DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, |
4531 | | int nforks, BlockNumber *firstDelBlock) |
4532 | 0 | { |
4533 | 0 | int i; |
4534 | 0 | int j; |
4535 | 0 | RelFileLocatorBackend rlocator; |
4536 | 0 | BlockNumber nForkBlock[MAX_FORKNUM]; |
4537 | 0 | uint64 nBlocksToInvalidate = 0; |
4538 | |
|
4539 | 0 | rlocator = smgr_reln->smgr_rlocator; |
4540 | | |
4541 | | /* If it's a local relation, it's localbuf.c's problem. */ |
4542 | 0 | if (RelFileLocatorBackendIsTemp(rlocator)) |
4543 | 0 | { |
4544 | 0 | if (rlocator.backend == MyProcNumber) |
4545 | 0 | DropRelationLocalBuffers(rlocator.locator, forkNum, nforks, |
4546 | 0 | firstDelBlock); |
4547 | |
|
4548 | 0 | return; |
4549 | 0 | } |
4550 | | |
4551 | | /* |
4552 | | * To remove all the pages of the specified relation forks from the buffer |
4553 | | * pool, we need to scan the entire buffer pool, but we can optimize this by |
4554 | | * finding the buffers from the BufMapping table, provided we know the exact |
4555 | | * size of each fork of the relation. The exact size is required to ensure |
4556 | | * that we don't leave any buffer for the relation being dropped, as |
4557 | | * otherwise the background writer or checkpointer could run into a PANIC |
4558 | | * error while flushing buffers corresponding to files that don't exist. |
4559 | | * |
4560 | | * To know the exact size, we rely on the size cached for each fork by us |
4561 | | * during recovery, which limits the optimization to recovery and to |
4562 | | * standbys, but we can easily extend it once we have a shared cache for |
4563 | | * relation sizes. |
4564 | | * |
4565 | | * In recovery, we cache the value returned by the first lseek(SEEK_END) |
4566 | | * and future writes keep the cached value up-to-date. See |
4567 | | * smgrextend. It is possible that the value of the first lseek is smaller |
4568 | | * than the actual number of existing blocks in the file due to buggy |
4569 | | * Linux kernels that might not have accounted for the recent write. But |
4570 | | * that should be fine because there must not be any buffers after that |
4571 | | * file size. |
4572 | | */ |
4573 | 0 | for (i = 0; i < nforks; i++) |
4574 | 0 | { |
4575 | | /* Get the number of blocks for a relation's fork */ |
4576 | 0 | nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]); |
4577 | |
|
4578 | 0 | if (nForkBlock[i] == InvalidBlockNumber) |
4579 | 0 | { |
4580 | 0 | nBlocksToInvalidate = InvalidBlockNumber; |
4581 | 0 | break; |
4582 | 0 | } |
4583 | | |
4584 | | /* calculate the number of blocks to be invalidated */ |
4585 | 0 | nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]); |
4586 | 0 | } |
4587 | | |
4588 | | /* |
4589 | | * We apply the optimization iff the total number of blocks to invalidate |
4590 | | * is below the BUF_DROP_FULL_SCAN_THRESHOLD. |
4591 | | */ |
4592 | 0 | if (BlockNumberIsValid(nBlocksToInvalidate) && |
4593 | 0 | nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD) |
4594 | 0 | { |
4595 | 0 | for (j = 0; j < nforks; j++) |
4596 | 0 | FindAndDropRelationBuffers(rlocator.locator, forkNum[j], |
4597 | 0 | nForkBlock[j], firstDelBlock[j]); |
4598 | 0 | return; |
4599 | 0 | } |
4600 | | |
4601 | 0 | for (i = 0; i < NBuffers; i++) |
4602 | 0 | { |
4603 | 0 | BufferDesc *bufHdr = GetBufferDescriptor(i); |
4604 | 0 | uint32 buf_state; |
4605 | | |
4606 | | /* |
4607 | | * We can make this a tad faster by prechecking the buffer tag before |
4608 | | * we attempt to lock the buffer; this saves a lot of lock |
4609 | | * acquisitions in typical cases. It should be safe because the |
4610 | | * caller must have AccessExclusiveLock on the relation, or some other |
4611 | | * reason to be certain that no one is loading new pages of the rel |
4612 | | * into the buffer pool. (Otherwise we might well miss such pages |
4613 | | * entirely.) Therefore, while the tag might be changing while we |
4614 | | * look at it, it can't be changing *to* a value we care about, only |
4615 | | * *away* from such a value. So false negatives are impossible, and |
4616 | | * false positives are safe because we'll recheck after getting the |
4617 | | * buffer lock. |
4618 | | * |
4619 | | * We could check forkNum and blockNum as well as the rlocator, but |
4620 | | * the incremental win from doing so seems small. |
4621 | | */ |
4622 | 0 | if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator)) |
4623 | 0 | continue; |
4624 | | |
4625 | 0 | buf_state = LockBufHdr(bufHdr); |
4626 | |
|
4627 | 0 | for (j = 0; j < nforks; j++) |
4628 | 0 | { |
4629 | 0 | if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) && |
4630 | 0 | BufTagGetForkNum(&bufHdr->tag) == forkNum[j] && |
4631 | 0 | bufHdr->tag.blockNum >= firstDelBlock[j]) |
4632 | 0 | { |
4633 | 0 | InvalidateBuffer(bufHdr); /* releases spinlock */ |
4634 | 0 | break; |
4635 | 0 | } |
4636 | 0 | } |
4637 | 0 | if (j >= nforks) |
4638 | 0 | UnlockBufHdr(bufHdr, buf_state); |
4639 | 0 | } |
4640 | 0 | } |
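 | |
 | | /* |
 | |  * Illustrative sketch added in editing (not part of bufmgr.c): smgr-level |
 | |  * truncation is the typical caller, discarding buffers past the new end of a |
 | |  * fork before the file itself is shrunk.  The helper below is hypothetical |
 | |  * and only shows how the forkNum/firstDelBlock arrays are passed; the actual |
 | |  * file truncation is omitted. |
 | |  */ |
 | | static void |
 | | drop_buffers_past(SMgrRelation reln, BlockNumber new_nblocks) |
 | | { |
 | |     ForkNumber  forknum = MAIN_FORKNUM; |
 | |     BlockNumber firstDelBlock = new_nblocks; |
 | |
 | |     DropRelationBuffers(reln, &forknum, 1, &firstDelBlock); |
 | | } |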
4641 | | |
4642 | | /* --------------------------------------------------------------------- |
4643 | | * DropRelationsAllBuffers |
4644 | | * |
4645 | | * This function removes from the buffer pool all the pages of all |
4646 | | * forks of the specified relations. It's equivalent to calling |
4647 | | * DropRelationBuffers once per fork per relation with firstDelBlock = 0. |
4648 | | * -------------------------------------------------------------------- |
4649 | | */ |
4650 | | void |
4651 | | DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators) |
4652 | 0 | { |
4653 | 0 | int i; |
4654 | 0 | int n = 0; |
4655 | 0 | SMgrRelation *rels; |
4656 | 0 | BlockNumber (*block)[MAX_FORKNUM + 1]; |
4657 | 0 | uint64 nBlocksToInvalidate = 0; |
4658 | 0 | RelFileLocator *locators; |
4659 | 0 | bool cached = true; |
4660 | 0 | bool use_bsearch; |
4661 | |
|
4662 | 0 | if (nlocators == 0) |
4663 | 0 | return; |
4664 | | |
4665 | 0 | rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */ |
4666 | | |
4667 | | /* If it's a local relation, it's localbuf.c's problem. */ |
4668 | 0 | for (i = 0; i < nlocators; i++) |
4669 | 0 | { |
4670 | 0 | if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator)) |
4671 | 0 | { |
4672 | 0 | if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber) |
4673 | 0 | DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator); |
4674 | 0 | } |
4675 | 0 | else |
4676 | 0 | rels[n++] = smgr_reln[i]; |
4677 | 0 | } |
4678 | | |
4679 | | /* |
4680 | | * If there are no non-local relations, then we're done. Release the |
4681 | | * memory and return. |
4682 | | */ |
4683 | 0 | if (n == 0) |
4684 | 0 | { |
4685 | 0 | pfree(rels); |
4686 | 0 | return; |
4687 | 0 | } |
4688 | | |
4689 | | /* |
4690 | | * This is used to remember the number of blocks for all forks of all the |
4691 | | * relations. |
4692 | | */ |
4693 | 0 | block = (BlockNumber (*)[MAX_FORKNUM + 1]) |
4694 | 0 | palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1)); |
4695 | | |
4696 | | /* |
4697 | | * We can avoid scanning the entire buffer pool if we know the exact size |
4698 | | * of each of the given relation forks. See DropRelationBuffers. |
4699 | | */ |
4700 | 0 | for (i = 0; i < n && cached; i++) |
4701 | 0 | { |
4702 | 0 | for (int j = 0; j <= MAX_FORKNUM; j++) |
4703 | 0 | { |
4704 | | /* Get the number of blocks for a relation's fork. */ |
4705 | 0 | block[i][j] = smgrnblocks_cached(rels[i], j); |
4706 | | |
4707 | | /* We need to consider only the relation forks that exist. */ |
4708 | 0 | if (block[i][j] == InvalidBlockNumber) |
4709 | 0 | { |
4710 | 0 | if (!smgrexists(rels[i], j)) |
4711 | 0 | continue; |
4712 | 0 | cached = false; |
4713 | 0 | break; |
4714 | 0 | } |
4715 | | |
4716 | | /* calculate the total number of blocks to be invalidated */ |
4717 | 0 | nBlocksToInvalidate += block[i][j]; |
4718 | 0 | } |
4719 | 0 | } |
4720 | | |
4721 | | /* |
4722 | | * We apply the optimization iff the total number of blocks to invalidate |
4723 | | * is below the BUF_DROP_FULL_SCAN_THRESHOLD. |
4724 | | */ |
4725 | 0 | if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD) |
4726 | 0 | { |
4727 | 0 | for (i = 0; i < n; i++) |
4728 | 0 | { |
4729 | 0 | for (int j = 0; j <= MAX_FORKNUM; j++) |
4730 | 0 | { |
4731 | | /* ignore relation forks that don't exist */ |
4732 | 0 | if (!BlockNumberIsValid(block[i][j])) |
4733 | 0 | continue; |
4734 | | |
4735 | | /* drop all the buffers for a particular relation fork */ |
4736 | 0 | FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator, |
4737 | 0 | j, block[i][j], 0); |
4738 | 0 | } |
4739 | 0 | } |
4740 | |
|
4741 | 0 | pfree(block); |
4742 | 0 | pfree(rels); |
4743 | 0 | return; |
4744 | 0 | } |
4745 | | |
4746 | 0 | pfree(block); |
4747 | 0 | locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */ |
4748 | 0 | for (i = 0; i < n; i++) |
4749 | 0 | locators[i] = rels[i]->smgr_rlocator.locator; |
4750 | | |
4751 | | /* |
4752 | | * For a small number of relations to drop, just use a simple walk-through |
4753 | | * to save the bsearch overhead. The threshold is more of a guess than an |
4754 | | * exactly determined value, as it depends on many factors (CPU and RAM |
4755 | | * speeds, amount of shared buffers, etc.). |
4756 | | */ |
4757 | 0 | use_bsearch = n > RELS_BSEARCH_THRESHOLD; |
4758 | | |
4759 | | /* sort the list of rlocators if necessary */ |
4760 | 0 | if (use_bsearch) |
4761 | 0 | qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator); |
4762 | |
|
4763 | 0 | for (i = 0; i < NBuffers; i++) |
4764 | 0 | { |
4765 | 0 | RelFileLocator *rlocator = NULL; |
4766 | 0 | BufferDesc *bufHdr = GetBufferDescriptor(i); |
4767 | 0 | uint32 buf_state; |
4768 | | |
4769 | | /* |
4770 | | * As in DropRelationBuffers, an unlocked precheck should be safe and |
4771 | | * saves some cycles. |
4772 | | */ |
4773 | |
|
4774 | 0 | if (!use_bsearch) |
4775 | 0 | { |
4776 | 0 | int j; |
4777 | |
|
4778 | 0 | for (j = 0; j < n; j++) |
4779 | 0 | { |
4780 | 0 | if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j])) |
4781 | 0 | { |
4782 | 0 | rlocator = &locators[j]; |
4783 | 0 | break; |
4784 | 0 | } |
4785 | 0 | } |
4786 | 0 | } |
4787 | 0 | else |
4788 | 0 | { |
4789 | 0 | RelFileLocator locator; |
4790 | |
|
4791 | 0 | locator = BufTagGetRelFileLocator(&bufHdr->tag); |
4792 | 0 | rlocator = bsearch(&locator, |
4793 | 0 | locators, n, sizeof(RelFileLocator), |
4794 | 0 | rlocator_comparator); |
4795 | 0 | } |
4796 | | |
4797 | | /* buffer doesn't belong to any of the given relfilelocators; skip it */ |
4798 | 0 | if (rlocator == NULL) |
4799 | 0 | continue; |
4800 | | |
4801 | 0 | buf_state = LockBufHdr(bufHdr); |
4802 | 0 | if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator)) |
4803 | 0 | InvalidateBuffer(bufHdr); /* releases spinlock */ |
4804 | 0 | else |
4805 | 0 | UnlockBufHdr(bufHdr, buf_state); |
4806 | 0 | } |
4807 | |
|
4808 | 0 | pfree(locators); |
4809 | 0 | pfree(rels); |
4810 | 0 | } |
4811 | | |
4812 | | /* --------------------------------------------------------------------- |
4813 | | * FindAndDropRelationBuffers |
4814 | | * |
4815 | | * This function performs lookups in the BufMapping table and removes from |
4816 | | * the buffer pool all the pages of the specified relation fork that have |
4817 | | * block numbers >= firstDelBlock. (In particular, with firstDelBlock = 0, |
4818 | | * all pages are removed.) |
4819 | | * -------------------------------------------------------------------- |
4820 | | */ |
4821 | | static void |
4822 | | FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, |
4823 | | BlockNumber nForkBlock, |
4824 | | BlockNumber firstDelBlock) |
4825 | 0 | { |
4826 | 0 | BlockNumber curBlock; |
4827 | |
|
4828 | 0 | for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++) |
4829 | 0 | { |
4830 | 0 | uint32 bufHash; /* hash value for tag */ |
4831 | 0 | BufferTag bufTag; /* identity of requested block */ |
4832 | 0 | LWLock *bufPartitionLock; /* buffer partition lock for it */ |
4833 | 0 | int buf_id; |
4834 | 0 | BufferDesc *bufHdr; |
4835 | 0 | uint32 buf_state; |
4836 | | |
4837 | | /* create a tag so we can lookup the buffer */ |
4838 | 0 | InitBufferTag(&bufTag, &rlocator, forkNum, curBlock); |
4839 | | |
4840 | | /* determine its hash code and partition lock ID */ |
4841 | 0 | bufHash = BufTableHashCode(&bufTag); |
4842 | 0 | bufPartitionLock = BufMappingPartitionLock(bufHash); |
4843 | | |
4844 | | /* Check that it is in the buffer pool. If not, do nothing. */ |
4845 | 0 | LWLockAcquire(bufPartitionLock, LW_SHARED); |
4846 | 0 | buf_id = BufTableLookup(&bufTag, bufHash); |
4847 | 0 | LWLockRelease(bufPartitionLock); |
4848 | |
|
4849 | 0 | if (buf_id < 0) |
4850 | 0 | continue; |
4851 | | |
4852 | 0 | bufHdr = GetBufferDescriptor(buf_id); |
4853 | | |
4854 | | /* |
4855 | | * We need to lock the buffer header and recheck if the buffer is |
4856 | | * still associated with the same block because the buffer could be |
4857 | | * evicted by some other backend loading blocks for a different |
4858 | | * relation after we release the lock on the BufMapping table. |
4859 | | */ |
4860 | 0 | buf_state = LockBufHdr(bufHdr); |
4861 | |
|
4862 | 0 | if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) && |
4863 | 0 | BufTagGetForkNum(&bufHdr->tag) == forkNum && |
4864 | 0 | bufHdr->tag.blockNum >= firstDelBlock) |
4865 | 0 | InvalidateBuffer(bufHdr); /* releases spinlock */ |
4866 | 0 | else |
4867 | 0 | UnlockBufHdr(bufHdr, buf_state); |
4868 | 0 | } |
4869 | 0 | } |
4870 | | |
4871 | | /* --------------------------------------------------------------------- |
4872 | | * DropDatabaseBuffers |
4873 | | * |
4874 | | * This function removes all the buffers in the buffer cache for a |
4875 | | * particular database. Dirty pages are simply dropped, without |
4876 | | * bothering to write them out first. This is used when we destroy a |
4877 | | * database, to avoid trying to flush data to disk when the directory |
4878 | | * tree no longer exists. Implementation is pretty similar to |
4879 | | * DropRelationBuffers() which is for destroying just one relation. |
4880 | | * -------------------------------------------------------------------- |
4881 | | */ |
4882 | | void |
4883 | | DropDatabaseBuffers(Oid dbid) |
4884 | 0 | { |
4885 | 0 | int i; |
4886 | | |
4887 | | /* |
4888 | | * We needn't consider local buffers, since by assumption the target |
4889 | | * database isn't our own. |
4890 | | */ |
4891 | |
|
4892 | 0 | for (i = 0; i < NBuffers; i++) |
4893 | 0 | { |
4894 | 0 | BufferDesc *bufHdr = GetBufferDescriptor(i); |
4895 | 0 | uint32 buf_state; |
4896 | | |
4897 | | /* |
4898 | | * As in DropRelationBuffers, an unlocked precheck should be safe and |
4899 | | * saves some cycles. |
4900 | | */ |
4901 | 0 | if (bufHdr->tag.dbOid != dbid) |
4902 | 0 | continue; |
4903 | | |
4904 | 0 | buf_state = LockBufHdr(bufHdr); |
4905 | 0 | if (bufHdr->tag.dbOid == dbid) |
4906 | 0 | InvalidateBuffer(bufHdr); /* releases spinlock */ |
4907 | 0 | else |
4908 | 0 | UnlockBufHdr(bufHdr, buf_state); |
4909 | 0 | } |
4910 | 0 | } |
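 | |
 | | /* |
 | |  * Illustrative sketch added in editing (not part of bufmgr.c): DROP DATABASE |
 | |  * style callers discard the buffers first and only then remove the files, so |
 | |  * the checkpointer never tries to write into a directory that has vanished. |
 | |  * The wrapper name is hypothetical. |
 | |  */ |
 | | static void |
 | | discard_buffers_then_unlink(Oid db_id) |
 | | { |
 | |     DropDatabaseBuffers(db_id); |
 | |     /* ... now it is safe to remove the database's files on disk ... */ |
 | | } |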
4911 | | |
4912 | | /* --------------------------------------------------------------------- |
4913 | | * FlushRelationBuffers |
4914 | | * |
4915 | | * This function writes all dirty pages of a relation out to disk |
4916 | | * (or more accurately, out to kernel disk buffers), ensuring that the |
4917 | | * kernel has an up-to-date view of the relation. |
4918 | | * |
4919 | | * Generally, the caller should be holding AccessExclusiveLock on the |
4920 | | * target relation to ensure that no other backend is busy dirtying |
4921 | | * more blocks of the relation; the effects can't be expected to last |
4922 | | * after the lock is released. |
4923 | | * |
4924 | | * XXX currently it sequentially searches the buffer pool; this should be |
4925 | | * changed to a more clever way of searching. This routine is not |
4926 | | * used in any performance-critical code paths, so it's not worth |
4927 | | * adding additional overhead to normal paths to make it go faster. |
4928 | | * -------------------------------------------------------------------- |
4929 | | */ |
4930 | | void |
4931 | | FlushRelationBuffers(Relation rel) |
4932 | 0 | { |
4933 | 0 | int i; |
4934 | 0 | BufferDesc *bufHdr; |
4935 | 0 | SMgrRelation srel = RelationGetSmgr(rel); |
4936 | |
|
4937 | 0 | if (RelationUsesLocalBuffers(rel)) |
4938 | 0 | { |
4939 | 0 | for (i = 0; i < NLocBuffer; i++) |
4940 | 0 | { |
4941 | 0 | uint32 buf_state; |
4942 | |
|
4943 | 0 | bufHdr = GetLocalBufferDescriptor(i); |
4944 | 0 | if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) && |
4945 | 0 | ((buf_state = pg_atomic_read_u32(&bufHdr->state)) & |
4946 | 0 | (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) |
4947 | 0 | { |
4948 | 0 | ErrorContextCallback errcallback; |
4949 | | |
4950 | | /* Setup error traceback support for ereport() */ |
4951 | 0 | errcallback.callback = local_buffer_write_error_callback; |
4952 | 0 | errcallback.arg = bufHdr; |
4953 | 0 | errcallback.previous = error_context_stack; |
4954 | 0 | error_context_stack = &errcallback; |
4955 | | |
4956 | | /* Make sure we can handle the pin */ |
4957 | 0 | ReservePrivateRefCountEntry(); |
4958 | 0 | ResourceOwnerEnlarge(CurrentResourceOwner); |
4959 | | |
4960 | | /* |
4961 | | * Pin/unpin mostly to make valgrind work, but it also seems |
4962 | | * like the right thing to do. |
4963 | | */ |
4964 | 0 | PinLocalBuffer(bufHdr, false); |
4965 | | |
4966 | |
|
4967 | 0 | FlushLocalBuffer(bufHdr, srel); |
4968 | |
|
4969 | 0 | UnpinLocalBuffer(BufferDescriptorGetBuffer(bufHdr)); |
4970 | | |
4971 | | /* Pop the error context stack */ |
4972 | 0 | error_context_stack = errcallback.previous; |
4973 | 0 | } |
4974 | 0 | } |
4975 | |
|
4976 | 0 | return; |
4977 | 0 | } |
4978 | | |
4979 | 0 | for (i = 0; i < NBuffers; i++) |
4980 | 0 | { |
4981 | 0 | uint32 buf_state; |
4982 | |
|
4983 | 0 | bufHdr = GetBufferDescriptor(i); |
4984 | | |
4985 | | /* |
4986 | | * As in DropRelationBuffers, an unlocked precheck should be safe and |
4987 | | * saves some cycles. |
4988 | | */ |
4989 | 0 | if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator)) |
4990 | 0 | continue; |
4991 | | |
4992 | | /* Make sure we can handle the pin */ |
4993 | 0 | ReservePrivateRefCountEntry(); |
4994 | 0 | ResourceOwnerEnlarge(CurrentResourceOwner); |
4995 | |
|
4996 | 0 | buf_state = LockBufHdr(bufHdr); |
4997 | 0 | if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) && |
4998 | 0 | (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) |
4999 | 0 | { |
5000 | 0 | PinBuffer_Locked(bufHdr); |
5001 | 0 | LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); |
5002 | 0 | FlushBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL); |
5003 | 0 | LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); |
5004 | 0 | UnpinBuffer(bufHdr); |
5005 | 0 | } |
5006 | 0 | else |
5007 | 0 | UnlockBufHdr(bufHdr, buf_state); |
5008 | 0 | } |
5009 | 0 | } |
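 | |
 | | /* |
 | |  * Illustrative sketch added in editing (not part of bufmgr.c): a caller that |
 | |  * is about to read or copy the relation's files directly, bypassing shared |
 | |  * buffers, flushes everything the buffer manager holds first, while an |
 | |  * AccessExclusiveLock keeps new dirty pages from appearing.  The wrapper |
 | |  * name is hypothetical. |
 | |  */ |
 | | static void |
 | | flush_before_direct_file_access(Relation rel) |
 | | { |
 | |     Assert(CheckRelationLockedByMe(rel, AccessExclusiveLock, false)); |
 | |     FlushRelationBuffers(rel); |
 | | } |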
5010 | | |
5011 | | /* --------------------------------------------------------------------- |
5012 | | * FlushRelationsAllBuffers |
5013 | | * |
5014 | | * This function flushes out of the buffer pool all the pages of all |
5015 | | * forks of the specified smgr relations. It's equivalent to calling |
5016 | | * FlushRelationBuffers once per relation. The relations are assumed not |
5017 | | * to use local buffers. |
5018 | | * -------------------------------------------------------------------- |
5019 | | */ |
5020 | | void |
5021 | | FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels) |
5022 | 0 | { |
5023 | 0 | int i; |
5024 | 0 | SMgrSortArray *srels; |
5025 | 0 | bool use_bsearch; |
5026 | |
|
5027 | 0 | if (nrels == 0) |
5028 | 0 | return; |
5029 | | |
5030 | | /* fill-in array for qsort */ |
5031 | 0 | srels = palloc(sizeof(SMgrSortArray) * nrels); |
5032 | |
|
5033 | 0 | for (i = 0; i < nrels; i++) |
5034 | 0 | { |
5035 | 0 | Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator)); |
5036 | |
|
5037 | 0 | srels[i].rlocator = smgrs[i]->smgr_rlocator.locator; |
5038 | 0 | srels[i].srel = smgrs[i]; |
5039 | 0 | } |
5040 | | |
5041 | | /* |
5042 | | * Save the bsearch overhead for a small number of relations to sync. See |
5043 | | * DropRelationsAllBuffers for details. |
5044 | | */ |
5045 | 0 | use_bsearch = nrels > RELS_BSEARCH_THRESHOLD; |
5046 | | |
5047 | | /* sort the list of SMgrRelations if necessary */ |
5048 | 0 | if (use_bsearch) |
5049 | 0 | qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator); |
5050 | |
|
5051 | 0 | for (i = 0; i < NBuffers; i++) |
5052 | 0 | { |
5053 | 0 | SMgrSortArray *srelent = NULL; |
5054 | 0 | BufferDesc *bufHdr = GetBufferDescriptor(i); |
5055 | 0 | uint32 buf_state; |
5056 | | |
5057 | | /* |
5058 | | * As in DropRelationBuffers, an unlocked precheck should be safe and |
5059 | | * saves some cycles. |
5060 | | */ |
5061 | |
|
5062 | 0 | if (!use_bsearch) |
5063 | 0 | { |
5064 | 0 | int j; |
5065 | |
|
5066 | 0 | for (j = 0; j < nrels; j++) |
5067 | 0 | { |
5068 | 0 | if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator)) |
5069 | 0 | { |
5070 | 0 | srelent = &srels[j]; |
5071 | 0 | break; |
5072 | 0 | } |
5073 | 0 | } |
5074 | 0 | } |
5075 | 0 | else |
5076 | 0 | { |
5077 | 0 | RelFileLocator rlocator; |
5078 | |
|
5079 | 0 | rlocator = BufTagGetRelFileLocator(&bufHdr->tag); |
5080 | 0 | srelent = bsearch(&rlocator, |
5081 | 0 | srels, nrels, sizeof(SMgrSortArray), |
5082 | 0 | rlocator_comparator); |
5083 | 0 | } |
5084 | | |
5085 | | /* buffer doesn't belong to any of the given relfilelocators; skip it */ |
5086 | 0 | if (srelent == NULL) |
5087 | 0 | continue; |
5088 | | |
5089 | | /* Make sure we can handle the pin */ |
5090 | 0 | ReservePrivateRefCountEntry(); |
5091 | 0 | ResourceOwnerEnlarge(CurrentResourceOwner); |
5092 | |
|
5093 | 0 | buf_state = LockBufHdr(bufHdr); |
5094 | 0 | if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) && |
5095 | 0 | (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) |
5096 | 0 | { |
5097 | 0 | PinBuffer_Locked(bufHdr); |
5098 | 0 | LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); |
5099 | 0 | FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL); |
5100 | 0 | LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); |
5101 | 0 | UnpinBuffer(bufHdr); |
5102 | 0 | } |
5103 | 0 | else |
5104 | 0 | UnlockBufHdr(bufHdr, buf_state); |
5105 | 0 | } |
5106 | |
|
5107 | 0 | pfree(srels); |
5108 | 0 | } |
5109 | | |
5110 | | /* --------------------------------------------------------------------- |
5111 | | * RelationCopyStorageUsingBuffer |
5112 | | * |
5113 | | * Copy a fork's data using the buffer manager. Same as RelationCopyStorage, |
5114 | | * but instead of using smgrread and smgrextend, this copies via bufmgr APIs. |
5115 | | * |
5116 | | * Refer to the comments atop CreateAndCopyRelationData() for details about |
5117 | | * the 'permanent' parameter. |
5118 | | * -------------------------------------------------------------------- |
5119 | | */ |
5120 | | static void |
5121 | | RelationCopyStorageUsingBuffer(RelFileLocator srclocator, |
5122 | | RelFileLocator dstlocator, |
5123 | | ForkNumber forkNum, bool permanent) |
5124 | 0 | { |
5125 | 0 | Buffer srcBuf; |
5126 | 0 | Buffer dstBuf; |
5127 | 0 | Page srcPage; |
5128 | 0 | Page dstPage; |
5129 | 0 | bool use_wal; |
5130 | 0 | BlockNumber nblocks; |
5131 | 0 | BlockNumber blkno; |
5132 | 0 | PGIOAlignedBlock buf; |
5133 | 0 | BufferAccessStrategy bstrategy_src; |
5134 | 0 | BufferAccessStrategy bstrategy_dst; |
5135 | 0 | BlockRangeReadStreamPrivate p; |
5136 | 0 | ReadStream *src_stream; |
5137 | 0 | SMgrRelation src_smgr; |
5138 | | |
5139 | | /* |
5140 | | * In general, we want to write WAL whenever wal_level > 'minimal', but we |
5141 | | * can skip it when copying any fork of an unlogged relation other than |
5142 | | * the init fork. |
5143 | | */ |
5144 | 0 | use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM); |
5145 | | |
5146 | | /* Get number of blocks in the source relation. */ |
5147 | 0 | nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER), |
5148 | 0 | forkNum); |
5149 | | |
5150 | | /* Nothing to copy; just return. */ |
5151 | 0 | if (nblocks == 0) |
5152 | 0 | return; |
5153 | | |
5154 | | /* |
5155 | | * Bulk-extend the destination relation to the same size as the source |
5156 | | * relation before starting to copy block by block. |
5157 | | */ |
5158 | 0 | memset(buf.data, 0, BLCKSZ); |
5159 | 0 | smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1, |
5160 | 0 | buf.data, true); |
5161 | | |
5162 | | /* This is a bulk operation, so use buffer access strategies. */ |
5163 | 0 | bstrategy_src = GetAccessStrategy(BAS_BULKREAD); |
5164 | 0 | bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE); |
5165 | | |
5166 | | /* Initialize streaming read */ |
5167 | 0 | p.current_blocknum = 0; |
5168 | 0 | p.last_exclusive = nblocks; |
5169 | 0 | src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER); |
5170 | | |
5171 | | /* |
5172 | | * It is safe to use batchmode as block_range_read_stream_cb takes no |
5173 | | * locks. |
5174 | | */ |
5175 | 0 | src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL | |
5176 | 0 | READ_STREAM_USE_BATCHING, |
5177 | 0 | bstrategy_src, |
5178 | 0 | src_smgr, |
5179 | 0 | permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED, |
5180 | 0 | forkNum, |
5181 | 0 | block_range_read_stream_cb, |
5182 | 0 | &p, |
5183 | 0 | 0); |
5184 | | |
5185 | | /* Iterate over each block of the source relation file. */ |
5186 | 0 | for (blkno = 0; blkno < nblocks; blkno++) |
5187 | 0 | { |
5188 | 0 | CHECK_FOR_INTERRUPTS(); |
5189 | | |
5190 | | /* Read block from source relation. */ |
5191 | 0 | srcBuf = read_stream_next_buffer(src_stream, NULL); |
5192 | 0 | LockBuffer(srcBuf, BUFFER_LOCK_SHARE); |
5193 | 0 | srcPage = BufferGetPage(srcBuf); |
5194 | |
|
5195 | 0 | dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, |
5196 | 0 | BufferGetBlockNumber(srcBuf), |
5197 | 0 | RBM_ZERO_AND_LOCK, bstrategy_dst, |
5198 | 0 | permanent); |
5199 | 0 | dstPage = BufferGetPage(dstBuf); |
5200 | |
|
5201 | 0 | START_CRIT_SECTION(); |
5202 | | |
5203 | | /* Copy page data from the source to the destination. */ |
5204 | 0 | memcpy(dstPage, srcPage, BLCKSZ); |
5205 | 0 | MarkBufferDirty(dstBuf); |
5206 | | |
5207 | | /* WAL-log the copied page. */ |
5208 | 0 | if (use_wal) |
5209 | 0 | log_newpage_buffer(dstBuf, true); |
5210 | |
|
5211 | 0 | END_CRIT_SECTION(); |
5212 | |
|
5213 | 0 | UnlockReleaseBuffer(dstBuf); |
5214 | 0 | UnlockReleaseBuffer(srcBuf); |
5215 | 0 | } |
5216 | 0 | Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer); |
5217 | 0 | read_stream_end(src_stream); |
5218 | |
|
5219 | 0 | FreeAccessStrategy(bstrategy_src); |
5220 | 0 | FreeAccessStrategy(bstrategy_dst); |
5221 | 0 | } |
5222 | | |
5223 | | /* --------------------------------------------------------------------- |
5224 | | * CreateAndCopyRelationData |
5225 | | * |
5226 | | * Create destination relation storage and copy all forks from the |
5227 | | * source relation to the destination. |
5228 | | * |
5229 | | * Pass permanent as true for permanent relations and false for |
5230 | | * unlogged relations. Currently this API is not supported for |
5231 | | * temporary relations. |
5232 | | * -------------------------------------------------------------------- |
5233 | | */ |
5234 | | void |
5235 | | CreateAndCopyRelationData(RelFileLocator src_rlocator, |
5236 | | RelFileLocator dst_rlocator, bool permanent) |
5237 | 0 | { |
5238 | 0 | char relpersistence; |
5239 | 0 | SMgrRelation src_rel; |
5240 | 0 | SMgrRelation dst_rel; |
5241 | | |
5242 | | /* Set the relpersistence. */ |
5243 | 0 | relpersistence = permanent ? |
5244 | 0 | RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED; |
5245 | |
|
5246 | 0 | src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER); |
5247 | 0 | dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER); |
5248 | | |
5249 | | /* |
5250 | | * Create and copy all forks of the relation. During CREATE DATABASE we |
5251 | | * have a separate cleanup mechanism which deletes the complete database |
5252 | | * directory. Therefore, each individual relation doesn't need to be |
5253 | | * registered for cleanup. |
5254 | | */ |
5255 | 0 | RelationCreateStorage(dst_rlocator, relpersistence, false); |
5256 | | |
5257 | | /* copy main fork. */ |
5258 | 0 | RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM, |
5259 | 0 | permanent); |
5260 | | |
5261 | | /* copy those extra forks that exist */ |
5262 | 0 | for (ForkNumber forkNum = MAIN_FORKNUM + 1; |
5263 | 0 | forkNum <= MAX_FORKNUM; forkNum++) |
5264 | 0 | { |
5265 | 0 | if (smgrexists(src_rel, forkNum)) |
5266 | 0 | { |
5267 | 0 | smgrcreate(dst_rel, forkNum, false); |
5268 | | |
5269 | | /* |
5270 | | * WAL log creation if the relation is persistent, or this is the |
5271 | | * init fork of an unlogged relation. |
5272 | | */ |
5273 | 0 | if (permanent || forkNum == INIT_FORKNUM) |
5274 | 0 | log_smgrcreate(&dst_rlocator, forkNum); |
5275 | | |
5276 | | /* Copy a fork's data, block by block. */ |
5277 | 0 | RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum, |
5278 | 0 | permanent); |
5279 | 0 | } |
5280 | 0 | } |
5281 | 0 | } |
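 | |
 | | /* |
 | |  * Illustrative sketch added in editing (not part of bufmgr.c): copying a |
 | |  * relation's storage into a freshly assigned relfilenumber in the same |
 | |  * tablespace and database looks like this from the caller's side.  The |
 | |  * helper name and parameters are hypothetical. |
 | |  */ |
 | | static void |
 | | copy_into_new_relfilenumber(RelFileLocator src, RelFileNumber new_relnumber, |
 | |                             bool is_permanent) |
 | | { |
 | |     RelFileLocator dst = src; |
 | |
 | |     dst.relNumber = new_relnumber;  /* same tablespace and database */ |
 | |     CreateAndCopyRelationData(src, dst, is_permanent); |
 | | } |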
5282 | | |
5283 | | /* --------------------------------------------------------------------- |
5284 | | * FlushDatabaseBuffers |
5285 | | * |
5286 | | * This function writes all dirty pages of a database out to disk |
5287 | | * (or more accurately, out to kernel disk buffers), ensuring that the |
5288 | | * kernel has an up-to-date view of the database. |
5289 | | * |
5290 | | * Generally, the caller should be holding an appropriate lock to ensure |
5291 | | * no other backend is active in the target database; otherwise more |
5292 | | * pages could get dirtied. |
5293 | | * |
5294 | | * Note we don't worry about flushing any pages of temporary relations. |
5295 | | * It's assumed these wouldn't be interesting. |
5296 | | * -------------------------------------------------------------------- |
5297 | | */ |
5298 | | void |
5299 | | FlushDatabaseBuffers(Oid dbid) |
5300 | 0 | { |
5301 | 0 | int i; |
5302 | 0 | BufferDesc *bufHdr; |
5303 | |
|
5304 | 0 | for (i = 0; i < NBuffers; i++) |
5305 | 0 | { |
5306 | 0 | uint32 buf_state; |
5307 | |
|
5308 | 0 | bufHdr = GetBufferDescriptor(i); |
5309 | | |
5310 | | /* |
5311 | | * As in DropRelationBuffers, an unlocked precheck should be safe and |
5312 | | * saves some cycles. |
5313 | | */ |
5314 | 0 | if (bufHdr->tag.dbOid != dbid) |
5315 | 0 | continue; |
5316 | | |
5317 | | /* Make sure we can handle the pin */ |
5318 | 0 | ReservePrivateRefCountEntry(); |
5319 | 0 | ResourceOwnerEnlarge(CurrentResourceOwner); |
5320 | |
|
5321 | 0 | buf_state = LockBufHdr(bufHdr); |
5322 | 0 | if (bufHdr->tag.dbOid == dbid && |
5323 | 0 | (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) |
5324 | 0 | { |
5325 | 0 | PinBuffer_Locked(bufHdr); |
5326 | 0 | LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); |
5327 | 0 | FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL); |
5328 | 0 | LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); |
5329 | 0 | UnpinBuffer(bufHdr); |
5330 | 0 | } |
5331 | 0 | else |
5332 | 0 | UnlockBufHdr(bufHdr, buf_state); |
5333 | 0 | } |
5334 | 0 | } |
5335 | | |
5336 | | /* |
5337 | | * Flush a buffer that is already pinned and content-locked (in either share |
5338 | | * or exclusive mode) out to the OS. |
5339 | | */ |
5340 | | void |
5341 | | FlushOneBuffer(Buffer buffer) |
5342 | 0 | { |
5343 | 0 | BufferDesc *bufHdr; |
5344 | | |
5345 | | /* currently not needed, but no fundamental reason not to support */ |
5346 | 0 | Assert(!BufferIsLocal(buffer)); |
5347 | |
|
5348 | 0 | Assert(BufferIsPinned(buffer)); |
5349 | |
|
5350 | 0 | bufHdr = GetBufferDescriptor(buffer - 1); |
5351 | |
|
5352 | 0 | Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr))); |
5353 | |
|
5354 | 0 | FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL); |
5355 | 0 | } |
5356 | | |
5357 | | /* |
5358 | | * ReleaseBuffer -- release the pin on a buffer |
5359 | | */ |
5360 | | void |
5361 | | ReleaseBuffer(Buffer buffer) |
5362 | 0 | { |
5363 | 0 | if (!BufferIsValid(buffer)) |
5364 | 0 | elog(ERROR, "bad buffer ID: %d", buffer); |
5365 | | |
5366 | 0 | if (BufferIsLocal(buffer)) |
5367 | 0 | UnpinLocalBuffer(buffer); |
5368 | 0 | else |
5369 | 0 | UnpinBuffer(GetBufferDescriptor(buffer - 1)); |
5370 | 0 | } |
5371 | | |
5372 | | /* |
5373 | | * UnlockReleaseBuffer -- release the content lock and pin on a buffer |
5374 | | * |
5375 | | * This is just a shorthand for a common combination. |
5376 | | */ |
5377 | | void |
5378 | | UnlockReleaseBuffer(Buffer buffer) |
5379 | 0 | { |
5380 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
5381 | 0 | ReleaseBuffer(buffer); |
5382 | 0 | } |
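 | |
 | | /* |
 | |  * Illustrative sketch added in editing (not part of bufmgr.c): the shorthand |
 | |  * stands in for the explicit two-step sequence, so the body below could |
 | |  * equally be written as LockBuffer(buffer, BUFFER_LOCK_UNLOCK) followed by |
 | |  * ReleaseBuffer(buffer).  The wrapper name is hypothetical. |
 | |  */ |
 | | static void |
 | | done_with_locked_buffer(Buffer buffer) |
 | | { |
 | |     UnlockReleaseBuffer(buffer); |
 | | } |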
5383 | | |
5384 | | /* |
5385 | | * IncrBufferRefCount |
5386 | | * Increment the pin count on a buffer that we have *already* pinned |
5387 | | * at least once. |
5388 | | * |
5389 | | * This function cannot be used on a buffer we do not have pinned, |
5390 | | * because it doesn't change the shared buffer state. |
5391 | | */ |
5392 | | void |
5393 | | IncrBufferRefCount(Buffer buffer) |
5394 | 0 | { |
5395 | 0 | Assert(BufferIsPinned(buffer)); |
5396 | 0 | ResourceOwnerEnlarge(CurrentResourceOwner); |
5397 | 0 | if (BufferIsLocal(buffer)) |
5398 | 0 | LocalRefCount[-buffer - 1]++; |
5399 | 0 | else |
5400 | 0 | { |
5401 | 0 | PrivateRefCountEntry *ref; |
5402 | |
|
5403 | 0 | ref = GetPrivateRefCountEntry(buffer, true); |
5404 | 0 | Assert(ref != NULL); |
5405 | 0 | ref->refcount++; |
5406 | 0 | } |
5407 | 0 | ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer); |
5408 | 0 | } |
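 | |
 | | /* |
 | |  * Illustrative sketch added in editing (not part of bufmgr.c): when a second |
 | |  * data structure starts pointing into a page this backend already has |
 | |  * pinned, bump the pin count so each reference can later drop its pin |
 | |  * independently with ReleaseBuffer().  The helper name is hypothetical. |
 | |  */ |
 | | static Buffer |
 | | share_existing_pin(Buffer buffer) |
 | | { |
 | |     IncrBufferRefCount(buffer);     /* caller already holds one pin */ |
 | |     return buffer;                  /* new holder releases it separately */ |
 | | } |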
5409 | | |
5410 | | /* |
5411 | | * MarkBufferDirtyHint |
5412 | | * |
5413 | | * Mark a buffer dirty for non-critical changes. |
5414 | | * |
5415 | | * This is essentially the same as MarkBufferDirty, except: |
5416 | | * |
5417 | | * 1. The caller does not write WAL; so if checksums are enabled, we may need |
5418 | | * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages. |
5419 | | * 2. The caller might have only share-lock instead of exclusive-lock on the |
5420 | | * buffer's content lock. |
5421 | | * 3. This function does not guarantee that the buffer is always marked dirty |
5422 | | * (due to a race condition), so it cannot be used for important changes. |
5423 | | */ |
5424 | | void |
5425 | | MarkBufferDirtyHint(Buffer buffer, bool buffer_std) |
5426 | 0 | { |
5427 | 0 | BufferDesc *bufHdr; |
5428 | 0 | Page page = BufferGetPage(buffer); |
5429 | |
|
5430 | 0 | if (!BufferIsValid(buffer)) |
5431 | 0 | elog(ERROR, "bad buffer ID: %d", buffer); |
5432 | | |
5433 | 0 | if (BufferIsLocal(buffer)) |
5434 | 0 | { |
5435 | 0 | MarkLocalBufferDirty(buffer); |
5436 | 0 | return; |
5437 | 0 | } |
5438 | | |
5439 | 0 | bufHdr = GetBufferDescriptor(buffer - 1); |
5440 | |
|
5441 | 0 | Assert(GetPrivateRefCount(buffer) > 0); |
5442 | | /* here, either share or exclusive lock is OK */ |
5443 | 0 | Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr))); |
5444 | | |
5445 | | /* |
5446 | | * This routine might get called many times on the same page, if we are |
5447 | | * making the first scan after commit of an xact that added/deleted many |
5448 | | * tuples. So, be as quick as we can if the buffer is already dirty. We |
5449 | | * do this by not acquiring spinlock if it looks like the status bits are |
5450 | | * already set. Since we make this test unlocked, there's a chance we |
5451 | | * might fail to notice that the flags have just been cleared, and fail |
5452 | | * to reset them, due to memory-ordering issues. But since this function |
5453 | | * is only intended to be used in cases where failing to write out the |
5454 | | * data would be harmless anyway, it doesn't really matter. |
5455 | | */ |
5456 | 0 | if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) != |
5457 | 0 | (BM_DIRTY | BM_JUST_DIRTIED)) |
5458 | 0 | { |
5459 | 0 | XLogRecPtr lsn = InvalidXLogRecPtr; |
5460 | 0 | bool dirtied = false; |
5461 | 0 | bool delayChkptFlags = false; |
5462 | 0 | uint32 buf_state; |
5463 | | |
5464 | | /* |
5465 | | * If we need to protect hint bit updates from torn writes, WAL-log a |
5466 | | * full page image of the page. This full page image is only necessary |
5467 | | * if the hint bit update is the first change to the page since the |
5468 | | * last checkpoint. |
5469 | | * |
5470 | | * We don't check full_page_writes here; that check happens inside |
5471 | | * XLogInsert(), since the setting can change dynamically. |
5472 | | */ |
5473 | 0 | if (XLogHintBitIsNeeded() && |
5474 | 0 | (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT)) |
5475 | 0 | { |
5476 | | /* |
5477 | | * If we must not write WAL, due to a relfilelocator-specific |
5478 | | * condition or because we are in recovery, don't dirty the page. |
5479 | | * We can still set the hint, we just don't dirty the page as a |
5480 | | * result, so the hint is lost when the page is evicted or at shutdown. |
5481 | | * |
5482 | | * See src/backend/storage/page/README for longer discussion. |
5483 | | */ |
5484 | 0 | if (RecoveryInProgress() || |
5485 | 0 | RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag))) |
5486 | 0 | return; |
5487 | | |
5488 | | /* |
5489 | | * If the block is already dirty because we either made a change |
5490 | | * or set a hint already, then we don't need to write a full page |
5491 | | * image. Note that aggressive cleaning of blocks dirtied by hint |
5492 | | * bit setting would increase the call rate. Bulk setting of hint |
5493 | | * bits would reduce the call rate... |
5494 | | * |
5495 | | * We must issue the WAL record before we mark the buffer dirty. |
5496 | | * Otherwise we might write the page before we write the WAL. That |
5497 | | * causes a race condition, since a checkpoint might occur between |
5498 | | * writing the WAL record and marking the buffer dirty. We solve |
5499 | | * that with a kluge, but one that is already in use during |
5500 | | * transaction commit to prevent race conditions. Basically, we |
5501 | | * simply prevent the checkpoint WAL record from being written |
5502 | | * until we have marked the buffer dirty. We don't start the |
5503 | | * checkpoint flush until we have marked dirty, so our checkpoint |
5504 | | * must flush the change to disk successfully, or else the checkpoint |
5505 | | * never gets written and crash recovery will fix things up. |
5506 | | * |
5507 | | * It's possible we may enter here without an xid, so it is |
5508 | | * essential that CreateCheckPoint waits for virtual transactions |
5509 | | * rather than full transactionids. |
5510 | | */ |
5511 | 0 | Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); |
5512 | 0 | MyProc->delayChkptFlags |= DELAY_CHKPT_START; |
5513 | 0 | delayChkptFlags = true; |
5514 | 0 | lsn = XLogSaveBufferForHint(buffer, buffer_std); |
5515 | 0 | } |
5516 | | |
5517 | 0 | buf_state = LockBufHdr(bufHdr); |
5518 | |
|
5519 | 0 | Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); |
5520 | |
|
5521 | 0 | if (!(buf_state & BM_DIRTY)) |
5522 | 0 | { |
5523 | 0 | dirtied = true; /* Means "will be dirtied by this action" */ |
5524 | | |
5525 | | /* |
5526 | | * Set the page LSN if we wrote a backup block. We aren't supposed |
5527 | | * to set this when only holding a share lock but as long as we |
5528 | | * serialise it somehow we're OK. We choose to set LSN while |
5529 | | * holding the buffer header lock, which causes any reader of an |
5530 | | * LSN who holds only a share lock to also obtain a buffer header |
5531 | | * lock before using PageGetLSN(), which is enforced in |
5532 | | * BufferGetLSNAtomic(). |
5533 | | * |
5534 | | * If checksums are enabled, you might think we should reset the |
5535 | | * checksum here. That will happen when the page is written |
5536 | | * sometime later in this checkpoint cycle. |
5537 | | */ |
5538 | 0 | if (!XLogRecPtrIsInvalid(lsn)) |
5539 | 0 | PageSetLSN(page, lsn); |
5540 | 0 | } |
5541 | |
|
5542 | 0 | buf_state |= BM_DIRTY | BM_JUST_DIRTIED; |
5543 | 0 | UnlockBufHdr(bufHdr, buf_state); |
5544 | |
|
5545 | 0 | if (delayChkptFlags) |
5546 | 0 | MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; |
5547 | |
|
5548 | 0 | if (dirtied) |
5549 | 0 | { |
5550 | 0 | pgBufferUsage.shared_blks_dirtied++; |
5551 | 0 | if (VacuumCostActive) |
5552 | 0 | VacuumCostBalance += VacuumCostPageDirty; |
5553 | 0 | } |
5554 | 0 | } |
5555 | 0 | } |
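 | |
 | | /* |
 | |  * Illustrative sketch added in editing (not part of bufmgr.c): a hint-style |
 | |  * change is made while holding a pin and at least a share lock, then |
 | |  * reported with MarkBufferDirtyHint().  PD_PAGE_FULL is used here purely as |
 | |  * a stand-in for "some non-critical change"; the wrapper name is |
 | |  * hypothetical and 'true' says the page follows the standard layout. |
 | |  */ |
 | | static void |
 | | set_page_full_hint(Buffer buffer) |
 | | { |
 | |     Page        page = BufferGetPage(buffer); |
 | |
 | |     PageSetFull(page);              /* non-critical, hint-style update */ |
 | |     MarkBufferDirtyHint(buffer, true); |
 | | } |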
5556 | | |
5557 | | /* |
5558 | | * Release buffer content locks for shared buffers. |
5559 | | * |
5560 | | * Used to clean up after errors. |
5561 | | * |
5562 | | * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care |
5563 | | * of releasing buffer content locks per se; the only thing we need to deal |
5564 | | * with here is clearing any PIN_COUNT request that was in progress. |
5565 | | */ |
5566 | | void |
5567 | | UnlockBuffers(void) |
5568 | 0 | { |
5569 | 0 | BufferDesc *buf = PinCountWaitBuf; |
5570 | |
|
5571 | 0 | if (buf) |
5572 | 0 | { |
5573 | 0 | uint32 buf_state; |
5574 | |
|
5575 | 0 | buf_state = LockBufHdr(buf); |
5576 | | |
5577 | | /* |
5578 | | * Don't complain if the flag bit is not set; it could have been reset, |
5579 | | * but we got a cancel/die interrupt before getting the signal. |
5580 | | */ |
5581 | 0 | if ((buf_state & BM_PIN_COUNT_WAITER) != 0 && |
5582 | 0 | buf->wait_backend_pgprocno == MyProcNumber) |
5583 | 0 | buf_state &= ~BM_PIN_COUNT_WAITER; |
5584 | |
|
5585 | 0 | UnlockBufHdr(buf, buf_state); |
5586 | |
|
5587 | 0 | PinCountWaitBuf = NULL; |
5588 | 0 | } |
5589 | 0 | } |
5590 | | |
5591 | | /* |
5592 | | * Acquire or release the content_lock for the buffer. |
5593 | | */ |
5594 | | void |
5595 | | LockBuffer(Buffer buffer, int mode) |
5596 | 0 | { |
5597 | 0 | BufferDesc *buf; |
5598 | |
|
5599 | 0 | Assert(BufferIsPinned(buffer)); |
5600 | 0 | if (BufferIsLocal(buffer)) |
5601 | 0 | return; /* local buffers need no lock */ |
5602 | | |
5603 | 0 | buf = GetBufferDescriptor(buffer - 1); |
5604 | |
|
5605 | 0 | if (mode == BUFFER_LOCK_UNLOCK) |
5606 | 0 | LWLockRelease(BufferDescriptorGetContentLock(buf)); |
5607 | 0 | else if (mode == BUFFER_LOCK_SHARE) |
5608 | 0 | LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED); |
5609 | 0 | else if (mode == BUFFER_LOCK_EXCLUSIVE) |
5610 | 0 | LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE); |
5611 | 0 | else |
5612 | 0 | elog(ERROR, "unrecognized buffer lock mode: %d", mode); |
5613 | 0 | } |
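 | |
 | | /* |
 | |  * Illustrative sketch added in editing (not part of bufmgr.c): the canonical |
 | |  * pin -> lock -> inspect -> unlock-and-unpin sequence for reading one page. |
 | |  * The function name is hypothetical; the caller must hold an appropriate |
 | |  * relation-level lock on 'rel'. |
 | |  */ |
 | | static OffsetNumber |
 | | count_line_pointers(Relation rel, BlockNumber blkno) |
 | | { |
 | |     Buffer      buffer = ReadBuffer(rel, blkno); |
 | |     OffsetNumber maxoff; |
 | |
 | |     LockBuffer(buffer, BUFFER_LOCK_SHARE); |
 | |     maxoff = PageGetMaxOffsetNumber(BufferGetPage(buffer)); |
 | |     UnlockReleaseBuffer(buffer); |
 | |
 | |     return maxoff; |
 | | } |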
5614 | | |
5615 | | /* |
5616 | | * Acquire the content_lock for the buffer, but only if we don't have to wait. |
5617 | | * |
5618 | | * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode. |
5619 | | */ |
5620 | | bool |
5621 | | ConditionalLockBuffer(Buffer buffer) |
5622 | 0 | { |
5623 | 0 | BufferDesc *buf; |
5624 | |
|
5625 | 0 | Assert(BufferIsPinned(buffer)); |
5626 | 0 | if (BufferIsLocal(buffer)) |
5627 | 0 | return true; /* act as though we got it */ |
5628 | | |
5629 | 0 | buf = GetBufferDescriptor(buffer - 1); |
5630 | |
|
5631 | 0 | return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), |
5632 | 0 | LW_EXCLUSIVE); |
5633 | 0 | } |
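 | |
 | | /* |
 | |  * Illustrative sketch added in editing (not part of bufmgr.c): an |
 | |  * opportunistic caller does optional work only if the exclusive content lock |
 | |  * is free right now, and otherwise just moves on.  The function name and the |
 | |  * work done under the lock are hypothetical; 'buffer' must already be |
 | |  * pinned. |
 | |  */ |
 | | static bool |
 | | try_optional_page_work(Buffer buffer) |
 | | { |
 | |     if (!ConditionalLockBuffer(buffer)) |
 | |         return false;           /* someone else holds the lock; skip */ |
 | |
 | |     /* ... optional work on BufferGetPage(buffer) would go here ... */ |
 | |
 | |     LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
 | |     return true; |
 | | } |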
5634 | | |
5635 | | /* |
5636 | | * Verify that this backend is pinning the buffer exactly once. |
5637 | | * |
5638 | | * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend |
5639 | | * holds a pin on the buffer. We do not care whether some other backend does. |
5640 | | */ |
5641 | | void |
5642 | | CheckBufferIsPinnedOnce(Buffer buffer) |
5643 | 0 | { |
5644 | 0 | if (BufferIsLocal(buffer)) |
5645 | 0 | { |
5646 | 0 | if (LocalRefCount[-buffer - 1] != 1) |
5647 | 0 | elog(ERROR, "incorrect local pin count: %d", |
5648 | 0 | LocalRefCount[-buffer - 1]); |
5649 | 0 | } |
5650 | 0 | else |
5651 | 0 | { |
5652 | 0 | if (GetPrivateRefCount(buffer) != 1) |
5653 | 0 | elog(ERROR, "incorrect local pin count: %d", |
5654 | 0 | GetPrivateRefCount(buffer)); |
5655 | 0 | } |
5656 | 0 | } |
5657 | | |
5658 | | /* |
5659 | | * LockBufferForCleanup - lock a buffer in preparation for deleting items |
5660 | | * |
5661 | | * Items may be deleted from a disk page only when the caller (a) holds an |
5662 | | * exclusive lock on the buffer and (b) has observed that no other backend |
5663 | | * holds a pin on the buffer. If there is a pin, then the other backend |
5664 | | * might have a pointer into the buffer (for example, a heapscan reference |
5665 | | * to an item --- see README for more details). It's OK if a pin is added |
5666 | | * after the cleanup starts, however; the newly-arrived backend will be |
5667 | | * unable to look at the page until we release the exclusive lock. |
5668 | | * |
5669 | | * To implement this protocol, a would-be deleter must pin the buffer and |
5670 | | * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to |
5671 | | * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until |
5672 | | * it has successfully observed pin count = 1. |
5673 | | */ |
5674 | | void |
5675 | | LockBufferForCleanup(Buffer buffer) |
5676 | 0 | { |
5677 | 0 | BufferDesc *bufHdr; |
5678 | 0 | TimestampTz waitStart = 0; |
5679 | 0 | bool waiting = false; |
5680 | 0 | bool logged_recovery_conflict = false; |
5681 | |
|
5682 | 0 | Assert(BufferIsPinned(buffer)); |
5683 | 0 | Assert(PinCountWaitBuf == NULL); |
5684 | |
|
5685 | 0 | CheckBufferIsPinnedOnce(buffer); |
5686 | | |
5687 | | /* |
5688 | | * We do not yet need to be worried about in-progress AIOs holding a pin, |
5689 | | * as we, so far, only support doing reads via AIO and this function can |
5690 | | * only be called once the buffer is valid (i.e. no read can be in |
5691 | | * flight). |
5692 | | */ |
5693 | | |
5694 | | /* Nobody else to wait for */ |
5695 | 0 | if (BufferIsLocal(buffer)) |
5696 | 0 | return; |
5697 | | |
5698 | 0 | bufHdr = GetBufferDescriptor(buffer - 1); |
5699 | |
|
5700 | 0 | for (;;) |
5701 | 0 | { |
5702 | 0 | uint32 buf_state; |
5703 | | |
5704 | | /* Try to acquire lock */ |
5705 | 0 | LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); |
5706 | 0 | buf_state = LockBufHdr(bufHdr); |
5707 | |
|
5708 | 0 | Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); |
5709 | 0 | if (BUF_STATE_GET_REFCOUNT(buf_state) == 1) |
5710 | 0 | { |
5711 | | /* Successfully acquired exclusive lock with pincount 1 */ |
5712 | 0 | UnlockBufHdr(bufHdr, buf_state); |
5713 | | |
5714 | | /* |
5715 | | * Emit the log message if recovery conflict on buffer pin was |
5716 | | * resolved but the startup process waited longer than |
5717 | | * deadlock_timeout for it. |
5718 | | */ |
5719 | 0 | if (logged_recovery_conflict) |
5720 | 0 | LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, |
5721 | 0 | waitStart, GetCurrentTimestamp(), |
5722 | 0 | NULL, false); |
5723 | |
|
5724 | 0 | if (waiting) |
5725 | 0 | { |
5726 | | /* reset ps display to remove the suffix if we added one */ |
5727 | 0 | set_ps_display_remove_suffix(); |
5728 | 0 | waiting = false; |
5729 | 0 | } |
5730 | 0 | return; |
5731 | 0 | } |
5732 | | /* Failed, so mark myself as waiting for pincount 1 */ |
5733 | 0 | if (buf_state & BM_PIN_COUNT_WAITER) |
5734 | 0 | { |
5735 | 0 | UnlockBufHdr(bufHdr, buf_state); |
5736 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
5737 | 0 | elog(ERROR, "multiple backends attempting to wait for pincount 1"); |
5738 | 0 | } |
5739 | 0 | bufHdr->wait_backend_pgprocno = MyProcNumber; |
5740 | 0 | PinCountWaitBuf = bufHdr; |
5741 | 0 | buf_state |= BM_PIN_COUNT_WAITER; |
5742 | 0 | UnlockBufHdr(bufHdr, buf_state); |
5743 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
5744 | | |
5745 | | /* Wait to be signaled by UnpinBuffer() */ |
5746 | 0 | if (InHotStandby) |
5747 | 0 | { |
5748 | 0 | if (!waiting) |
5749 | 0 | { |
5750 | | /* adjust the process title to indicate that it's waiting */ |
5751 | 0 | set_ps_display_suffix("waiting"); |
5752 | 0 | waiting = true; |
5753 | 0 | } |
5754 | | |
5755 | | /* |
5756 | | * Emit the log message if the startup process is waiting longer |
5757 | | * than deadlock_timeout for recovery conflict on buffer pin. |
5758 | | * |
5759 | | * Skip this the first time through, because the startup process |
5760 | | * has not started waiting yet at that point; the wait start |
5761 | | * timestamp is set after this block. |
5762 | | */ |
5763 | 0 | if (waitStart != 0 && !logged_recovery_conflict) |
5764 | 0 | { |
5765 | 0 | TimestampTz now = GetCurrentTimestamp(); |
5766 | |
|
5767 | 0 | if (TimestampDifferenceExceeds(waitStart, now, |
5768 | 0 | DeadlockTimeout)) |
5769 | 0 | { |
5770 | 0 | LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, |
5771 | 0 | waitStart, now, NULL, true); |
5772 | 0 | logged_recovery_conflict = true; |
5773 | 0 | } |
5774 | 0 | } |
5775 | | |
5776 | | /* |
5777 | | * Set the wait start timestamp if logging is enabled and first |
5778 | | * time through. |
5779 | | */ |
5780 | 0 | if (log_recovery_conflict_waits && waitStart == 0) |
5781 | 0 | waitStart = GetCurrentTimestamp(); |
5782 | | |
5783 | | /* Publish the bufid that Startup process waits on */ |
5784 | 0 | SetStartupBufferPinWaitBufId(buffer - 1); |
5785 | | /* Set alarm and then wait to be signaled by UnpinBuffer() */ |
5786 | 0 | ResolveRecoveryConflictWithBufferPin(); |
5787 | | /* Reset the published bufid */ |
5788 | 0 | SetStartupBufferPinWaitBufId(-1); |
5789 | 0 | } |
5790 | 0 | else |
5791 | 0 | ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN); |
5792 | | |
5793 | | /* |
5794 | | * Remove flag marking us as waiter. Normally this will not be set |
5795 | | * anymore, but ProcWaitForSignal() can return for other signals as |
5796 | | * well. We take care to only reset the flag if we're the waiter, as |
5797 | | * theoretically another backend could have started waiting. That's |
5798 | | * impossible with the current usages due to table level locking, but |
5799 | | * better be safe. |
5800 | | */ |
5801 | 0 | buf_state = LockBufHdr(bufHdr); |
5802 | 0 | if ((buf_state & BM_PIN_COUNT_WAITER) != 0 && |
5803 | 0 | bufHdr->wait_backend_pgprocno == MyProcNumber) |
5804 | 0 | buf_state &= ~BM_PIN_COUNT_WAITER; |
5805 | 0 | UnlockBufHdr(bufHdr, buf_state); |
5806 | |
|
5807 | 0 | PinCountWaitBuf = NULL; |
5808 | | /* Loop back and try again */ |
5809 | 0 | } |
5810 | 0 | } |
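/*
 * Illustrative sketch of the protocol described above: a would-be deleter
 * pins the target page first and only then asks for the cleanup lock.
 * The relation and block number are placeholders.
 */
static void
example_cleanup_page(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);

    LockBufferForCleanup(buf);  /* returns once we hold the only pin */
    /* ... safe to remove dead items or defragment the page here ... */
    UnlockReleaseBuffer(buf);
}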
5811 | | |
5812 | | /* |
5813 | | * Check called from ProcessRecoveryConflictInterrupts() when Startup process |
5814 | | * requests cancellation of all pin holders that are blocking it. |
5815 | | */ |
5816 | | bool |
5817 | | HoldingBufferPinThatDelaysRecovery(void) |
5818 | 0 | { |
5819 | 0 | int bufid = GetStartupBufferPinWaitBufId(); |
5820 | | |
5821 | | /* |
5822 | | * If we get woken slowly then it's possible that the Startup process was |
5823 | | * already woken by other backends before we got here. Also possible that |
5824 | | * we get here by multiple interrupts or interrupts at inappropriate |
5825 | | * times, so make sure we do nothing if the bufid is not set. |
5826 | | */ |
5827 | 0 | if (bufid < 0) |
5828 | 0 | return false; |
5829 | | |
5830 | 0 | if (GetPrivateRefCount(bufid + 1) > 0) |
5831 | 0 | return true; |
5832 | | |
5833 | 0 | return false; |
5834 | 0 | } |
5835 | | |
5836 | | /* |
5837 | | * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock |
5838 | | * |
5839 | | * We won't loop, but just check once to see if the pin count is OK. If |
5840 | | * not, return false with no lock held. |
5841 | | */ |
5842 | | bool |
5843 | | ConditionalLockBufferForCleanup(Buffer buffer) |
5844 | 0 | { |
5845 | 0 | BufferDesc *bufHdr; |
5846 | 0 | uint32 buf_state, |
5847 | 0 | refcount; |
5848 | |
|
5849 | 0 | Assert(BufferIsValid(buffer)); |
5850 | | |
5851 | | /* see AIO related comment in LockBufferForCleanup() */ |
5852 | |
|
5853 | 0 | if (BufferIsLocal(buffer)) |
5854 | 0 | { |
5855 | 0 | refcount = LocalRefCount[-buffer - 1]; |
5856 | | /* There should be exactly one pin */ |
5857 | 0 | Assert(refcount > 0); |
5858 | 0 | if (refcount != 1) |
5859 | 0 | return false; |
5860 | | /* Nobody else to wait for */ |
5861 | 0 | return true; |
5862 | 0 | } |
5863 | | |
5864 | | /* There should be exactly one local pin */ |
5865 | 0 | refcount = GetPrivateRefCount(buffer); |
5866 | 0 | Assert(refcount); |
5867 | 0 | if (refcount != 1) |
5868 | 0 | return false; |
5869 | | |
5870 | | /* Try to acquire lock */ |
5871 | 0 | if (!ConditionalLockBuffer(buffer)) |
5872 | 0 | return false; |
5873 | | |
5874 | 0 | bufHdr = GetBufferDescriptor(buffer - 1); |
5875 | 0 | buf_state = LockBufHdr(bufHdr); |
5876 | 0 | refcount = BUF_STATE_GET_REFCOUNT(buf_state); |
5877 | |
|
5878 | 0 | Assert(refcount > 0); |
5879 | 0 | if (refcount == 1) |
5880 | 0 | { |
5881 | | /* Successfully acquired exclusive lock with pincount 1 */ |
5882 | 0 | UnlockBufHdr(bufHdr, buf_state); |
5883 | 0 | return true; |
5884 | 0 | } |
5885 | | |
5886 | | /* Failed, so release the lock */ |
5887 | 0 | UnlockBufHdr(bufHdr, buf_state); |
5888 | 0 | LockBuffer(buffer, BUFFER_LOCK_UNLOCK); |
5889 | 0 | return false; |
5890 | 0 | } |
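/*
 * Illustrative sketch: opportunistic callers (VACUUM-style code paths)
 * try the conditional variant and simply skip the page when the cleanup
 * lock is not immediately available.  The buffer is assumed to be pinned
 * exactly once by this backend.
 */
static bool
example_try_cleanup(Buffer buf)
{
    if (!ConditionalLockBufferForCleanup(buf))
        return false;           /* busy; the caller revisits the page later */

    /* ... perform cleanup on BufferGetPage(buf) ... */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    return true;
}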
5891 | | |
5892 | | /* |
5893 | | * IsBufferCleanupOK - as above, but we already have the lock |
5894 | | * |
5895 | | * Check whether it's OK to perform cleanup on a buffer we've already |
5896 | | * locked. If we observe that the pin count is 1, our exclusive lock |
5897 | | * happens to be a cleanup lock, and we can proceed with anything that |
5898 | | * would have been allowable had we sought a cleanup lock originally. |
5899 | | */ |
5900 | | bool |
5901 | | IsBufferCleanupOK(Buffer buffer) |
5902 | 0 | { |
5903 | 0 | BufferDesc *bufHdr; |
5904 | 0 | uint32 buf_state; |
5905 | |
|
5906 | 0 | Assert(BufferIsValid(buffer)); |
5907 | | |
5908 | | /* see AIO related comment in LockBufferForCleanup() */ |
5909 | |
|
5910 | 0 | if (BufferIsLocal(buffer)) |
5911 | 0 | { |
5912 | | /* There should be exactly one pin */ |
5913 | 0 | if (LocalRefCount[-buffer - 1] != 1) |
5914 | 0 | return false; |
5915 | | /* Nobody else to wait for */ |
5916 | 0 | return true; |
5917 | 0 | } |
5918 | | |
5919 | | /* There should be exactly one local pin */ |
5920 | 0 | if (GetPrivateRefCount(buffer) != 1) |
5921 | 0 | return false; |
5922 | | |
5923 | 0 | bufHdr = GetBufferDescriptor(buffer - 1); |
5924 | | |
5925 | | /* caller must hold exclusive lock on buffer */ |
5926 | 0 | Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr), |
5927 | 0 | LW_EXCLUSIVE)); |
5928 | |
|
5929 | 0 | buf_state = LockBufHdr(bufHdr); |
5930 | |
|
5931 | 0 | Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); |
5932 | 0 | if (BUF_STATE_GET_REFCOUNT(buf_state) == 1) |
5933 | 0 | { |
5934 | | /* pincount is OK. */ |
5935 | 0 | UnlockBufHdr(bufHdr, buf_state); |
5936 | 0 | return true; |
5937 | 0 | } |
5938 | | |
5939 | 0 | UnlockBufHdr(bufHdr, buf_state); |
5940 | 0 | return false; |
5941 | 0 | } |
5942 | | |
5943 | | |
5944 | | /* |
5945 | | * Functions for buffer I/O handling |
5946 | | * |
5947 | | * Also note that these are used only for shared buffers, not local ones. |
5948 | | */ |
5949 | | |
5950 | | /* |
5951 | | * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared. |
5952 | | */ |
5953 | | static void |
5954 | | WaitIO(BufferDesc *buf) |
5955 | 0 | { |
5956 | 0 | ConditionVariable *cv = BufferDescriptorGetIOCV(buf); |
5957 | |
|
5958 | 0 | ConditionVariablePrepareToSleep(cv); |
5959 | 0 | for (;;) |
5960 | 0 | { |
5961 | 0 | uint32 buf_state; |
5962 | 0 | PgAioWaitRef iow; |
5963 | | |
5964 | | /* |
5965 | | * It may not be necessary to acquire the spinlock to check the flag |
5966 | | * here, but since this test is essential for correctness, we'd better |
5967 | | * play it safe. |
5968 | | */ |
5969 | 0 | buf_state = LockBufHdr(buf); |
5970 | | |
5971 | | /* |
5972 | | * Copy the wait reference while holding the spinlock. This protects |
5973 | | * against a concurrent TerminateBufferIO() in another backend from |
5974 | | * clearing the wref while it's being read. |
5975 | | */ |
5976 | 0 | iow = buf->io_wref; |
5977 | 0 | UnlockBufHdr(buf, buf_state); |
5978 | | |
5979 | | /* no IO in progress, we don't need to wait */ |
5980 | 0 | if (!(buf_state & BM_IO_IN_PROGRESS)) |
5981 | 0 | break; |
5982 | | |
5983 | | /* |
5984 | | * The buffer has asynchronous IO in progress, wait for it to |
5985 | | * complete. |
5986 | | */ |
5987 | 0 | if (pgaio_wref_valid(&iow)) |
5988 | 0 | { |
5989 | 0 | pgaio_wref_wait(&iow); |
5990 | | |
5991 | | /* |
5992 | | * The AIO subsystem internally uses condition variables and thus |
5993 | | * might remove this backend from the BufferDesc's CV. While that |
5994 | | * wouldn't cause a correctness issue (the first CV sleep just |
5995 | | * immediately returns if not already registered), it seems worth |
5996 | | * avoiding unnecessary loop iterations, given that we take care |
5997 | | * to do so at the start of the function. |
5998 | | */ |
5999 | 0 | ConditionVariablePrepareToSleep(cv); |
6000 | 0 | continue; |
6001 | 0 | } |
6002 | | |
6003 | | /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */ |
6004 | 0 | ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO); |
6005 | 0 | } |
6006 | 0 | ConditionVariableCancelSleep(); |
6007 | 0 | } |
6008 | | |
6009 | | /* |
6010 | | * StartBufferIO: begin I/O on this buffer |
6011 | | * (Assumptions) |
6012 | | * My process is executing no IO on this buffer |
6013 | | * The buffer is Pinned |
6014 | | * |
6015 | | * In some scenarios multiple backends could attempt the same I/O operation |
6016 | | * concurrently. If someone else has already started I/O on this buffer then |
6017 | | * we will wait for completion of the IO using WaitIO(). |
6018 | | * |
6019 | | * Input operations are only attempted on buffers that are not BM_VALID, |
6020 | | * and output operations only on buffers that are BM_VALID and BM_DIRTY, |
6021 | | * so we can always tell if the work is already done. |
6022 | | * |
6023 | | * Returns true if we successfully marked the buffer as I/O busy, |
6024 | | * false if someone else already did the work. |
6025 | | * |
6026 | | * If nowait is true, then we don't wait for an I/O to be finished by another |
6027 | | * backend. In that case, false indicates either that the I/O was already |
6028 | | * finished, or is still in progress. This is useful for callers that want to |
6029 | | * find out if they can perform the I/O as part of a larger operation, without |
6030 | | * waiting for the answer or distinguishing the reasons why not. |
6031 | | */ |
6032 | | bool |
6033 | | StartBufferIO(BufferDesc *buf, bool forInput, bool nowait) |
6034 | 0 | { |
6035 | 0 | uint32 buf_state; |
6036 | |
|
6037 | 0 | ResourceOwnerEnlarge(CurrentResourceOwner); |
6038 | |
|
6039 | 0 | for (;;) |
6040 | 0 | { |
6041 | 0 | buf_state = LockBufHdr(buf); |
6042 | |
|
6043 | 0 | if (!(buf_state & BM_IO_IN_PROGRESS)) |
6044 | 0 | break; |
6045 | 0 | UnlockBufHdr(buf, buf_state); |
6046 | 0 | if (nowait) |
6047 | 0 | return false; |
6048 | 0 | WaitIO(buf); |
6049 | 0 | } |
6050 | | |
6051 | | /* Once we get here, there is definitely no I/O active on this buffer */ |
6052 | | |
6053 | | /* Check if someone else already did the I/O */ |
6054 | 0 | if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY)) |
6055 | 0 | { |
6056 | 0 | UnlockBufHdr(buf, buf_state); |
6057 | 0 | return false; |
6058 | 0 | } |
6059 | | |
6060 | 0 | buf_state |= BM_IO_IN_PROGRESS; |
6061 | 0 | UnlockBufHdr(buf, buf_state); |
6062 | |
|
6063 | 0 | ResourceOwnerRememberBufferIO(CurrentResourceOwner, |
6064 | 0 | BufferDescriptorGetBuffer(buf)); |
6065 | |
|
6066 | 0 | return true; |
6067 | 0 | } |
6068 | | |
6069 | | /* |
6070 | | * TerminateBufferIO: release a buffer we were doing I/O on |
6071 | | * (Assumptions) |
6072 | | * My process is executing IO for the buffer |
6073 | | * BM_IO_IN_PROGRESS bit is set for the buffer |
6074 | | * The buffer is Pinned |
6075 | | * |
6076 | | * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the |
6077 | | * buffer's BM_DIRTY flag. This is appropriate when terminating a |
6078 | | * successful write. The check on BM_JUST_DIRTIED is necessary to avoid |
6079 | | * marking the buffer clean if it was re-dirtied while we were writing. |
6080 | | * |
6081 | | * set_flag_bits gets ORed into the buffer's flags. It must include |
6082 | | * BM_IO_ERROR in a failure case. For successful completion it could |
6083 | | * be 0, or BM_VALID if we just finished reading in the page. |
6084 | | * |
6085 | | * If forget_owner is true, we release the buffer I/O from the current |
6086 | | * resource owner. (forget_owner=false is used when the resource owner itself |
6087 | | * is being released) |
6088 | | */ |
6089 | | void |
6090 | | TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, |
6091 | | bool forget_owner, bool release_aio) |
6092 | 0 | { |
6093 | 0 | uint32 buf_state; |
6094 | |
|
6095 | 0 | buf_state = LockBufHdr(buf); |
6096 | |
|
6097 | 0 | Assert(buf_state & BM_IO_IN_PROGRESS); |
6098 | 0 | buf_state &= ~BM_IO_IN_PROGRESS; |
6099 | | |
6100 | | /* Clear earlier errors; if this IO failed, it'll be marked again */ |
6101 | 0 | buf_state &= ~BM_IO_ERROR; |
6102 | |
|
6103 | 0 | if (clear_dirty && !(buf_state & BM_JUST_DIRTIED)) |
6104 | 0 | buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED); |
6105 | |
|
6106 | 0 | if (release_aio) |
6107 | 0 | { |
6108 | | /* release ownership by the AIO subsystem */ |
6109 | 0 | Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); |
6110 | 0 | buf_state -= BUF_REFCOUNT_ONE; |
6111 | 0 | pgaio_wref_clear(&buf->io_wref); |
6112 | 0 | } |
6113 | |
|
6114 | 0 | buf_state |= set_flag_bits; |
6115 | 0 | UnlockBufHdr(buf, buf_state); |
6116 | |
|
6117 | 0 | if (forget_owner) |
6118 | 0 | ResourceOwnerForgetBufferIO(CurrentResourceOwner, |
6119 | 0 | BufferDescriptorGetBuffer(buf)); |
6120 | |
|
6121 | 0 | ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf)); |
6122 | | |
6123 | | /* |
6124 | | * Support LockBufferForCleanup() |
6125 | | * |
6126 | | * We may have just released the last pin other than the waiter's. In most |
6127 | | * cases, this backend holds another pin on the buffer. But, if, for |
6128 | | * example, this backend is completing an IO issued by another backend, it |
6129 | | * may be time to wake the waiter. |
6130 | | */ |
6131 | 0 | if (release_aio && (buf_state & BM_PIN_COUNT_WAITER)) |
6132 | 0 | WakePinCountWaiter(buf); |
6133 | 0 | } |
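/*
 * Illustrative sketch of the StartBufferIO()/TerminateBufferIO() bracket
 * described above, loosely modelled on the shared-buffer write path.  The
 * caller is assumed to hold a pin and a share content lock; the smgr call,
 * checksumming and error handling are omitted.  This is not the real
 * FlushBuffer() implementation.
 */
static void
example_write_bracket(BufferDesc *buf)
{
    if (!StartBufferIO(buf, false, false))
        return;                 /* somebody else already wrote it out */

    /* ... write the page via smgr here (see FlushBuffer()) ... */

    /* clear BM_DIRTY unless the page was re-dirtied while being written */
    TerminateBufferIO(buf, true, 0, true, false);
}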
6134 | | |
6135 | | /* |
6136 | | * AbortBufferIO: Clean up active buffer I/O after an error. |
6137 | | * |
6138 | | * All LWLocks we might have held have been released, |
6139 | | * but we haven't yet released buffer pins, so the buffer is still pinned. |
6140 | | * |
6141 | | * If I/O was in progress, we always set BM_IO_ERROR, even though it's |
6142 | | * possible the error condition wasn't related to the I/O. |
6143 | | * |
6144 | | * Note: this does not remove the buffer I/O from the resource owner. |
6145 | | * That's correct when we're releasing the whole resource owner, but |
6146 | | * beware if you use this in other contexts. |
6147 | | */ |
6148 | | static void |
6149 | | AbortBufferIO(Buffer buffer) |
6150 | 0 | { |
6151 | 0 | BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1); |
6152 | 0 | uint32 buf_state; |
6153 | |
|
6154 | 0 | buf_state = LockBufHdr(buf_hdr); |
6155 | 0 | Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID)); |
6156 | |
|
6157 | 0 | if (!(buf_state & BM_VALID)) |
6158 | 0 | { |
6159 | 0 | Assert(!(buf_state & BM_DIRTY)); |
6160 | 0 | UnlockBufHdr(buf_hdr, buf_state); |
6161 | 0 | } |
6162 | 0 | else |
6163 | 0 | { |
6164 | 0 | Assert(buf_state & BM_DIRTY); |
6165 | 0 | UnlockBufHdr(buf_hdr, buf_state); |
6166 | | |
6167 | | /* Issue notice if this is not the first failure... */ |
6168 | 0 | if (buf_state & BM_IO_ERROR) |
6169 | 0 | { |
6170 | | /* Buffer is pinned, so we can read tag without spinlock */ |
6171 | 0 | ereport(WARNING, |
6172 | 0 | (errcode(ERRCODE_IO_ERROR), |
6173 | 0 | errmsg("could not write block %u of %s", |
6174 | 0 | buf_hdr->tag.blockNum, |
6175 | 0 | relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag), |
6176 | 0 | BufTagGetForkNum(&buf_hdr->tag)).str), |
6177 | 0 | errdetail("Multiple failures --- write error might be permanent."))); |
6178 | 0 | } |
6179 | 0 | } |
6180 | | |
6181 | 0 | TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false); |
6182 | 0 | } |
6183 | | |
6184 | | /* |
6185 | | * Error context callback for errors occurring during shared buffer writes. |
6186 | | */ |
6187 | | static void |
6188 | | shared_buffer_write_error_callback(void *arg) |
6189 | 0 | { |
6190 | 0 | BufferDesc *bufHdr = (BufferDesc *) arg; |
6191 | | |
6192 | | /* Buffer is pinned, so we can read the tag without locking the spinlock */ |
6193 | 0 | if (bufHdr != NULL) |
6194 | 0 | errcontext("writing block %u of relation %s", |
6195 | 0 | bufHdr->tag.blockNum, |
6196 | 0 | relpathperm(BufTagGetRelFileLocator(&bufHdr->tag), |
6197 | 0 | BufTagGetForkNum(&bufHdr->tag)).str); |
6198 | 0 | } |
6199 | | |
6200 | | /* |
6201 | | * Error context callback for errors occurring during local buffer writes. |
6202 | | */ |
6203 | | static void |
6204 | | local_buffer_write_error_callback(void *arg) |
6205 | 0 | { |
6206 | 0 | BufferDesc *bufHdr = (BufferDesc *) arg; |
6207 | |
|
6208 | 0 | if (bufHdr != NULL) |
6209 | 0 | errcontext("writing block %u of relation %s", |
6210 | 0 | bufHdr->tag.blockNum, |
6211 | 0 | relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag), |
6212 | 0 | MyProcNumber, |
6213 | 0 | BufTagGetForkNum(&bufHdr->tag)).str); |
6214 | 0 | } |
6215 | | |
6216 | | /* |
6217 | | * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals. |
6218 | | */ |
6219 | | static int |
6220 | | rlocator_comparator(const void *p1, const void *p2) |
6221 | 0 | { |
6222 | 0 | RelFileLocator n1 = *(const RelFileLocator *) p1; |
6223 | 0 | RelFileLocator n2 = *(const RelFileLocator *) p2; |
6224 | |
|
6225 | 0 | if (n1.relNumber < n2.relNumber) |
6226 | 0 | return -1; |
6227 | 0 | else if (n1.relNumber > n2.relNumber) |
6228 | 0 | return 1; |
6229 | | |
6230 | 0 | if (n1.dbOid < n2.dbOid) |
6231 | 0 | return -1; |
6232 | 0 | else if (n1.dbOid > n2.dbOid) |
6233 | 0 | return 1; |
6234 | | |
6235 | 0 | if (n1.spcOid < n2.spcOid) |
6236 | 0 | return -1; |
6237 | 0 | else if (n1.spcOid > n2.spcOid) |
6238 | 0 | return 1; |
6239 | 0 | else |
6240 | 0 | return 0; |
6241 | 0 | } |
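/*
 * Illustrative sketch: the comparator above is intended to be passed
 * directly to qsort()/bsearch(), as the relation-drop paths in this file
 * do.  The array of locators is a placeholder owned by the caller.
 */
static bool
example_locator_is_listed(const RelFileLocator *target,
                          RelFileLocator *locators, int nlocators)
{
    qsort(locators, nlocators, sizeof(RelFileLocator), rlocator_comparator);

    return bsearch(target, locators, nlocators, sizeof(RelFileLocator),
                   rlocator_comparator) != NULL;
}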
6242 | | |
6243 | | /* |
6244 | | * Lock buffer header - set BM_LOCKED in buffer state. |
6245 | | */ |
6246 | | uint32 |
6247 | | LockBufHdr(BufferDesc *desc) |
6248 | 0 | { |
6249 | 0 | SpinDelayStatus delayStatus; |
6250 | 0 | uint32 old_buf_state; |
6251 | |
|
6252 | 0 | Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc))); |
6253 | |
|
6254 | 0 | init_local_spin_delay(&delayStatus); |
6255 | |
|
6256 | 0 | while (true) |
6257 | 0 | { |
6258 | | /* set BM_LOCKED flag */ |
6259 | 0 | old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED); |
6260 | | /* if it wasn't set before we're OK */ |
6261 | 0 | if (!(old_buf_state & BM_LOCKED)) |
6262 | 0 | break; |
6263 | 0 | perform_spin_delay(&delayStatus); |
6264 | 0 | } |
6265 | 0 | finish_spin_delay(&delayStatus); |
6266 | 0 | return old_buf_state | BM_LOCKED; |
6267 | 0 | } |
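/*
 * Illustrative sketch: the header spinlock is only ever held for a short
 * lock/inspect/unlock bracket, with no other locks taken in between.
 * Works on shared buffers only, per the Assert in LockBufHdr().
 */
static uint32
example_read_refcount(BufferDesc *desc)
{
    uint32      buf_state = LockBufHdr(desc);
    uint32      refcount = BUF_STATE_GET_REFCOUNT(buf_state);

    UnlockBufHdr(desc, buf_state);
    return refcount;
}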
6268 | | |
6269 | | /* |
6270 | | * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's |
6271 | | * state at that point. |
6272 | | * |
6273 | | * Obviously the buffer could be locked by the time the value is returned, so |
6274 | | * this is primarily useful in CAS style loops. |
6275 | | */ |
6276 | | static uint32 |
6277 | | WaitBufHdrUnlocked(BufferDesc *buf) |
6278 | 0 | { |
6279 | 0 | SpinDelayStatus delayStatus; |
6280 | 0 | uint32 buf_state; |
6281 | |
|
6282 | 0 | init_local_spin_delay(&delayStatus); |
6283 | |
|
6284 | 0 | buf_state = pg_atomic_read_u32(&buf->state); |
6285 | |
|
6286 | 0 | while (buf_state & BM_LOCKED) |
6287 | 0 | { |
6288 | 0 | perform_spin_delay(&delayStatus); |
6289 | 0 | buf_state = pg_atomic_read_u32(&buf->state); |
6290 | 0 | } |
6291 | |
|
6292 | 0 | finish_spin_delay(&delayStatus); |
6293 | |
|
6294 | 0 | return buf_state; |
6295 | 0 | } |
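/*
 * Illustrative sketch of the CAS-style loop mentioned above, loosely
 * modelled on PinBuffer(): read the state, wait for BM_LOCKED to clear,
 * then try to install the updated value atomically, retrying on conflict.
 */
static void
example_bump_usage_count(BufferDesc *buf)
{
    uint32      old_buf_state = pg_atomic_read_u32(&buf->state);
    uint32      buf_state;

    for (;;)
    {
        if (old_buf_state & BM_LOCKED)
            old_buf_state = WaitBufHdrUnlocked(buf);

        buf_state = old_buf_state;
        if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
            buf_state += BUF_USAGECOUNT_ONE;

        /* on failure, old_buf_state is refreshed and we loop again */
        if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
                                           buf_state))
            break;
    }
}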
6296 | | |
6297 | | /* |
6298 | | * BufferTag comparator. |
6299 | | */ |
6300 | | static inline int |
6301 | | buffertag_comparator(const BufferTag *ba, const BufferTag *bb) |
6302 | 0 | { |
6303 | 0 | int ret; |
6304 | 0 | RelFileLocator rlocatora; |
6305 | 0 | RelFileLocator rlocatorb; |
6306 | |
|
6307 | 0 | rlocatora = BufTagGetRelFileLocator(ba); |
6308 | 0 | rlocatorb = BufTagGetRelFileLocator(bb); |
6309 | |
|
6310 | 0 | ret = rlocator_comparator(&rlocatora, &rlocatorb); |
6311 | |
|
6312 | 0 | if (ret != 0) |
6313 | 0 | return ret; |
6314 | | |
6315 | 0 | if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb)) |
6316 | 0 | return -1; |
6317 | 0 | if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb)) |
6318 | 0 | return 1; |
6319 | | |
6320 | 0 | if (ba->blockNum < bb->blockNum) |
6321 | 0 | return -1; |
6322 | 0 | if (ba->blockNum > bb->blockNum) |
6323 | 0 | return 1; |
6324 | | |
6325 | 0 | return 0; |
6326 | 0 | } |
6327 | | |
6328 | | /* |
6329 | | * Comparator determining the writeout order in a checkpoint. |
6330 | | * |
6331 | | * It is important that tablespaces are compared first, the logic balancing |
6332 | | * writes between tablespaces relies on it. |
6333 | | */ |
6334 | | static inline int |
6335 | | ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b) |
6336 | 0 | { |
6337 | | /* compare tablespace */ |
6338 | 0 | if (a->tsId < b->tsId) |
6339 | 0 | return -1; |
6340 | 0 | else if (a->tsId > b->tsId) |
6341 | 0 | return 1; |
6342 | | /* compare relation */ |
6343 | 0 | if (a->relNumber < b->relNumber) |
6344 | 0 | return -1; |
6345 | 0 | else if (a->relNumber > b->relNumber) |
6346 | 0 | return 1; |
6347 | | /* compare fork */ |
6348 | 0 | else if (a->forkNum < b->forkNum) |
6349 | 0 | return -1; |
6350 | 0 | else if (a->forkNum > b->forkNum) |
6351 | 0 | return 1; |
6352 | | /* compare block number */ |
6353 | 0 | else if (a->blockNum < b->blockNum) |
6354 | 0 | return -1; |
6355 | 0 | else if (a->blockNum > b->blockNum) |
6356 | 0 | return 1; |
6357 | | /* equal page IDs are unlikely, but not impossible */ |
6358 | 0 | return 0; |
6359 | 0 | } |
6360 | | |
6361 | | /* |
6362 | | * Comparator for a Min-Heap over the per-tablespace checkpoint completion |
6363 | | * progress. |
6364 | | */ |
6365 | | static int |
6366 | | ts_ckpt_progress_comparator(Datum a, Datum b, void *arg) |
6367 | 0 | { |
6368 | 0 | CkptTsStatus *sa = (CkptTsStatus *) DatumGetPointer(a); |
6369 | 0 | CkptTsStatus *sb = (CkptTsStatus *) DatumGetPointer(b); |
6370 | | |
6371 | | /* we want a min-heap, so return 1 when a < b */ |
6372 | 0 | if (sa->progress < sb->progress) |
6373 | 0 | return 1; |
6374 | 0 | else if (sa->progress == sb->progress) |
6375 | 0 | return 0; |
6376 | 0 | else |
6377 | 0 | return -1; |
6378 | 0 | } |
6379 | | |
6380 | | /* |
6381 | | * Initialize a writeback context, discarding potential previous state. |
6382 | | * |
6383 | | * *max_pending is a pointer instead of an immediate value, so the coalesce |
6384 | | * limits can easily be changed by the GUC mechanism, and so calling code does |
6385 | | * not have to check the current configuration. A value of 0 means that no |
6386 | | * writeback control will be performed. |
6387 | | */ |
6388 | | void |
6389 | | WritebackContextInit(WritebackContext *context, int *max_pending) |
6390 | 0 | { |
6391 | 0 | Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES); |
6392 | |
|
6393 | 0 | context->max_pending = max_pending; |
6394 | 0 | context->nr_pending = 0; |
6395 | 0 | } |
6396 | | |
6397 | | /* |
6398 | | * Add buffer to list of pending writeback requests. |
6399 | | */ |
6400 | | void |
6401 | | ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, |
6402 | | BufferTag *tag) |
6403 | 0 | { |
6404 | 0 | PendingWriteback *pending; |
6405 | | |
6406 | | /* |
6407 | | * As pg_flush_data() doesn't do anything with fsync disabled, there's no |
6408 | | * point in tracking writebacks in that case. |
6409 | | */ |
6410 | 0 | if (io_direct_flags & IO_DIRECT_DATA || |
6411 | 0 | !enableFsync) |
6412 | 0 | return; |
6413 | | |
6414 | | /* |
6415 | | * Add buffer to the pending writeback array, unless writeback control is |
6416 | | * disabled. |
6417 | | */ |
6418 | 0 | if (*wb_context->max_pending > 0) |
6419 | 0 | { |
6420 | 0 | Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES); |
6421 | |
|
6422 | 0 | pending = &wb_context->pending_writebacks[wb_context->nr_pending++]; |
6423 | |
|
6424 | 0 | pending->tag = *tag; |
6425 | 0 | } |
6426 | | |
6427 | | /* |
6428 | | * Perform pending flushes if the writeback limit is exceeded. This |
6429 | | * includes the case where previously an item has been added, but control |
6430 | | * is now disabled. |
6431 | | */ |
6432 | 0 | if (wb_context->nr_pending >= *wb_context->max_pending) |
6433 | 0 | IssuePendingWritebacks(wb_context, io_context); |
6434 | 0 | } |
6435 | | |
6436 | 0 | #define ST_SORT sort_pending_writebacks |
6437 | 0 | #define ST_ELEMENT_TYPE PendingWriteback |
6438 | 0 | #define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag) |
6439 | | #define ST_SCOPE static |
6440 | | #define ST_DEFINE |
6441 | | #include "lib/sort_template.h" |
6442 | | |
6443 | | /* |
6444 | | * Issue all pending writeback requests, previously scheduled with |
6445 | | * ScheduleBufferTagForWriteback, to the OS. |
6446 | | * |
6447 | | * Because this is only used to improve the OS's IO scheduling, we try to never |
6448 | | * error out - it's just a hint. |
6449 | | */ |
6450 | | void |
6451 | | IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context) |
6452 | 0 | { |
6453 | 0 | instr_time io_start; |
6454 | 0 | int i; |
6455 | |
|
6456 | 0 | if (wb_context->nr_pending == 0) |
6457 | 0 | return; |
6458 | | |
6459 | | /* |
6460 | | * Executing the writes in-order can make them a lot faster, and allows us to |
6461 | | * merge writeback requests to consecutive blocks into larger writebacks. |
6462 | | */ |
6463 | 0 | sort_pending_writebacks(wb_context->pending_writebacks, |
6464 | 0 | wb_context->nr_pending); |
6465 | |
|
6466 | 0 | io_start = pgstat_prepare_io_time(track_io_timing); |
6467 | | |
6468 | | /* |
6469 | | * Coalesce neighbouring writes, but nothing else. For that we iterate |
6470 | | * through the now-sorted array of pending flushes, and look forward to |
6471 | | * find all neighbouring (or identical) writes. |
6472 | | */ |
6473 | 0 | for (i = 0; i < wb_context->nr_pending; i++) |
6474 | 0 | { |
6475 | 0 | PendingWriteback *cur; |
6476 | 0 | PendingWriteback *next; |
6477 | 0 | SMgrRelation reln; |
6478 | 0 | int ahead; |
6479 | 0 | BufferTag tag; |
6480 | 0 | RelFileLocator currlocator; |
6481 | 0 | Size nblocks = 1; |
6482 | |
|
6483 | 0 | cur = &wb_context->pending_writebacks[i]; |
6484 | 0 | tag = cur->tag; |
6485 | 0 | currlocator = BufTagGetRelFileLocator(&tag); |
6486 | | |
6487 | | /* |
6488 | | * Peek ahead, into following writeback requests, to see if they can |
6489 | | * be combined with the current one. |
6490 | | */ |
6491 | 0 | for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++) |
6492 | 0 | { |
6493 | |
|
6494 | 0 | next = &wb_context->pending_writebacks[i + ahead + 1]; |
6495 | | |
6496 | | /* different file, stop */ |
6497 | 0 | if (!RelFileLocatorEquals(currlocator, |
6498 | 0 | BufTagGetRelFileLocator(&next->tag)) || |
6499 | 0 | BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag)) |
6500 | 0 | break; |
6501 | | |
6502 | | /* ok, block queued twice, skip */ |
6503 | 0 | if (cur->tag.blockNum == next->tag.blockNum) |
6504 | 0 | continue; |
6505 | | |
6506 | | /* only merge consecutive writes */ |
6507 | 0 | if (cur->tag.blockNum + 1 != next->tag.blockNum) |
6508 | 0 | break; |
6509 | | |
6510 | 0 | nblocks++; |
6511 | 0 | cur = next; |
6512 | 0 | } |
6513 | |
|
6514 | 0 | i += ahead; |
6515 | | |
6516 | | /* and finally tell the kernel to write the data to storage */ |
6517 | 0 | reln = smgropen(currlocator, INVALID_PROC_NUMBER); |
6518 | 0 | smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks); |
6519 | 0 | } |
6520 | | |
6521 | | /* |
6522 | | * Assume that writeback requests are only issued for buffers containing |
6523 | | * blocks of permanent relations. |
6524 | | */ |
6525 | 0 | pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, |
6526 | 0 | IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0); |
6527 | |
|
6528 | 0 | wb_context->nr_pending = 0; |
6529 | 0 | } |
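/*
 * Illustrative sketch of the writeback-context lifecycle used by the
 * checkpointer and bgwriter: initialise once, schedule one tag per written
 * buffer, and flush whatever is still pending at the end of the round.
 * The GUC pointer and the tag array are placeholders.
 */
static void
example_writeback_round(int *flush_after_guc, BufferTag *tags, int ntags)
{
    WritebackContext wb_context;

    WritebackContextInit(&wb_context, flush_after_guc);

    for (int i = 0; i < ntags; i++)
        ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL,
                                      &tags[i]);

    /* hint the kernel about anything still queued */
    IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
}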
6530 | | |
6531 | | /* ResourceOwner callbacks */ |
6532 | | |
6533 | | static void |
6534 | | ResOwnerReleaseBufferIO(Datum res) |
6535 | 0 | { |
6536 | 0 | Buffer buffer = DatumGetInt32(res); |
6537 | |
|
6538 | 0 | AbortBufferIO(buffer); |
6539 | 0 | } |
6540 | | |
6541 | | static char * |
6542 | | ResOwnerPrintBufferIO(Datum res) |
6543 | 0 | { |
6544 | 0 | Buffer buffer = DatumGetInt32(res); |
6545 | |
|
6546 | 0 | return psprintf("lost track of buffer IO on buffer %d", buffer); |
6547 | 0 | } |
6548 | | |
6549 | | static void |
6550 | | ResOwnerReleaseBufferPin(Datum res) |
6551 | 0 | { |
6552 | 0 | Buffer buffer = DatumGetInt32(res); |
6553 | | |
6554 | | /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */ |
6555 | 0 | if (!BufferIsValid(buffer)) |
6556 | 0 | elog(ERROR, "bad buffer ID: %d", buffer); |
6557 | | |
6558 | 0 | if (BufferIsLocal(buffer)) |
6559 | 0 | UnpinLocalBufferNoOwner(buffer); |
6560 | 0 | else |
6561 | 0 | UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1)); |
6562 | 0 | } |
6563 | | |
6564 | | static char * |
6565 | | ResOwnerPrintBufferPin(Datum res) |
6566 | 0 | { |
6567 | 0 | return DebugPrintBufferRefcount(DatumGetInt32(res)); |
6568 | 0 | } |
6569 | | |
6570 | | /* |
6571 | | * Helper function to evict unpinned buffer whose buffer header lock is |
6572 | | * already acquired. |
6573 | | */ |
6574 | | static bool |
6575 | | EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed) |
6576 | 0 | { |
6577 | 0 | uint32 buf_state; |
6578 | 0 | bool result; |
6579 | |
|
6580 | 0 | *buffer_flushed = false; |
6581 | |
|
6582 | 0 | buf_state = pg_atomic_read_u32(&(desc->state)); |
6583 | 0 | Assert(buf_state & BM_LOCKED); |
6584 | |
|
6585 | 0 | if ((buf_state & BM_VALID) == 0) |
6586 | 0 | { |
6587 | 0 | UnlockBufHdr(desc, buf_state); |
6588 | 0 | return false; |
6589 | 0 | } |
6590 | | |
6591 | | /* Check that it's not pinned already. */ |
6592 | 0 | if (BUF_STATE_GET_REFCOUNT(buf_state) > 0) |
6593 | 0 | { |
6594 | 0 | UnlockBufHdr(desc, buf_state); |
6595 | 0 | return false; |
6596 | 0 | } |
6597 | | |
6598 | 0 | PinBuffer_Locked(desc); /* releases spinlock */ |
6599 | | |
6600 | | /* If it was dirty, try to clean it once. */ |
6601 | 0 | if (buf_state & BM_DIRTY) |
6602 | 0 | { |
6603 | 0 | LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_SHARED); |
6604 | 0 | FlushBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL); |
6605 | 0 | *buffer_flushed = true; |
6606 | 0 | LWLockRelease(BufferDescriptorGetContentLock(desc)); |
6607 | 0 | } |
6608 | | |
6609 | | /* This will return false if it becomes dirty or someone else pins it. */ |
6610 | 0 | result = InvalidateVictimBuffer(desc); |
6611 | |
|
6612 | 0 | UnpinBuffer(desc); |
6613 | |
|
6614 | 0 | return result; |
6615 | 0 | } |
6616 | | |
6617 | | /* |
6618 | | * Try to evict the current block in a shared buffer. |
6619 | | * |
6620 | | * This function is intended for testing/development use only! |
6621 | | * |
6622 | | * To succeed, the buffer must not be pinned on entry, so if the caller had a |
6623 | | * particular block in mind, it might already have been replaced by some other |
6624 | | * block by the time this function runs. It's also unpinned on return, so the |
6625 | | * buffer might be occupied again by the time control is returned, potentially |
6626 | | * even by the same block. This inherent raciness without other interlocking |
6627 | | * makes the function unsuitable for non-testing usage. |
6628 | | * |
6629 | | * *buffer_flushed is set to true if the buffer was dirty and has been |
6630 | | * flushed, false otherwise. However, *buffer_flushed=true does not |
6631 | | * necessarily mean that we flushed the buffer; it could have been flushed by |
6632 | | * someone else. |
6633 | | * |
6634 | | * Returns true if the buffer was valid and it has now been made invalid. |
6635 | | * Returns false if it wasn't valid, if it couldn't be evicted due to a pin, |
6636 | | * or if the buffer becomes dirty again while we're trying to write it out. |
6637 | | */ |
6638 | | bool |
6639 | | EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed) |
6640 | 0 | { |
6641 | 0 | BufferDesc *desc; |
6642 | |
|
6643 | 0 | Assert(BufferIsValid(buf) && !BufferIsLocal(buf)); |
6644 | | |
6645 | | /* Make sure we can pin the buffer. */ |
6646 | 0 | ResourceOwnerEnlarge(CurrentResourceOwner); |
6647 | 0 | ReservePrivateRefCountEntry(); |
6648 | |
|
6649 | 0 | desc = GetBufferDescriptor(buf - 1); |
6650 | 0 | LockBufHdr(desc); |
6651 | |
|
6652 | 0 | return EvictUnpinnedBufferInternal(desc, buffer_flushed); |
6653 | 0 | } |
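/*
 * Illustrative sketch: a testing-only caller (such as a pg_buffercache
 * style extension function) drives the eviction helper above one shared
 * buffer at a time.  The range is assumed to contain valid shared buffer
 * IDs.
 */
static int
example_evict_range(Buffer first, Buffer last)
{
    int         evicted = 0;

    for (Buffer buf = first; buf <= last; buf++)
    {
        bool        flushed;

        if (EvictUnpinnedBuffer(buf, &flushed))
            evicted++;
    }
    return evicted;
}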
6654 | | |
6655 | | /* |
6656 | | * Try to evict all the shared buffers. |
6657 | | * |
6658 | | * This function is intended for testing/development use only! See |
6659 | | * EvictUnpinnedBuffer(). |
6660 | | * |
6661 | | * The buffers_* parameters are mandatory and indicate the total count of |
6662 | | * buffers that: |
6663 | | * - buffers_evicted - were evicted |
6664 | | * - buffers_flushed - were flushed |
6665 | | * - buffers_skipped - could not be evicted |
6666 | | */ |
6667 | | void |
6668 | | EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed, |
6669 | | int32 *buffers_skipped) |
6670 | 0 | { |
6671 | 0 | *buffers_evicted = 0; |
6672 | 0 | *buffers_skipped = 0; |
6673 | 0 | *buffers_flushed = 0; |
6674 | |
|
6675 | 0 | for (int buf = 1; buf <= NBuffers; buf++) |
6676 | 0 | { |
6677 | 0 | BufferDesc *desc = GetBufferDescriptor(buf - 1); |
6678 | 0 | uint32 buf_state; |
6679 | 0 | bool buffer_flushed; |
6680 | |
|
6681 | 0 | buf_state = pg_atomic_read_u32(&desc->state); |
6682 | 0 | if (!(buf_state & BM_VALID)) |
6683 | 0 | continue; |
6684 | | |
6685 | 0 | ResourceOwnerEnlarge(CurrentResourceOwner); |
6686 | 0 | ReservePrivateRefCountEntry(); |
6687 | |
|
6688 | 0 | LockBufHdr(desc); |
6689 | |
|
6690 | 0 | if (EvictUnpinnedBufferInternal(desc, &buffer_flushed)) |
6691 | 0 | (*buffers_evicted)++; |
6692 | 0 | else |
6693 | 0 | (*buffers_skipped)++; |
6694 | |
|
6695 | 0 | if (buffer_flushed) |
6696 | 0 | (*buffers_flushed)++; |
6697 | 0 | } |
6698 | 0 | } |
6699 | | |
6700 | | /* |
6701 | | * Try to evict all the shared buffers containing provided relation's pages. |
6702 | | * |
6703 | | * This function is intended for testing/development use only! See |
6704 | | * EvictUnpinnedBuffer(). |
6705 | | * |
6706 | | * The caller must hold at least AccessShareLock on the relation to prevent |
6707 | | * the relation from being dropped. |
6708 | | * |
6709 | | * The buffers_* parameters are mandatory and indicate the total count of |
6710 | | * buffers that: |
6711 | | * - buffers_evicted - were evicted |
6712 | | * - buffers_flushed - were flushed |
6713 | | * - buffers_skipped - could not be evicted |
6714 | | */ |
6715 | | void |
6716 | | EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, |
6717 | | int32 *buffers_flushed, int32 *buffers_skipped) |
6718 | 0 | { |
6719 | 0 | Assert(!RelationUsesLocalBuffers(rel)); |
6720 | |
|
6721 | 0 | *buffers_skipped = 0; |
6722 | 0 | *buffers_evicted = 0; |
6723 | 0 | *buffers_flushed = 0; |
6724 | |
|
6725 | 0 | for (int buf = 1; buf <= NBuffers; buf++) |
6726 | 0 | { |
6727 | 0 | BufferDesc *desc = GetBufferDescriptor(buf - 1); |
6728 | 0 | uint32 buf_state = pg_atomic_read_u32(&(desc->state)); |
6729 | 0 | bool buffer_flushed; |
6730 | | |
6731 | | /* An unlocked precheck should be safe and saves some cycles. */ |
6732 | 0 | if ((buf_state & BM_VALID) == 0 || |
6733 | 0 | !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator)) |
6734 | 0 | continue; |
6735 | | |
6736 | | /* Make sure we can pin the buffer. */ |
6737 | 0 | ResourceOwnerEnlarge(CurrentResourceOwner); |
6738 | 0 | ReservePrivateRefCountEntry(); |
6739 | |
|
6740 | 0 | buf_state = LockBufHdr(desc); |
6741 | | |
6742 | | /* recheck, could have changed without the lock */ |
6743 | 0 | if ((buf_state & BM_VALID) == 0 || |
6744 | 0 | !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator)) |
6745 | 0 | { |
6746 | 0 | UnlockBufHdr(desc, buf_state); |
6747 | 0 | continue; |
6748 | 0 | } |
6749 | | |
6750 | 0 | if (EvictUnpinnedBufferInternal(desc, &buffer_flushed)) |
6751 | 0 | (*buffers_evicted)++; |
6752 | 0 | else |
6753 | 0 | (*buffers_skipped)++; |
6754 | |
|
6755 | 0 | if (buffer_flushed) |
6756 | 0 | (*buffers_flushed)++; |
6757 | 0 | } |
6758 | 0 | } |
6759 | | |
6760 | | /* |
6761 | | * Generic implementation of the AIO handle staging callback for readv/writev |
6762 | | * on local/shared buffers. |
6763 | | * |
6764 | | * Each readv/writev can target multiple buffers. The buffers have already |
6765 | | * been registered with the IO handle. |
6766 | | * |
6767 | | * To make the IO ready for execution ("staging"), we need to ensure that the |
6768 | | * targeted buffers are in an appropriate state while the IO is ongoing. For |
6769 | | * that the AIO subsystem needs to have its own buffer pin, otherwise an error |
6770 | | * in this backend could lead to this backend's buffer pin being released as |
6771 | | * part of error handling, which in turn could lead to the buffer being |
6772 | | * replaced while IO is ongoing. |
6773 | | */ |
6774 | | static pg_attribute_always_inline void |
6775 | | buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp) |
6776 | 0 | { |
6777 | 0 | uint64 *io_data; |
6778 | 0 | uint8 handle_data_len; |
6779 | 0 | PgAioWaitRef io_ref; |
6780 | 0 | BufferTag first PG_USED_FOR_ASSERTS_ONLY = {0}; |
6781 | |
|
6782 | 0 | io_data = pgaio_io_get_handle_data(ioh, &handle_data_len); |
6783 | |
|
6784 | 0 | pgaio_io_get_wref(ioh, &io_ref); |
6785 | | |
6786 | | /* iterate over all buffers affected by the vectored readv/writev */ |
6787 | 0 | for (int i = 0; i < handle_data_len; i++) |
6788 | 0 | { |
6789 | 0 | Buffer buffer = (Buffer) io_data[i]; |
6790 | 0 | BufferDesc *buf_hdr = is_temp ? |
6791 | 0 | GetLocalBufferDescriptor(-buffer - 1) |
6792 | 0 | : GetBufferDescriptor(buffer - 1); |
6793 | 0 | uint32 buf_state; |
6794 | | |
6795 | | /* |
6796 | | * Check that all the buffers are actually ones that could conceivably |
6797 | | * be done in one IO, i.e. are sequential. This is the last |
6798 | | * buffer-aware code before IO is actually executed and confusion |
6799 | | * about which buffers are targeted by IO can be hard to debug, making |
6800 | | * it worth doing extra-paranoid checks. |
6801 | | */ |
6802 | 0 | if (i == 0) |
6803 | 0 | first = buf_hdr->tag; |
6804 | 0 | else |
6805 | 0 | { |
6806 | 0 | Assert(buf_hdr->tag.relNumber == first.relNumber); |
6807 | 0 | Assert(buf_hdr->tag.blockNum == first.blockNum + i); |
6808 | 0 | } |
6809 | |
|
6810 | 0 | if (is_temp) |
6811 | 0 | buf_state = pg_atomic_read_u32(&buf_hdr->state); |
6812 | 0 | else |
6813 | 0 | buf_state = LockBufHdr(buf_hdr); |
6814 | | |
6815 | | /* verify the buffer is in the expected state */ |
6816 | 0 | Assert(buf_state & BM_TAG_VALID); |
6817 | 0 | if (is_write) |
6818 | 0 | { |
6819 | 0 | Assert(buf_state & BM_VALID); |
6820 | 0 | Assert(buf_state & BM_DIRTY); |
6821 | 0 | } |
6822 | 0 | else |
6823 | 0 | { |
6824 | 0 | Assert(!(buf_state & BM_VALID)); |
6825 | 0 | Assert(!(buf_state & BM_DIRTY)); |
6826 | 0 | } |
6827 | | |
6828 | | /* temp buffers don't use BM_IO_IN_PROGRESS */ |
6829 | 0 | if (!is_temp) |
6830 | 0 | Assert(buf_state & BM_IO_IN_PROGRESS); |
6831 | |
|
6832 | 0 | Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1); |
6833 | | |
6834 | | /* |
6835 | | * Reflect that the buffer is now owned by the AIO subsystem. |
6836 | | * |
6837 | | * For local buffers: This can't be done just via LocalRefCount, as |
6838 | | * one might initially think, as this backend could error out while |
6839 | | * AIO is still in progress, releasing all the pins by the backend |
6840 | | * itself. |
6841 | | * |
6842 | | * This pin is released again in TerminateBufferIO(). |
6843 | | */ |
6844 | 0 | buf_state += BUF_REFCOUNT_ONE; |
6845 | 0 | buf_hdr->io_wref = io_ref; |
6846 | |
|
6847 | 0 | if (is_temp) |
6848 | 0 | pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state); |
6849 | 0 | else |
6850 | 0 | UnlockBufHdr(buf_hdr, buf_state); |
6851 | | |
6852 | | /* |
6853 | | * Ensure the content lock that prevents buffer modifications while |
6854 | | * the buffer is being written out is not released early due to an |
6855 | | * error. |
6856 | | */ |
6857 | 0 | if (is_write && !is_temp) |
6858 | 0 | { |
6859 | 0 | LWLock *content_lock; |
6860 | |
|
6861 | 0 | content_lock = BufferDescriptorGetContentLock(buf_hdr); |
6862 | |
|
6863 | 0 | Assert(LWLockHeldByMe(content_lock)); |
6864 | | |
6865 | | /* |
6866 | | * Lock is now owned by AIO subsystem. |
6867 | | */ |
6868 | 0 | LWLockDisown(content_lock); |
6869 | 0 | } |
6870 | | |
6871 | | /* |
6872 | | * Stop tracking this buffer via the resowner - the AIO system now |
6873 | | * keeps track. |
6874 | | */ |
6875 | 0 | if (!is_temp) |
6876 | 0 | ResourceOwnerForgetBufferIO(CurrentResourceOwner, buffer); |
6877 | 0 | } |
6878 | 0 | } |
6879 | | |
6880 | | /* |
6881 | | * Decode readv errors as encoded by buffer_readv_encode_error(). |
6882 | | */ |
6883 | | static inline void |
6884 | | buffer_readv_decode_error(PgAioResult result, |
6885 | | bool *zeroed_any, |
6886 | | bool *ignored_any, |
6887 | | uint8 *zeroed_or_error_count, |
6888 | | uint8 *checkfail_count, |
6889 | | uint8 *first_off) |
6890 | 0 | { |
6891 | 0 | uint32 rem_error = result.error_data; |
6892 | | |
6893 | | /* see static asserts in buffer_readv_encode_error */ |
6894 | 0 | #define READV_COUNT_BITS 7 |
6895 | 0 | #define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1) |
6896 | |
|
6897 | 0 | *zeroed_any = rem_error & 1; |
6898 | 0 | rem_error >>= 1; |
6899 | |
|
6900 | 0 | *ignored_any = rem_error & 1; |
6901 | 0 | rem_error >>= 1; |
6902 | |
|
6903 | 0 | *zeroed_or_error_count = rem_error & READV_COUNT_MASK; |
6904 | 0 | rem_error >>= READV_COUNT_BITS; |
6905 | |
|
6906 | 0 | *checkfail_count = rem_error & READV_COUNT_MASK; |
6907 | 0 | rem_error >>= READV_COUNT_BITS; |
6908 | |
|
6909 | 0 | *first_off = rem_error & READV_COUNT_MASK; |
6910 | 0 | rem_error >>= READV_COUNT_BITS; |
6911 | 0 | } |
6912 | | |
6913 | | /* |
6914 | | * Helper to encode errors for buffer_readv_complete() |
6915 | | * |
6916 | | * Errors are encoded as follows: |
6917 | | * - bit 0 indicates whether any page was zeroed (1) or not (0) |
6918 | | * - bit 1 indicates whether any checksum failure was ignored (1) or not (0) |
6919 | | * - next READV_COUNT_BITS bits indicate the number of errored or zeroed pages |
6920 | | * - next READV_COUNT_BITS bits indicate the number of checksum failures |
6921 | | * - next READV_COUNT_BITS bits indicate the first offset of the first page |
6922 | | * that was errored or zeroed or, if no errors/zeroes, the first ignored |
6923 | | * checksum |
6924 | | */ |
6925 | | static inline void |
6926 | | buffer_readv_encode_error(PgAioResult *result, |
6927 | | bool is_temp, |
6928 | | bool zeroed_any, |
6929 | | bool ignored_any, |
6930 | | uint8 error_count, |
6931 | | uint8 zeroed_count, |
6932 | | uint8 checkfail_count, |
6933 | | uint8 first_error_off, |
6934 | | uint8 first_zeroed_off, |
6935 | | uint8 first_ignored_off) |
6936 | 0 | { |
6937 | |
|
6938 | 0 | uint8 shift = 0; |
6939 | 0 | uint8 zeroed_or_error_count = |
6940 | 0 | error_count > 0 ? error_count : zeroed_count; |
6941 | 0 | uint8 first_off; |
6942 | |
|
6943 | 0 | StaticAssertStmt(PG_IOV_MAX <= 1 << READV_COUNT_BITS, |
6944 | 0 | "PG_IOV_MAX is bigger than reserved space for error data"); |
6945 | 0 | StaticAssertStmt((1 + 1 + 3 * READV_COUNT_BITS) <= PGAIO_RESULT_ERROR_BITS, |
6946 | 0 | "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv"); |
6947 | | |
6948 | | /* |
6949 | | * We only have space to encode one offset - but luckily that's good |
6950 | | * enough. If there is an error, the error is the interesting offset; the |
6951 | | * same goes for a zeroed buffer vs an ignored buffer. |
6952 | | */ |
6953 | 0 | if (error_count > 0) |
6954 | 0 | first_off = first_error_off; |
6955 | 0 | else if (zeroed_count > 0) |
6956 | 0 | first_off = first_zeroed_off; |
6957 | 0 | else |
6958 | 0 | first_off = first_ignored_off; |
6959 | |
|
6960 | 0 | Assert(!zeroed_any || error_count == 0); |
6961 | |
|
6962 | 0 | result->error_data = 0; |
6963 | |
|
6964 | 0 | result->error_data |= zeroed_any << shift; |
6965 | 0 | shift += 1; |
6966 | |
|
6967 | 0 | result->error_data |= ignored_any << shift; |
6968 | 0 | shift += 1; |
6969 | |
|
6970 | 0 | result->error_data |= ((uint32) zeroed_or_error_count) << shift; |
6971 | 0 | shift += READV_COUNT_BITS; |
6972 | |
|
6973 | 0 | result->error_data |= ((uint32) checkfail_count) << shift; |
6974 | 0 | shift += READV_COUNT_BITS; |
6975 | |
|
6976 | 0 | result->error_data |= ((uint32) first_off) << shift; |
6977 | 0 | shift += READV_COUNT_BITS; |
6978 | |
|
6979 | 0 | result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV : |
6980 | 0 | PGAIO_HCB_SHARED_BUFFER_READV; |
6981 | |
|
6982 | 0 | if (error_count > 0) |
6983 | 0 | result->status = PGAIO_RS_ERROR; |
6984 | 0 | else |
6985 | 0 | result->status = PGAIO_RS_WARNING; |
6986 | | |
6987 | | /* |
6988 | | * The encoding is complicated enough to warrant cross-checking it against |
6989 | | * the decode function. |
6990 | | */ |
6991 | | #ifdef USE_ASSERT_CHECKING |
6992 | | { |
6993 | | bool zeroed_any_2, |
6994 | | ignored_any_2; |
6995 | | uint8 zeroed_or_error_count_2, |
6996 | | checkfail_count_2, |
6997 | | first_off_2; |
6998 | | |
6999 | | buffer_readv_decode_error(*result, |
7000 | | &zeroed_any_2, &ignored_any_2, |
7001 | | &zeroed_or_error_count_2, |
7002 | | &checkfail_count_2, |
7003 | | &first_off_2); |
7004 | | Assert(zeroed_any == zeroed_any_2); |
7005 | | Assert(ignored_any == ignored_any_2); |
7006 | | Assert(zeroed_or_error_count == zeroed_or_error_count_2); |
7007 | | Assert(checkfail_count == checkfail_count_2); |
7008 | | Assert(first_off == first_off_2); |
7009 | | } |
7010 | | #endif |
7011 | |
|
7012 | 0 | #undef READV_COUNT_BITS |
7013 | 0 | #undef READV_COUNT_MASK |
7014 | 0 | } |
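/*
 * Illustrative worked example of the encoding above, using plain shifts
 * rather than PgAioResult: two zeroed pages, one ignored checksum failure,
 * first affected buffer offset 3, with READV_COUNT_BITS = 7 as defined
 * above.  The resulting value is 0x3020B.
 */
static uint32
example_pack_readv_error(void)
{
    uint32      error_data = 0;
    uint32      shift = 0;

    error_data |= 1u << shift;          /* zeroed_any */
    shift += 1;
    error_data |= 1u << shift;          /* ignored_any */
    shift += 1;
    error_data |= 2u << shift;          /* zeroed_or_error_count = 2 */
    shift += 7;
    error_data |= 1u << shift;          /* checkfail_count = 1 */
    shift += 7;
    error_data |= 3u << shift;          /* first_off = 3 */

    return error_data;                  /* 0x0003020B */
}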
7015 | | |
7016 | | /* |
7017 | | * Helper for AIO readv completion callbacks, supporting both shared and temp |
7018 | | * buffers. Gets called once for each buffer in a multi-page read. |
7019 | | */ |
7020 | | static pg_attribute_always_inline void |
7021 | | buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, |
7022 | | uint8 flags, bool failed, bool is_temp, |
7023 | | bool *buffer_invalid, |
7024 | | bool *failed_checksum, |
7025 | | bool *ignored_checksum, |
7026 | | bool *zeroed_buffer) |
7027 | 0 | { |
7028 | 0 | BufferDesc *buf_hdr = is_temp ? |
7029 | 0 | GetLocalBufferDescriptor(-buffer - 1) |
7030 | 0 | : GetBufferDescriptor(buffer - 1); |
7031 | 0 | BufferTag tag = buf_hdr->tag; |
7032 | 0 | char *bufdata = BufferGetBlock(buffer); |
7033 | 0 | uint32 set_flag_bits; |
7034 | 0 | int piv_flags; |
7035 | | |
7036 | | /* check that the buffer is in the expected state for a read */ |
7037 | | #ifdef USE_ASSERT_CHECKING |
7038 | | { |
7039 | | uint32 buf_state = pg_atomic_read_u32(&buf_hdr->state); |
7040 | | |
7041 | | Assert(buf_state & BM_TAG_VALID); |
7042 | | Assert(!(buf_state & BM_VALID)); |
7043 | | /* temp buffers don't use BM_IO_IN_PROGRESS */ |
7044 | | if (!is_temp) |
7045 | | Assert(buf_state & BM_IO_IN_PROGRESS); |
7046 | | Assert(!(buf_state & BM_DIRTY)); |
7047 | | } |
7048 | | #endif |
7049 | |
|
7050 | 0 | *buffer_invalid = false; |
7051 | 0 | *failed_checksum = false; |
7052 | 0 | *ignored_checksum = false; |
7053 | 0 | *zeroed_buffer = false; |
7054 | | |
7055 | | /* |
7056 | | * We ask PageIsVerified() to only log the message about checksum errors, |
7057 | | * as the completion might be run in any backend (or IO workers). We will |
7058 | | * report checksum errors in buffer_readv_report(). |
7059 | | */ |
7060 | 0 | piv_flags = PIV_LOG_LOG; |
7061 | | |
7062 | | /* the local zero_damaged_pages may differ from the definer's */ |
7063 | 0 | if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES) |
7064 | 0 | piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE; |
7065 | | |
7066 | | /* Check for garbage data. */ |
7067 | 0 | if (!failed) |
7068 | 0 | { |
7069 | | /* |
7070 | | * If the buffer is not currently pinned by this backend, e.g. because |
7071 | | * we're completing this IO after an error, the buffer data will have |
7072 | | * been marked as inaccessible when the buffer was unpinned. The AIO |
7073 | | * subsystem holds a pin, but that doesn't prevent the buffer from |
7074 | | * having been marked as inaccessible. The completion might also be |
7075 | | * executed in a different process. |
7076 | | */ |
7077 | | #ifdef USE_VALGRIND |
7078 | | if (!BufferIsPinned(buffer)) |
7079 | | VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ); |
7080 | | #endif |
7081 | |
|
7082 | 0 | if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags, |
7083 | 0 | failed_checksum)) |
7084 | 0 | { |
7085 | 0 | if (flags & READ_BUFFERS_ZERO_ON_ERROR) |
7086 | 0 | { |
7087 | 0 | memset(bufdata, 0, BLCKSZ); |
7088 | 0 | *zeroed_buffer = true; |
7089 | 0 | } |
7090 | 0 | else |
7091 | 0 | { |
7092 | 0 | *buffer_invalid = true; |
7093 | | /* mark buffer as having failed */ |
7094 | 0 | failed = true; |
7095 | 0 | } |
7096 | 0 | } |
7097 | 0 | else if (*failed_checksum) |
7098 | 0 | *ignored_checksum = true; |
7099 | | |
7100 | | /* undo what we did above */ |
7101 | | #ifdef USE_VALGRIND |
7102 | | if (!BufferIsPinned(buffer)) |
7103 | | VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ); |
7104 | | #endif |
7105 | | |
7106 | | /* |
7107 | | * Immediately log a message about the invalid page, but only to the |
7108 | | * server log. The reason to do so immediately is that this may be |
7109 | | * executed in a different backend than the one that originated the |
7110 | | * request. Another reason is that the originator |
7111 | | * might not process the query result immediately (because it is busy |
7112 | | * doing another part of query processing) or at all (e.g. if it was |
7113 | | * cancelled or errored out due to another IO also failing). The |
7114 | | * definer of the IO will emit an ERROR or WARNING when processing the |
7115 | | * IO's results. |
7116 | | * |
7117 | | * To avoid duplicating the code to emit these log messages, we reuse |
7118 | | * buffer_readv_report(). |
7119 | | */ |
7120 | 0 | if (*buffer_invalid || *failed_checksum || *zeroed_buffer) |
7121 | 0 | { |
7122 | 0 | PgAioResult result_one = {0}; |
7123 | |
|
7124 | 0 | buffer_readv_encode_error(&result_one, is_temp, |
7125 | 0 | *zeroed_buffer, |
7126 | 0 | *ignored_checksum, |
7127 | 0 | *buffer_invalid, |
7128 | 0 | *zeroed_buffer ? 1 : 0, |
7129 | 0 | *failed_checksum ? 1 : 0, |
7130 | 0 | buf_off, buf_off, buf_off); |
7131 | 0 | pgaio_result_report(result_one, td, LOG_SERVER_ONLY); |
7132 | 0 | } |
7133 | 0 | } |
7134 | | |
7135 | | /* Terminate I/O and set BM_VALID. */ |
7136 | 0 | set_flag_bits = failed ? BM_IO_ERROR : BM_VALID; |
7137 | 0 | if (is_temp) |
7138 | 0 | TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true); |
7139 | 0 | else |
7140 | 0 | TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true); |
7141 | | |
7142 | | /* |
7143 | | * Call the BUFFER_READ_DONE tracepoint in the callback, even though the |
7144 | | * callback may not be executed in the same backend that called |
7145 | | * BUFFER_READ_START. The alternative would be to defer calling the |
7146 | | * tracepoint to a later point (e.g. the local completion callback for |
7147 | | * shared buffer reads), which seems even less helpful. |
7148 | | */ |
7149 | 0 | TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum, |
7150 | 0 | tag.blockNum, |
7151 | 0 | tag.spcOid, |
7152 | 0 | tag.dbOid, |
7153 | 0 | tag.relNumber, |
7154 | 0 | is_temp ? MyProcNumber : INVALID_PROC_NUMBER, |
7155 | 0 | false); |
7156 | 0 | } |
7157 | | |
7158 | | /* |
7159 | | * Perform completion handling of a single AIO read. This read may cover |
7160 | | * multiple blocks / buffers. |
7161 | | * |
7162 | | * Shared between shared and local buffers, to reduce code duplication. |
7163 | | */ |
7164 | | static pg_attribute_always_inline PgAioResult |
7165 | | buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, |
7166 | | uint8 cb_data, bool is_temp) |
7167 | 0 | { |
7168 | 0 | PgAioResult result = prior_result; |
7169 | 0 | PgAioTargetData *td = pgaio_io_get_target_data(ioh); |
7170 | 0 | uint8 first_error_off = 0; |
7171 | 0 | uint8 first_zeroed_off = 0; |
7172 | 0 | uint8 first_ignored_off = 0; |
7173 | 0 | uint8 error_count = 0; |
7174 | 0 | uint8 zeroed_count = 0; |
7175 | 0 | uint8 ignored_count = 0; |
7176 | 0 | uint8 checkfail_count = 0; |
7177 | 0 | uint64 *io_data; |
7178 | 0 | uint8 handle_data_len; |
7179 | |
7180 | 0 | if (is_temp) |
7181 | 0 | { |
7182 | 0 | Assert(td->smgr.is_temp); |
7183 | 0 | Assert(pgaio_io_get_owner(ioh) == MyProcNumber); |
7184 | 0 | } |
7185 | 0 | else |
7186 | 0 | Assert(!td->smgr.is_temp); |
7187 | | |
7188 | | /* |
7189 | | * Iterate over all the buffers affected by this IO and call the |
7190 | | * per-buffer completion function for each buffer. |
7191 | | */ |
7192 | 0 | io_data = pgaio_io_get_handle_data(ioh, &handle_data_len); |
7193 | 0 | for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++) |
7194 | 0 | { |
7195 | 0 | Buffer buf = io_data[buf_off]; |
7196 | 0 | bool failed; |
7197 | 0 | bool failed_verification = false; |
7198 | 0 | bool failed_checksum = false; |
7199 | 0 | bool zeroed_buffer = false; |
7200 | 0 | bool ignored_checksum = false; |
7201 | |
7202 | 0 | Assert(BufferIsValid(buf)); |
7203 | | |
7204 | | /* |
7205 | | * If the entire I/O failed at a lower level, each buffer needs to be |
7206 | | * marked as failed. In case of a partial read, the first few buffers |
7207 | | * may still be valid. |
7208 | | */ |
7209 | 0 | failed = |
7210 | 0 | prior_result.status == PGAIO_RS_ERROR |
7211 | 0 | || prior_result.result <= buf_off; |
7212 | |
7213 | 0 | buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp, |
7214 | 0 | &failed_verification, |
7215 | 0 | &failed_checksum, |
7216 | 0 | &ignored_checksum, |
7217 | 0 | &zeroed_buffer); |
7218 | | |
7219 | | /* |
7220 | | * Track information about the number of different kinds of error |
7221 | | * conditions across all pages, as there can be multiple pages failing |
7222 | | * verification as part of one IO. |
7223 | | */ |
7224 | 0 | if (failed_verification && !zeroed_buffer && error_count++ == 0) |
7225 | 0 | first_error_off = buf_off; |
7226 | 0 | if (zeroed_buffer && zeroed_count++ == 0) |
7227 | 0 | first_zeroed_off = buf_off; |
7228 | 0 | if (ignored_checksum && ignored_count++ == 0) |
7229 | 0 | first_ignored_off = buf_off; |
7230 | 0 | if (failed_checksum) |
7231 | 0 | checkfail_count++; |
7232 | 0 | } |
7233 | | |
7234 | | /* |
7235 | | * If the smgr read succeeded (at least partially) and page verification |
7236 | | * failed for some of the pages, adjust the IO's result state appropriately. |
7237 | | */ |
7238 | 0 | if (prior_result.status != PGAIO_RS_ERROR && |
7239 | 0 | (error_count > 0 || ignored_count > 0 || zeroed_count > 0)) |
7240 | 0 | { |
7241 | 0 | buffer_readv_encode_error(&result, is_temp, |
7242 | 0 | zeroed_count > 0, ignored_count > 0, |
7243 | 0 | error_count, zeroed_count, checkfail_count, |
7244 | 0 | first_error_off, first_zeroed_off, |
7245 | 0 | first_ignored_off); |
7246 | 0 | pgaio_result_report(result, td, DEBUG1); |
7247 | 0 | } |
7248 | | |
7249 | | /* |
7250 | | * For shared relations this reporting is done in |
7251 | | * shared_buffer_readv_complete_local(). |
7252 | | */ |
7253 | 0 | if (is_temp && checkfail_count > 0) |
7254 | 0 | pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid, |
7255 | 0 | checkfail_count); |
7256 | |
7257 | 0 | return result; |
7258 | 0 | } |
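The loop above decides per buffer whether it failed: an outright error fails every buffer, while a short read fails only the buffers at or beyond the number of blocks actually read (prior_result.result is compared against the buffer offset, so it counts successfully read blocks here). A minimal standalone illustration of that rule follows (editor-added sketch; the Sketch* names are invented and this is not the PgAioResult API):

/*
 * Editor's sketch: the per-buffer "failed" rule from the loop above,
 * isolated.  A hard error fails every buffer; a short read fails only the
 * unread tail.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

typedef enum SketchStatus
{
	SKETCH_RS_OK,
	SKETCH_RS_PARTIAL,
	SKETCH_RS_ERROR
} SketchStatus;

static bool
sketch_buffer_failed(SketchStatus status, int32_t blocks_read, uint8_t buf_off)
{
	return status == SKETCH_RS_ERROR || blocks_read <= buf_off;
}

int
main(void)
{
	/* a 4-block read of which only 2 blocks were actually read */
	assert(!sketch_buffer_failed(SKETCH_RS_PARTIAL, 2, 0));
	assert(!sketch_buffer_failed(SKETCH_RS_PARTIAL, 2, 1));
	assert(sketch_buffer_failed(SKETCH_RS_PARTIAL, 2, 2));
	assert(sketch_buffer_failed(SKETCH_RS_PARTIAL, 2, 3));

	/* a hard error fails all buffers, regardless of offset */
	assert(sketch_buffer_failed(SKETCH_RS_ERROR, 4, 0));
	return 0;
}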
7259 | | |
7260 | | /* |
7261 | | * AIO error reporting callback for aio_shared_buffer_readv_cb and |
7262 | | * aio_local_buffer_readv_cb. |
7263 | | * |
7264 | | * The error is encoded / decoded in buffer_readv_encode_error() / |
7265 | | * buffer_readv_decode_error(). |
7266 | | */ |
7267 | | static void |
7268 | | buffer_readv_report(PgAioResult result, const PgAioTargetData *td, |
7269 | | int elevel) |
7270 | 0 | { |
7271 | 0 | int nblocks = td->smgr.nblocks; |
7272 | 0 | BlockNumber first = td->smgr.blockNum; |
7273 | 0 | BlockNumber last = first + nblocks - 1; |
7274 | 0 | ProcNumber errProc = |
7275 | 0 | td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER; |
7276 | 0 | RelPathStr rpath = |
7277 | 0 | relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum); |
7278 | 0 | bool zeroed_any, |
7279 | 0 | ignored_any; |
7280 | 0 | uint8 zeroed_or_error_count, |
7281 | 0 | checkfail_count, |
7282 | 0 | first_off; |
7283 | 0 | uint8 affected_count; |
7284 | 0 | const char *msg_one, |
7285 | 0 | *msg_mult, |
7286 | 0 | *det_mult, |
7287 | 0 | *hint_mult; |
7288 | |
7289 | 0 | buffer_readv_decode_error(result, &zeroed_any, &ignored_any, |
7290 | 0 | &zeroed_or_error_count, |
7291 | 0 | &checkfail_count, |
7292 | 0 | &first_off); |
7293 | | |
7294 | | /* |
7295 | | * Treat a read that had both zeroed buffers *and* ignored checksums as a |
7296 | | * special case; it's too irregular to be emitted the same way as the |
7297 | | * other cases. |
7298 | | */ |
7299 | 0 | if (zeroed_any && ignored_any) |
7300 | 0 | { |
7301 | 0 | Assert(zeroed_any && ignored_any); |
7302 | 0 | Assert(nblocks > 1); /* same block can't be both zeroed and ignored */ |
7303 | 0 | Assert(result.status != PGAIO_RS_ERROR); |
7304 | 0 | affected_count = zeroed_or_error_count; |
7305 | |
7306 | 0 | ereport(elevel, |
7307 | 0 | errcode(ERRCODE_DATA_CORRUPTED), |
7308 | 0 | errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation %s", |
7309 | 0 | affected_count, checkfail_count, first, last, rpath.str), |
7310 | 0 | affected_count > 1 ? |
7311 | 0 | errdetail("Block %u held first zeroed page.", |
7312 | 0 | first + first_off) : 0, |
7313 | 0 | errhint("See server log for details about the other %d invalid block(s).", |
7314 | 0 | affected_count + checkfail_count - 1)); |
7315 | 0 | return; |
7316 | 0 | } |
7317 | | |
7318 | | /* |
7319 | | * The other messages are highly repetitive. To avoid duplicating a long |
7320 | | * and complicated ereport(), gather the translated format strings |
7321 | | * separately and then do one common ereport. |
7322 | | */ |
7323 | 0 | if (result.status == PGAIO_RS_ERROR) |
7324 | 0 | { |
7325 | 0 | Assert(!zeroed_any); /* can't have invalid pages when zeroing them */ |
7326 | 0 | affected_count = zeroed_or_error_count; |
7327 | 0 | msg_one = _("invalid page in block %u of relation %s"); |
7328 | 0 | msg_mult = _("%u invalid pages among blocks %u..%u of relation %s"); |
7329 | 0 | det_mult = _("Block %u held first invalid page."); |
7330 | 0 | hint_mult = _("See server log for the other %u invalid block(s)."); |
7331 | 0 | } |
7332 | 0 | else if (zeroed_any && !ignored_any) |
7333 | 0 | { |
7334 | 0 | affected_count = zeroed_or_error_count; |
7335 | 0 | msg_one = _("invalid page in block %u of relation %s; zeroing out page"); |
7336 | 0 | msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation %s"); |
7337 | 0 | det_mult = _("Block %u held first zeroed page."); |
7338 | 0 | hint_mult = _("See server log for the other %u zeroed block(s)."); |
7339 | 0 | } |
7340 | 0 | else if (!zeroed_any && ignored_any) |
7341 | 0 | { |
7342 | 0 | affected_count = checkfail_count; |
7343 | 0 | msg_one = _("ignoring checksum failure in block %u of relation %s"); |
7344 | 0 | msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation %s"); |
7345 | 0 | det_mult = _("Block %u held first ignored page."); |
7346 | 0 | hint_mult = _("See server log for the other %u ignored block(s)."); |
7347 | 0 | } |
7348 | 0 | else |
7349 | 0 | pg_unreachable(); |
7350 | | |
7351 | 0 | ereport(elevel, |
7352 | 0 | errcode(ERRCODE_DATA_CORRUPTED), |
7353 | 0 | affected_count == 1 ? |
7354 | 0 | errmsg_internal(msg_one, first + first_off, rpath.str) : |
7355 | 0 | errmsg_internal(msg_mult, affected_count, first, last, rpath.str), |
7356 | 0 | affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0, |
7357 | 0 | affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0); |
7358 | 0 | } |
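buffer_readv_report() picks between a single-block message and a multi-block summary, and attaches the DETAIL and HINT lines only when more than one block is affected. The sketch below (editor-added; plain printf instead of ereport()/NLS, with invented names and an example relation path) shows the same selection pattern:

/*
 * Editor's sketch: singular vs. plural message selection, with DETAIL/HINT
 * only for the multi-block case.
 */
#include <stdio.h>

static void
sketch_report(unsigned affected_count, unsigned first, unsigned last,
			  unsigned first_off, const char *relpath)
{
	if (affected_count == 1)
		printf("invalid page in block %u of relation %s\n",
			   first + first_off, relpath);
	else
	{
		printf("%u invalid pages among blocks %u..%u of relation %s\n",
			   affected_count, first, last, relpath);
		printf("DETAIL: Block %u held first invalid page.\n",
			   first + first_off);
		printf("HINT: See server log for the other %u invalid block(s).\n",
			   affected_count - 1);
	}
}

int
main(void)
{
	sketch_report(1, 10, 10, 0, "base/5/16384");	/* single affected block */
	sketch_report(3, 10, 17, 2, "base/5/16384");	/* several affected blocks */
	return 0;
}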
7359 | | |
7360 | | static void |
7361 | | shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data) |
7362 | 0 | { |
7363 | 0 | buffer_stage_common(ioh, false, false); |
7364 | 0 | } |
7365 | | |
7366 | | static PgAioResult |
7367 | | shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, |
7368 | | uint8 cb_data) |
7369 | 0 | { |
7370 | 0 | return buffer_readv_complete(ioh, prior_result, cb_data, false); |
7371 | 0 | } |
7372 | | |
7373 | | /* |
7374 | | * We need a backend-local completion callback for shared buffers, to be able |
7375 | | * to report checksum errors correctly. Unfortunately that can only safely |
7376 | | * happen if the reporting backend has previously called |
7377 | | * pgstat_prepare_report_checksum_failure(), which we can only guarantee in |
7378 | | * the backend that started the IO. Hence this callback. |
7379 | | */ |
7380 | | static PgAioResult |
7381 | | shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, |
7382 | | uint8 cb_data) |
7383 | 0 | { |
7384 | 0 | bool zeroed_any, |
7385 | 0 | ignored_any; |
7386 | 0 | uint8 zeroed_or_error_count, |
7387 | 0 | checkfail_count, |
7388 | 0 | first_off; |
7389 | |
7390 | 0 | if (prior_result.status == PGAIO_RS_OK) |
7391 | 0 | return prior_result; |
7392 | | |
7393 | 0 | buffer_readv_decode_error(prior_result, |
7394 | 0 | &zeroed_any, |
7395 | 0 | &ignored_any, |
7396 | 0 | &zeroed_or_error_count, |
7397 | 0 | &checkfail_count, |
7398 | 0 | &first_off); |
7399 | |
7400 | 0 | if (checkfail_count) |
7401 | 0 | { |
7402 | 0 | PgAioTargetData *td = pgaio_io_get_target_data(ioh); |
7403 | |
7404 | 0 | pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid, |
7405 | 0 | checkfail_count); |
7406 | 0 | } |
7407 | |
7408 | 0 | return prior_result; |
7409 | 0 | } |
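The callbacks above pass the error summary around as a compact PgAioResult, packed by buffer_readv_encode_error() and unpacked by buffer_readv_decode_error(). The sketch below (editor-added) shows the general pack/unpack round-trip idea; the field layout is invented purely for illustration and is not PostgreSQL's actual encoding:

/*
 * Editor's sketch: packing a few flags, small counts and a first-offset into
 * one word, and recovering them on the other side.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static uint32_t
sketch_encode(bool zeroed_any, bool ignored_any,
			  uint8_t count, uint8_t checkfail_count, uint8_t first_off)
{
	return ((uint32_t) zeroed_any << 0) |
		((uint32_t) ignored_any << 1) |
		((uint32_t) count << 2) |
		((uint32_t) checkfail_count << 10) |
		((uint32_t) first_off << 18);
}

static void
sketch_decode(uint32_t word, bool *zeroed_any, bool *ignored_any,
			  uint8_t *count, uint8_t *checkfail_count, uint8_t *first_off)
{
	*zeroed_any = (word >> 0) & 1;
	*ignored_any = (word >> 1) & 1;
	*count = (word >> 2) & 0xFF;
	*checkfail_count = (word >> 10) & 0xFF;
	*first_off = (word >> 18) & 0xFF;
}

int
main(void)
{
	bool		z,
				i;
	uint8_t		c,
				cf,
				off;

	sketch_decode(sketch_encode(true, false, 3, 2, 7), &z, &i, &c, &cf, &off);
	assert(z && !i && c == 3 && cf == 2 && off == 7);
	return 0;
}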
7410 | | |
7411 | | static void |
7412 | | local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data) |
7413 | 0 | { |
7414 | 0 | buffer_stage_common(ioh, false, true); |
7415 | 0 | } |
7416 | | |
7417 | | static PgAioResult |
7418 | | local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, |
7419 | | uint8 cb_data) |
7420 | 0 | { |
7421 | 0 | return buffer_readv_complete(ioh, prior_result, cb_data, true); |
7422 | 0 | } |
7423 | | |
7424 | | /* readv callback is passed READ_BUFFERS_* flags as callback data */ |
7425 | | const PgAioHandleCallbacks aio_shared_buffer_readv_cb = { |
7426 | | .stage = shared_buffer_readv_stage, |
7427 | | .complete_shared = shared_buffer_readv_complete, |
7428 | | /* need a local callback to report checksum failures */ |
7429 | | .complete_local = shared_buffer_readv_complete_local, |
7430 | | .report = buffer_readv_report, |
7431 | | }; |
7432 | | |
7433 | | /* readv callback is passed READ_BUFFERS_* flags as callback data */ |
7434 | | const PgAioHandleCallbacks aio_local_buffer_readv_cb = { |
7435 | | .stage = local_buffer_readv_stage, |
7436 | | |
7437 | | /* |
7438 | | * Note that this, in contrast to the shared_buffers case, uses |
7439 | | * complete_local, as only the issuing backend has access to the required |
7440 | | * data structures. This matters because the IO completion may be |
7441 | | * consumed incidentally by another backend. |
7442 | | */ |
7443 | | .complete_local = local_buffer_readv_complete, |
7444 | | .report = buffer_readv_report, |
7445 | | }; |
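The two callback tables above differ mainly in where completion work runs: shared buffers complete in complete_shared (possibly in a backend other than the issuer), with a small complete_local step only for checksum statistics, whereas local (temp-table) buffers do all completion work in complete_local, since only the issuing backend can access them. The sketch below (editor-added; simplified stand-in types, not the real PgAioHandleCallbacks, with the .report member omitted) shows the same struct-of-function-pointers pattern:

/*
 * Editor's sketch: two callback tables routing completion work differently
 * for shared vs. local buffers.
 */
#include <stdio.h>

typedef struct SketchIo SketchIo;		/* opaque stand-in for an IO handle */

typedef struct SketchCallbacks
{
	void		(*stage) (SketchIo *io);
	void		(*complete_shared) (SketchIo *io);	/* may run in any backend */
	void		(*complete_local) (SketchIo *io);	/* runs in the issuing backend */
} SketchCallbacks;

static void
sketch_stage(SketchIo *io)
{
	(void) io;
	puts("stage: pin buffers, mark IO in progress");
}

static void
sketch_verify_and_finish(SketchIo *io)
{
	(void) io;
	puts("complete: verify pages, mark buffers valid or failed");
}

static void
sketch_report_stats(SketchIo *io)
{
	(void) io;
	puts("complete_local: per-backend bookkeeping (e.g. checksum stats)");
}

static const SketchCallbacks sketch_shared_cb = {
	.stage = sketch_stage,
	.complete_shared = sketch_verify_and_finish,
	.complete_local = sketch_report_stats,
};

static const SketchCallbacks sketch_local_cb = {
	.stage = sketch_stage,
	/* only the issuing backend can see temp buffers, so complete locally */
	.complete_local = sketch_verify_and_finish,
};

int
main(void)
{
	const SketchCallbacks *cb = &sketch_shared_cb;

	cb->stage(NULL);
	cb->complete_shared(NULL);
	cb->complete_local(NULL);
	(void) sketch_local_cb;
	return 0;
}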