/src/postgres/src/backend/access/brin/brin_pageops.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * brin_pageops.c |
3 | | * Page-handling routines for BRIN indexes |
4 | | * |
5 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
6 | | * Portions Copyright (c) 1994, Regents of the University of California |
7 | | * |
8 | | * IDENTIFICATION |
9 | | * src/backend/access/brin/brin_pageops.c |
10 | | */ |
11 | | #include "postgres.h" |
12 | | |
13 | | #include "access/brin_page.h" |
14 | | #include "access/brin_pageops.h" |
15 | | #include "access/brin_revmap.h" |
16 | | #include "access/brin_xlog.h" |
17 | | #include "access/xloginsert.h" |
18 | | #include "miscadmin.h" |
19 | | #include "storage/bufmgr.h" |
20 | | #include "storage/freespace.h" |
21 | | #include "storage/lmgr.h" |
22 | | #include "utils/rel.h" |
23 | | |
24 | | /* |
25 | | * Maximum size of an entry in a BRIN_PAGETYPE_REGULAR page. We can tolerate |
26 | | * a single item per page, unlike other index AMs. |
27 | | */ |
28 | | #define BrinMaxItemSize \ |
29 | 0 | MAXALIGN_DOWN(BLCKSZ - \ |
30 | 0 | (MAXALIGN(SizeOfPageHeaderData + \ |
31 | 0 | sizeof(ItemIdData)) + \ |
32 | 0 | MAXALIGN(sizeof(BrinSpecialSpace)))) |
33 | | |
34 | | static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, |
35 | | bool *extended); |
36 | | static Size br_page_get_freespace(Page page); |
37 | | static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer); |
38 | | |
39 | | |
/*
 * Update tuple origtup (size origsz), located in offset oldoff of buffer
 * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
 * at heapBlk.  oldbuf must not be locked on entry, and is not locked at exit.
 *
 * If samepage is true, attempt to put the new tuple in the same page, but if
 * there's no room, use some other one.
 *
 * If the update is successful, return true; the revmap is updated to point to
 * the new tuple.  If the update is not done for whatever reason, return false.
 * Caller may retry the update if this happens.
 */
bool
brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, BlockNumber heapBlk,
			  Buffer oldbuf, OffsetNumber oldoff,
			  const BrinTuple *origtup, Size origsz,
			  const BrinTuple *newtup, Size newsz,
			  bool samepage)
{
	Page		oldpage;
	ItemId		oldlp;
	BrinTuple  *oldtup;
	Size		oldsz;
	Buffer		newbuf;
	BlockNumber newblk = InvalidBlockNumber;
	bool		extended;

	Assert(newsz == MAXALIGN(newsz));

	/* If the item is oversized, don't bother. */
	if (newsz > BrinMaxItemSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
						newsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
		return false;			/* keep compiler quiet */
	}

	/* make sure the revmap is long enough to contain the entry we need */
	brinRevmapExtend(revmap, heapBlk);

	if (!samepage)
	{
		/*
		 * Need a page on which to put the item.  brin_getinsertbuffer
		 * returns the new buffer locked, and (when oldbuf is valid) also
		 * locks oldbuf, in an order chosen to avoid deadlocks.
		 */
		newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
		if (!BufferIsValid(newbuf))
		{
			Assert(!extended);
			return false;
		}

		/*
		 * Note: it's possible (though unlikely) that the returned newbuf is
		 * the same as oldbuf, if brin_getinsertbuffer determined that the old
		 * buffer does in fact have enough space.
		 */
		if (newbuf == oldbuf)
		{
			Assert(!extended);
			newbuf = InvalidBuffer;
		}
		else
			newblk = BufferGetBlockNumber(newbuf);
	}
	else
	{
		LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
		newbuf = InvalidBuffer;
		extended = false;
	}
	oldpage = BufferGetPage(oldbuf);
	oldlp = PageGetItemId(oldpage, oldoff);

	/*
	 * Check that the old tuple wasn't updated concurrently: it might have
	 * moved someplace else entirely, and for that matter the whole page
	 * might've become a revmap page.  Note that in the first two cases
	 * checked here, the "oldlp" we just calculated is garbage; but
	 * PageGetItemId() is simple enough that it was safe to do that
	 * calculation anyway.
	 */
	if (!BRIN_IS_REGULAR_PAGE(oldpage) ||
		oldoff > PageGetMaxOffsetNumber(oldpage) ||
		!ItemIdIsNormal(oldlp))
	{
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * If this happens, and the new buffer was obtained by extending the
		 * relation, then we need to ensure we don't leave it uninitialized or
		 * forget about it.
		 */
		if (BufferIsValid(newbuf))
		{
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}
		return false;
	}

	oldsz = ItemIdGetLength(oldlp);
	oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);

	/*
	 * ... or it might have been updated in place to different contents.
	 */
	if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
	{
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		if (BufferIsValid(newbuf))
		{
			/* As above, initialize and record new page if we got one */
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}
		return false;
	}

	/*
	 * Great, the old tuple is intact.  We can proceed with the update.
	 *
	 * If there's enough room in the old page for the new tuple, replace it.
	 *
	 * Note that there might now be enough space on the page even though the
	 * caller told us there isn't, if a concurrent update moved another tuple
	 * elsewhere or replaced a tuple with a smaller one.
	 */
	if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
		brin_can_do_samepage_update(oldbuf, origsz, newsz))
	{
		START_CRIT_SECTION();
		if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) unconstify(BrinTuple *, newtup), newsz))
			elog(ERROR, "failed to replace BRIN tuple");
		MarkBufferDirty(oldbuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(idxrel))
		{
			xl_brin_samepage_update xlrec;
			XLogRecPtr	recptr;
			uint8		info = XLOG_BRIN_SAMEPAGE_UPDATE;

			xlrec.offnum = oldoff;

			XLogBeginInsert();
			XLogRegisterData(&xlrec, SizeOfBrinSamepageUpdate);

			XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD);
			XLogRegisterBufData(0, newtup, newsz);

			recptr = XLogInsert(RM_BRIN_ID, info);

			PageSetLSN(oldpage, recptr);
		}

		END_CRIT_SECTION();

		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		if (BufferIsValid(newbuf))
		{
			/* As above, initialize and record new page if we got one */
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}

		return true;
	}
	else if (newbuf == InvalidBuffer)
	{
		/*
		 * Not enough space, but caller said that there was.  Tell them to
		 * start over.
		 */
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		return false;
	}
	else
	{
		/*
		 * Not enough free space on the oldpage.  Put the new tuple on the new
		 * page, and update the revmap.
		 */
		Page		newpage = BufferGetPage(newbuf);
		Buffer		revmapbuf;
		ItemPointerData newtid;
		OffsetNumber newoff;
		Size		freespace = 0;

		revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

		START_CRIT_SECTION();

		/*
		 * We need to initialize the page if it's newly obtained.  Note we
		 * will WAL-log the initialization as part of the update, so we don't
		 * need to do that here.
		 */
		if (extended)
			brin_page_init(newpage, BRIN_PAGETYPE_REGULAR);

		PageIndexTupleDeleteNoCompact(oldpage, oldoff);
		newoff = PageAddItem(newpage, (Item) unconstify(BrinTuple *, newtup), newsz,
							 InvalidOffsetNumber, false, false);
		if (newoff == InvalidOffsetNumber)
			elog(ERROR, "failed to add BRIN tuple to new page");
		MarkBufferDirty(oldbuf);
		MarkBufferDirty(newbuf);

		/* needed to update FSM below */
		if (extended)
			freespace = br_page_get_freespace(newpage);

		ItemPointerSet(&newtid, newblk, newoff);
		brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
		MarkBufferDirty(revmapbuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(idxrel))
		{
			xl_brin_update xlrec;
			XLogRecPtr	recptr;
			uint8		info;

			info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);

			xlrec.insert.offnum = newoff;
			xlrec.insert.heapBlk = heapBlk;
			xlrec.insert.pagesPerRange = pagesPerRange;
			xlrec.oldOffnum = oldoff;

			XLogBeginInsert();

			/* new page */
			XLogRegisterData(&xlrec, SizeOfBrinUpdate);

			XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
			XLogRegisterBufData(0, newtup, newsz);

			/* revmap page */
			XLogRegisterBuffer(1, revmapbuf, 0);

			/* old page */
			XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_BRIN_ID, info);

			PageSetLSN(oldpage, recptr);
			PageSetLSN(newpage, recptr);
			PageSetLSN(BufferGetPage(revmapbuf), recptr);
		}

		END_CRIT_SECTION();

		LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		UnlockReleaseBuffer(newbuf);

		/*
		 * If the relation was extended, record the new page's free space in
		 * the FSM and propagate it to the upper FSM pages immediately, so
		 * other backends can find it.
		 */
		if (extended)
		{
			RecordPageWithFreeSpace(idxrel, newblk, freespace);
			FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}

		return true;
	}
}
318 | | |
319 | | /* |
320 | | * Return whether brin_doupdate can do a samepage update. |
321 | | */ |
322 | | bool |
323 | | brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz) |
324 | 0 | { |
325 | 0 | return |
326 | 0 | ((newsz <= origsz) || |
327 | 0 | PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz)); |
328 | 0 | } |
329 | | |
/*
 * Insert an index tuple into the index relation.  The revmap is updated to
 * mark the range containing the given page as pointing to the inserted entry.
 * A WAL record is written.
 *
 * The buffer, if valid, is first checked for free space to insert the new
 * entry; if there isn't enough, a new buffer is obtained and pinned.  No
 * buffer lock must be held on entry, no buffer lock is held on exit.
 *
 * Return value is the offset number where the tuple was inserted.
 */
OffsetNumber
brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
			  BrinTuple *tup, Size itemsz)
{
	Page		page;
	BlockNumber blk;
	OffsetNumber off;
	Size		freespace = 0;
	Buffer		revmapbuf;
	ItemPointerData tid;
	bool		extended;

	Assert(itemsz == MAXALIGN(itemsz));

	/* If the item is oversized, don't even bother. */
	if (itemsz > BrinMaxItemSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
						itemsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
		return InvalidOffsetNumber; /* keep compiler quiet */
	}

	/* Make sure the revmap is long enough to contain the entry we need */
	brinRevmapExtend(revmap, heapBlk);

	/*
	 * Acquire lock on buffer supplied by caller, if any.  If it doesn't have
	 * enough space, unpin it to obtain a new one below.
	 */
	if (BufferIsValid(*buffer))
	{
		/*
		 * It's possible that another backend (or ourselves!) extended the
		 * revmap over the page we held a pin on, so we cannot assume that
		 * it's still a regular page.  (br_page_get_freespace returns 0 for
		 * non-regular pages, so that case falls through here too.)
		 */
		LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
		if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
		{
			UnlockReleaseBuffer(*buffer);
			*buffer = InvalidBuffer;
		}
	}

	/*
	 * If we still don't have a usable buffer, have brin_getinsertbuffer
	 * obtain one for us.  It can return InvalidBuffer when a concurrent
	 * revmap extension invalidates its choice, so loop until it succeeds.
	 */
	if (!BufferIsValid(*buffer))
	{
		do
			*buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
		while (!BufferIsValid(*buffer));
	}
	else
		extended = false;

	/* Now obtain lock on revmap buffer */
	revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

	page = BufferGetPage(*buffer);
	blk = BufferGetBlockNumber(*buffer);

	/* Execute the actual insertion */
	START_CRIT_SECTION();
	if (extended)
		brin_page_init(page, BRIN_PAGETYPE_REGULAR);
	off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
					  false, false);
	if (off == InvalidOffsetNumber)
		elog(ERROR, "failed to add BRIN tuple to new page");
	MarkBufferDirty(*buffer);

	/* needed to update FSM below */
	if (extended)
		freespace = br_page_get_freespace(page);

	ItemPointerSet(&tid, blk, off);
	brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
	MarkBufferDirty(revmapbuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(idxrel))
	{
		xl_brin_insert xlrec;
		XLogRecPtr	recptr;
		uint8		info;

		info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
		xlrec.heapBlk = heapBlk;
		xlrec.pagesPerRange = pagesPerRange;
		xlrec.offnum = off;

		XLogBeginInsert();
		XLogRegisterData(&xlrec, SizeOfBrinInsert);

		XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
		XLogRegisterBufData(0, tup, itemsz);

		XLogRegisterBuffer(1, revmapbuf, 0);

		recptr = XLogInsert(RM_BRIN_ID, info);

		PageSetLSN(page, recptr);
		PageSetLSN(BufferGetPage(revmapbuf), recptr);
	}

	END_CRIT_SECTION();

	/* Tuple is firmly on buffer; we can release our locks */
	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
	LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);

	BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
			   blk, off, heapBlk));

	/*
	 * If a page was obtained by extension, record its remaining free space
	 * and propagate it up the FSM tree so other backends can find it.
	 */
	if (extended)
	{
		RecordPageWithFreeSpace(idxrel, blk, freespace);
		FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
	}

	return off;
}
468 | | |
/*
 * Initialize a page with the given type.
 *
 * Caller is responsible for marking it dirty, as appropriate.
 */
void
brin_page_init(Page page, uint16 type)
{
	PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));

	/* record the page type in the special space PageInit just set up */
	BrinPageType(page) = type;
}
481 | | |
482 | | /* |
483 | | * Initialize a new BRIN index's metapage. |
484 | | */ |
485 | | void |
486 | | brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version) |
487 | 0 | { |
488 | 0 | BrinMetaPageData *metadata; |
489 | |
|
490 | 0 | brin_page_init(page, BRIN_PAGETYPE_META); |
491 | |
|
492 | 0 | metadata = (BrinMetaPageData *) PageGetContents(page); |
493 | |
|
494 | 0 | metadata->brinMagic = BRIN_META_MAGIC; |
495 | 0 | metadata->brinVersion = version; |
496 | 0 | metadata->pagesPerRange = pagesPerRange; |
497 | | |
498 | | /* |
499 | | * Note we cheat here a little. 0 is not a valid revmap block number |
500 | | * (because it's the metapage buffer), but doing this enables the first |
501 | | * revmap page to be created when the index is. |
502 | | */ |
503 | 0 | metadata->lastRevmapPage = 0; |
504 | | |
505 | | /* |
506 | | * Set pd_lower just past the end of the metadata. This is essential, |
507 | | * because without doing so, metadata will be lost if xlog.c compresses |
508 | | * the page. |
509 | | */ |
510 | 0 | ((PageHeader) page)->pd_lower = |
511 | 0 | ((char *) metadata + sizeof(BrinMetaPageData)) - (char *) page; |
512 | 0 | } |
513 | | |
514 | | /* |
515 | | * Initiate page evacuation protocol. |
516 | | * |
517 | | * The page must be locked in exclusive mode by the caller. |
518 | | * |
519 | | * If the page is not yet initialized or empty, return false without doing |
520 | | * anything; it can be used for revmap without any further changes. If it |
521 | | * contains tuples, mark it for evacuation and return true. |
522 | | */ |
523 | | bool |
524 | | brin_start_evacuating_page(Relation idxRel, Buffer buf) |
525 | 0 | { |
526 | 0 | OffsetNumber off; |
527 | 0 | OffsetNumber maxoff; |
528 | 0 | Page page; |
529 | |
|
530 | 0 | page = BufferGetPage(buf); |
531 | |
|
532 | 0 | if (PageIsNew(page)) |
533 | 0 | return false; |
534 | | |
535 | 0 | maxoff = PageGetMaxOffsetNumber(page); |
536 | 0 | for (off = FirstOffsetNumber; off <= maxoff; off++) |
537 | 0 | { |
538 | 0 | ItemId lp; |
539 | |
|
540 | 0 | lp = PageGetItemId(page, off); |
541 | 0 | if (ItemIdIsUsed(lp)) |
542 | 0 | { |
543 | | /* |
544 | | * Prevent other backends from adding more stuff to this page: |
545 | | * BRIN_EVACUATE_PAGE informs br_page_get_freespace that this page |
546 | | * can no longer be used to add new tuples. Note that this flag |
547 | | * is not WAL-logged, except accidentally. |
548 | | */ |
549 | 0 | BrinPageFlags(page) |= BRIN_EVACUATE_PAGE; |
550 | 0 | MarkBufferDirtyHint(buf, true); |
551 | |
|
552 | 0 | return true; |
553 | 0 | } |
554 | 0 | } |
555 | 0 | return false; |
556 | 0 | } |
557 | | |
/*
 * Move all tuples out of a page.
 *
 * The caller must hold lock on the page.  The lock and pin are released.
 */
void
brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
				   BrinRevmap *revmap, Buffer buf)
{
	OffsetNumber off;
	OffsetNumber maxoff;
	Page		page;
	BrinTuple  *btup = NULL;
	Size		btupsz = 0;

	page = BufferGetPage(buf);

	Assert(BrinPageFlags(page) & BRIN_EVACUATE_PAGE);

	maxoff = PageGetMaxOffsetNumber(page);
	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		BrinTuple  *tup;
		Size		sz;
		ItemId		lp;

		CHECK_FOR_INTERRUPTS();

		lp = PageGetItemId(page, off);
		if (ItemIdIsUsed(lp))
		{
			/*
			 * Copy the tuple to backend-local memory (reusing btup's
			 * allocation across iterations), because we must release the
			 * buffer lock before calling brin_doupdate, and the tuple could
			 * move or change underneath us once we do.
			 */
			sz = ItemIdGetLength(lp);
			tup = (BrinTuple *) PageGetItem(page, lp);
			tup = brin_copy_tuple(tup, sz, btup, &btupsz);

			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/*
			 * Re-insert the tuple elsewhere (samepage=false).  If the update
			 * fails because of concurrent activity, back up the loop counter
			 * so we retry this same offset.
			 */
			if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
							   buf, off, tup, sz, tup, sz, false))
				off--;			/* retry */

			LockBuffer(buf, BUFFER_LOCK_SHARE);

			/* It's possible that someone extended the revmap over this page */
			if (!BRIN_IS_REGULAR_PAGE(page))
				break;
		}
	}

	UnlockReleaseBuffer(buf);
}
609 | | |
/*
 * Given a BRIN index page, initialize it if necessary, and record its
 * current free space in the FSM.
 *
 * The main use for this is when, during vacuuming, an uninitialized page is
 * found, which could be the result of relation extension followed by a crash
 * before the page can be used.
 *
 * Here, we don't bother to update upper FSM pages, instead expecting that our
 * caller (brin_vacuum_scan) will fix them at the end of the scan.  Elsewhere
 * in this file, it's generally a good idea to propagate additions of free
 * space into the upper FSM pages immediately.
 */
void
brin_page_cleanup(Relation idxrel, Buffer buf)
{
	Page		page = BufferGetPage(buf);

	/*
	 * If a page was left uninitialized, initialize it now; also record it in
	 * FSM.
	 *
	 * Somebody else might be extending the relation concurrently.  To avoid
	 * re-initializing the page before they can grab the buffer lock, we
	 * acquire the extension lock momentarily.  Since they hold the extension
	 * lock from before getting the page and after its been initialized, we're
	 * sure to see their initialization.
	 */
	if (PageIsNew(page))
	{
		/* wait out any in-progress extension, then drop the lock at once */
		LockRelationForExtension(idxrel, ShareLock);
		UnlockRelationForExtension(idxrel, ShareLock);

		/* recheck under the buffer lock: the extender may have won the race */
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		if (PageIsNew(page))
		{
			brin_initialize_empty_new_buffer(idxrel, buf);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			return;
		}
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}

	/* Nothing to be done for non-regular index pages */
	if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
		BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
		return;

	/* Measure free space and record it */
	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
							br_page_get_freespace(page));
}
662 | | |
/*
 * Return a pinned and exclusively locked buffer which can be used to insert an
 * index item of size itemsz (caller must ensure not to request sizes
 * impossible to fulfill).  If oldbuf is a valid buffer, it is also locked (in
 * an order determined to avoid deadlocks).
 *
 * If we find that the old page is no longer a regular index page (because
 * of a revmap extension), the old buffer is unlocked and we return
 * InvalidBuffer.
 *
 * If there's no existing page with enough free space to accommodate the new
 * item, the relation is extended.  If this happens, *extended is set to true,
 * and it is the caller's responsibility to initialize the page (and WAL-log
 * that fact) prior to use.  The caller should also update the FSM with the
 * page's remaining free space after the insertion.
 *
 * Note that the caller is not expected to update FSM unless *extended is set
 * true.  This policy means that we'll update FSM when a page is created, and
 * when it's found to have too little space for a desired tuple insertion,
 * but not every single time we add a tuple to the page.
 *
 * Note that in some corner cases it is possible for this routine to extend
 * the relation and then not return the new page.  It is this routine's
 * responsibility to WAL-log the page initialization and to record the page in
 * FSM if that happens, since the caller certainly can't do it.
 */
static Buffer
brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
					 bool *extended)
{
	BlockNumber oldblk;
	BlockNumber newblk;
	Page		page;
	Size		freespace;

	/* callers must have checked */
	Assert(itemsz <= BrinMaxItemSize);

	if (BufferIsValid(oldbuf))
		oldblk = BufferGetBlockNumber(oldbuf);
	else
		oldblk = InvalidBlockNumber;

	/* Choose initial target page, re-using existing target if known */
	newblk = RelationGetTargetBlock(irel);
	if (newblk == InvalidBlockNumber)
		newblk = GetPageWithFreeSpace(irel, itemsz);

	/*
	 * Loop until we find a page with sufficient free space.  By the time we
	 * return to caller out of this loop, both buffers are valid and locked;
	 * if we have to restart here, neither page is locked and newblk isn't
	 * pinned (if it's even valid).
	 */
	for (;;)
	{
		Buffer		buf;
		bool		extensionLockHeld = false;

		CHECK_FOR_INTERRUPTS();

		*extended = false;

		if (newblk == InvalidBlockNumber)
		{
			/*
			 * There's not enough free space in any existing index page,
			 * according to the FSM: extend the relation to obtain a shiny new
			 * page.
			 *
			 * XXX: It's likely possible to use RBM_ZERO_AND_LOCK here,
			 * which'd avoid the need to hold the extension lock during buffer
			 * reclaim.
			 */
			if (!RELATION_IS_LOCAL(irel))
			{
				LockRelationForExtension(irel, ExclusiveLock);
				extensionLockHeld = true;
			}
			buf = ReadBuffer(irel, P_NEW);
			newblk = BufferGetBlockNumber(buf);
			*extended = true;

			BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u",
					   BufferGetBlockNumber(buf)));
		}
		else if (newblk == oldblk)
		{
			/*
			 * There's an odd corner-case here where the FSM is out-of-date,
			 * and gave us the old page.
			 */
			buf = oldbuf;
		}
		else
		{
			buf = ReadBuffer(irel, newblk);
		}

		/*
		 * We lock the old buffer first, if it's earlier than the new one; but
		 * then we need to check that it hasn't been turned into a revmap page
		 * concurrently.  If we detect that that happened, give up and tell
		 * caller to start over.
		 */
		if (BufferIsValid(oldbuf) && oldblk < newblk)
		{
			LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
			if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
			{
				LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

				/*
				 * It is possible that the new page was obtained from
				 * extending the relation.  In that case, we must be sure to
				 * record it in the FSM before leaving, because otherwise the
				 * space would be lost forever.  However, we cannot let an
				 * uninitialized page get in the FSM, so we need to initialize
				 * it first.
				 */
				if (*extended)
					brin_initialize_empty_new_buffer(irel, buf);

				if (extensionLockHeld)
					UnlockRelationForExtension(irel, ExclusiveLock);

				ReleaseBuffer(buf);

				if (*extended)
				{
					FreeSpaceMapVacuumRange(irel, newblk, newblk + 1);
					/* shouldn't matter, but don't confuse caller */
					*extended = false;
				}

				return InvalidBuffer;
			}
		}

		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

		if (extensionLockHeld)
			UnlockRelationForExtension(irel, ExclusiveLock);

		page = BufferGetPage(buf);

		/*
		 * We have a new buffer to insert into.  Check that the new page has
		 * enough free space, and return it if it does; otherwise start over.
		 * (br_page_get_freespace also checks that the FSM didn't hand us a
		 * page that has since been repurposed for the revmap.)
		 */
		freespace = *extended ?
			BrinMaxItemSize : br_page_get_freespace(page);
		if (freespace >= itemsz)
		{
			RelationSetTargetBlock(irel, newblk);

			/*
			 * Lock the old buffer if not locked already.  Note that in this
			 * case we know for sure it's a regular page: it's later than the
			 * new page we just got, which is not a revmap page, and revmap
			 * pages are always consecutive.
			 */
			if (BufferIsValid(oldbuf) && oldblk > newblk)
			{
				LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
				Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
			}

			return buf;
		}

		/* This page is no good. */

		/*
		 * If an entirely new page does not contain enough free space for the
		 * new item, then surely that item is oversized.  Complain loudly; but
		 * first make sure we initialize the page and record it as free, for
		 * next time.
		 */
		if (*extended)
		{
			brin_initialize_empty_new_buffer(irel, buf);
			/* since this should not happen, skip FreeSpaceMapVacuum */

			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
							itemsz, freespace, RelationGetRelationName(irel))));
			return InvalidBuffer;	/* keep compiler quiet */
		}

		/* release our locks/pins before looping back to retry */
		if (newblk != oldblk)
			UnlockReleaseBuffer(buf);
		if (BufferIsValid(oldbuf) && oldblk <= newblk)
			LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * Update the FSM with the new, presumably smaller, freespace value
		 * for this page, then search for a new target page.
		 */
		newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
	}
}
868 | | |
869 | | /* |
870 | | * Initialize a page as an empty regular BRIN page, WAL-log this, and record |
871 | | * the page in FSM. |
872 | | * |
873 | | * There are several corner situations in which we extend the relation to |
874 | | * obtain a new page and later find that we cannot use it immediately. When |
875 | | * that happens, we don't want to leave the page go unrecorded in FSM, because |
876 | | * there is no mechanism to get the space back and the index would bloat. |
877 | | * Also, because we would not WAL-log the action that would initialize the |
878 | | * page, the page would go uninitialized in a standby (or after recovery). |
879 | | * |
880 | | * While we record the page in FSM here, caller is responsible for doing FSM |
881 | | * upper-page update if that seems appropriate. |
882 | | */ |
883 | | static void |
884 | | brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer) |
885 | 0 | { |
886 | 0 | Page page; |
887 | |
|
888 | 0 | BRIN_elog((DEBUG2, |
889 | 0 | "brin_initialize_empty_new_buffer: initializing blank page %u", |
890 | 0 | BufferGetBlockNumber(buffer))); |
891 | |
|
892 | 0 | START_CRIT_SECTION(); |
893 | 0 | page = BufferGetPage(buffer); |
894 | 0 | brin_page_init(page, BRIN_PAGETYPE_REGULAR); |
895 | 0 | MarkBufferDirty(buffer); |
896 | 0 | log_newpage_buffer(buffer, true); |
897 | 0 | END_CRIT_SECTION(); |
898 | | |
899 | | /* |
900 | | * We update the FSM for this page, but this is not WAL-logged. This is |
901 | | * acceptable because VACUUM will scan the index and update the FSM with |
902 | | * pages whose FSM records were forgotten in a crash. |
903 | | */ |
904 | 0 | RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer), |
905 | 0 | br_page_get_freespace(page)); |
906 | 0 | } |
907 | | |
908 | | |
909 | | /* |
910 | | * Return the amount of free space on a regular BRIN index page. |
911 | | * |
912 | | * If the page is not a regular page, or has been marked with the |
913 | | * BRIN_EVACUATE_PAGE flag, returns 0. |
914 | | */ |
915 | | static Size |
916 | | br_page_get_freespace(Page page) |
917 | 0 | { |
918 | 0 | if (!BRIN_IS_REGULAR_PAGE(page) || |
919 | 0 | (BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0) |
920 | 0 | return 0; |
921 | 0 | else |
922 | 0 | return PageGetFreeSpace(page); |
923 | 0 | } |