Coverage Report

Created: 2025-06-13 06:06

/src/postgres/src/backend/access/brin/brin_pageops.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * brin_pageops.c
3
 *    Page-handling routines for BRIN indexes
4
 *
5
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
6
 * Portions Copyright (c) 1994, Regents of the University of California
7
 *
8
 * IDENTIFICATION
9
 *    src/backend/access/brin/brin_pageops.c
10
 */
11
#include "postgres.h"
12
13
#include "access/brin_page.h"
14
#include "access/brin_pageops.h"
15
#include "access/brin_revmap.h"
16
#include "access/brin_xlog.h"
17
#include "access/xloginsert.h"
18
#include "miscadmin.h"
19
#include "storage/bufmgr.h"
20
#include "storage/freespace.h"
21
#include "storage/lmgr.h"
22
#include "utils/rel.h"
23
24
/*
 * Maximum size of an entry in a BRIN_PAGETYPE_REGULAR page.  We can tolerate
 * a single item per page, unlike other index AMs.
 *
 * This is BLCKSZ minus the page header plus one line pointer, minus the
 * BRIN special space, all suitably MAXALIGN'ed.
 */
#define BrinMaxItemSize \
	MAXALIGN_DOWN(BLCKSZ - \
				  (MAXALIGN(SizeOfPageHeaderData + \
							sizeof(ItemIdData)) + \
				   MAXALIGN(sizeof(BrinSpecialSpace))))
33
34
static Buffer brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
35
                   bool *extended);
36
static Size br_page_get_freespace(Page page);
37
static void brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer);
38
39
40
/*
 * Update tuple origtup (size origsz), located in offset oldoff of buffer
 * oldbuf, to newtup (size newsz) as summary tuple for the page range starting
 * at heapBlk.  oldbuf must not be locked on entry, and is not locked at exit.
 *
 * If samepage is true, attempt to put the new tuple in the same page, but if
 * there's no room, use some other one.
 *
 * If the update is successful, return true; the revmap is updated to point to
 * the new tuple.  If the update is not done for whatever reason, return false.
 * Caller may retry the update if this happens.
 */
bool
brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, BlockNumber heapBlk,
			  Buffer oldbuf, OffsetNumber oldoff,
			  const BrinTuple *origtup, Size origsz,
			  const BrinTuple *newtup, Size newsz,
			  bool samepage)
{
	Page		oldpage;
	ItemId		oldlp;
	BrinTuple  *oldtup;
	Size		oldsz;
	Buffer		newbuf;
	BlockNumber newblk = InvalidBlockNumber;
	bool		extended;

	Assert(newsz == MAXALIGN(newsz));

	/* If the item is oversized, don't bother. */
	if (newsz > BrinMaxItemSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
						newsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
		return false;			/* keep compiler quiet */
	}

	/* make sure the revmap is long enough to contain the entry we need */
	brinRevmapExtend(revmap, heapBlk);

	if (!samepage)
	{
		/*
		 * need a page on which to put the item; this also locks oldbuf in an
		 * order chosen to avoid deadlocks (see brin_getinsertbuffer)
		 */
		newbuf = brin_getinsertbuffer(idxrel, oldbuf, newsz, &extended);
		if (!BufferIsValid(newbuf))
		{
			Assert(!extended);
			return false;
		}

		/*
		 * Note: it's possible (though unlikely) that the returned newbuf is
		 * the same as oldbuf, if brin_getinsertbuffer determined that the old
		 * buffer does in fact have enough space.
		 */
		if (newbuf == oldbuf)
		{
			Assert(!extended);
			newbuf = InvalidBuffer;
		}
		else
			newblk = BufferGetBlockNumber(newbuf);
	}
	else
	{
		/* same-page update requested: just lock the old buffer */
		LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
		newbuf = InvalidBuffer;
		extended = false;
	}
	oldpage = BufferGetPage(oldbuf);
	oldlp = PageGetItemId(oldpage, oldoff);

	/*
	 * Check that the old tuple wasn't updated concurrently: it might have
	 * moved someplace else entirely, and for that matter the whole page
	 * might've become a revmap page.  Note that in the first two cases
	 * checked here, the "oldlp" we just calculated is garbage; but
	 * PageGetItemId() is simple enough that it was safe to do that
	 * calculation anyway.
	 */
	if (!BRIN_IS_REGULAR_PAGE(oldpage) ||
		oldoff > PageGetMaxOffsetNumber(oldpage) ||
		!ItemIdIsNormal(oldlp))
	{
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * If this happens, and the new buffer was obtained by extending the
		 * relation, then we need to ensure we don't leave it uninitialized or
		 * forget about it.
		 */
		if (BufferIsValid(newbuf))
		{
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}
		return false;
	}

	oldsz = ItemIdGetLength(oldlp);
	oldtup = (BrinTuple *) PageGetItem(oldpage, oldlp);

	/*
	 * ... or it might have been updated in place to different contents.
	 */
	if (!brin_tuples_equal(oldtup, oldsz, origtup, origsz))
	{
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		if (BufferIsValid(newbuf))
		{
			/* As above, initialize and record new page if we got one */
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}
		return false;
	}

	/*
	 * Great, the old tuple is intact.  We can proceed with the update.
	 *
	 * If there's enough room in the old page for the new tuple, replace it.
	 *
	 * Note that there might now be enough space on the page even though the
	 * caller told us there isn't, if a concurrent update moved another tuple
	 * elsewhere or replaced a tuple with a smaller one.
	 */
	if (((BrinPageFlags(oldpage) & BRIN_EVACUATE_PAGE) == 0) &&
		brin_can_do_samepage_update(oldbuf, origsz, newsz))
	{
		START_CRIT_SECTION();
		if (!PageIndexTupleOverwrite(oldpage, oldoff, (Item) unconstify(BrinTuple *, newtup), newsz))
			elog(ERROR, "failed to replace BRIN tuple");
		MarkBufferDirty(oldbuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(idxrel))
		{
			xl_brin_samepage_update xlrec;
			XLogRecPtr	recptr;
			uint8		info = XLOG_BRIN_SAMEPAGE_UPDATE;

			xlrec.offnum = oldoff;

			XLogBeginInsert();
			XLogRegisterData(&xlrec, SizeOfBrinSamepageUpdate);

			XLogRegisterBuffer(0, oldbuf, REGBUF_STANDARD);
			XLogRegisterBufData(0, newtup, newsz);

			recptr = XLogInsert(RM_BRIN_ID, info);

			PageSetLSN(oldpage, recptr);
		}

		END_CRIT_SECTION();

		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		if (BufferIsValid(newbuf))
		{
			/* As above, initialize and record new page if we got one */
			if (extended)
				brin_initialize_empty_new_buffer(idxrel, newbuf);
			UnlockReleaseBuffer(newbuf);
			if (extended)
				FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}

		return true;
	}
	else if (newbuf == InvalidBuffer)
	{
		/*
		 * Not enough space, but caller said that there was. Tell them to
		 * start over.
		 */
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		return false;
	}
	else
	{
		/*
		 * Not enough free space on the oldpage. Put the new tuple on the new
		 * page, and update the revmap.
		 */
		Page		newpage = BufferGetPage(newbuf);
		Buffer		revmapbuf;
		ItemPointerData newtid;
		OffsetNumber newoff;
		Size		freespace = 0;

		revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

		START_CRIT_SECTION();

		/*
		 * We need to initialize the page if it's newly obtained.  Note we
		 * will WAL-log the initialization as part of the update, so we don't
		 * need to do that here.
		 */
		if (extended)
			brin_page_init(newpage, BRIN_PAGETYPE_REGULAR);

		/* remove the old tuple and insert the new one on the new page */
		PageIndexTupleDeleteNoCompact(oldpage, oldoff);
		newoff = PageAddItem(newpage, (Item) unconstify(BrinTuple *, newtup), newsz,
							 InvalidOffsetNumber, false, false);
		if (newoff == InvalidOffsetNumber)
			elog(ERROR, "failed to add BRIN tuple to new page");
		MarkBufferDirty(oldbuf);
		MarkBufferDirty(newbuf);

		/* needed to update FSM below */
		if (extended)
			freespace = br_page_get_freespace(newpage);

		/* point the revmap entry at the tuple's new location */
		ItemPointerSet(&newtid, newblk, newoff);
		brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, newtid);
		MarkBufferDirty(revmapbuf);

		/* XLOG stuff */
		if (RelationNeedsWAL(idxrel))
		{
			xl_brin_update xlrec;
			XLogRecPtr	recptr;
			uint8		info;

			info = XLOG_BRIN_UPDATE | (extended ? XLOG_BRIN_INIT_PAGE : 0);

			xlrec.insert.offnum = newoff;
			xlrec.insert.heapBlk = heapBlk;
			xlrec.insert.pagesPerRange = pagesPerRange;
			xlrec.oldOffnum = oldoff;

			XLogBeginInsert();

			/* new page */
			XLogRegisterData(&xlrec, SizeOfBrinUpdate);

			XLogRegisterBuffer(0, newbuf, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
			XLogRegisterBufData(0, newtup, newsz);

			/* revmap page */
			XLogRegisterBuffer(1, revmapbuf, 0);

			/* old page */
			XLogRegisterBuffer(2, oldbuf, REGBUF_STANDARD);

			recptr = XLogInsert(RM_BRIN_ID, info);

			/* all three pages carry the same LSN */
			PageSetLSN(oldpage, recptr);
			PageSetLSN(newpage, recptr);
			PageSetLSN(BufferGetPage(revmapbuf), recptr);
		}

		END_CRIT_SECTION();

		LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);
		LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
		UnlockReleaseBuffer(newbuf);

		if (extended)
		{
			RecordPageWithFreeSpace(idxrel, newblk, freespace);
			FreeSpaceMapVacuumRange(idxrel, newblk, newblk + 1);
		}

		return true;
	}
}
318
319
/*
320
 * Return whether brin_doupdate can do a samepage update.
321
 */
322
bool
323
brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)
324
0
{
325
0
  return
326
0
    ((newsz <= origsz) ||
327
0
     PageGetExactFreeSpace(BufferGetPage(buffer)) >= (newsz - origsz));
328
0
}
329
330
/*
 * Insert an index tuple into the index relation.  The revmap is updated to
 * mark the range containing the given page as pointing to the inserted entry.
 * A WAL record is written.
 *
 * The buffer, if valid, is first checked for free space to insert the new
 * entry; if there isn't enough, a new buffer is obtained and pinned.  No
 * buffer lock must be held on entry, no buffer lock is held on exit.
 *
 * Return value is the offset number where the tuple was inserted.
 */
OffsetNumber
brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
			  BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
			  BrinTuple *tup, Size itemsz)
{
	Page		page;
	BlockNumber blk;
	OffsetNumber off;
	Size		freespace = 0;
	Buffer		revmapbuf;
	ItemPointerData tid;
	bool		extended;

	Assert(itemsz == MAXALIGN(itemsz));

	/* If the item is oversized, don't even bother. */
	if (itemsz > BrinMaxItemSize)
	{
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
						itemsz, BrinMaxItemSize, RelationGetRelationName(idxrel))));
		return InvalidOffsetNumber; /* keep compiler quiet */
	}

	/* Make sure the revmap is long enough to contain the entry we need */
	brinRevmapExtend(revmap, heapBlk);

	/*
	 * Acquire lock on buffer supplied by caller, if any.  If it doesn't have
	 * enough space, unpin it to obtain a new one below.
	 */
	if (BufferIsValid(*buffer))
	{
		/*
		 * It's possible that another backend (or ourselves!) extended the
		 * revmap over the page we held a pin on, so we cannot assume that
		 * it's still a regular page.
		 */
		LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
		if (br_page_get_freespace(BufferGetPage(*buffer)) < itemsz)
		{
			UnlockReleaseBuffer(*buffer);
			*buffer = InvalidBuffer;
		}
	}

	/*
	 * If we still don't have a usable buffer, have brin_getinsertbuffer
	 * obtain one for us.
	 */
	if (!BufferIsValid(*buffer))
	{
		/*
		 * Loop: with InvalidBuffer as oldbuf, brin_getinsertbuffer cannot
		 * fail for the "old page became revmap" reason, but retry anyway
		 * for safety.
		 */
		do
			*buffer = brin_getinsertbuffer(idxrel, InvalidBuffer, itemsz, &extended);
		while (!BufferIsValid(*buffer));
	}
	else
		extended = false;

	/* Now obtain lock on revmap buffer */
	revmapbuf = brinLockRevmapPageForUpdate(revmap, heapBlk);

	page = BufferGetPage(*buffer);
	blk = BufferGetBlockNumber(*buffer);

	/* Execute the actual insertion */
	START_CRIT_SECTION();
	if (extended)
		brin_page_init(page, BRIN_PAGETYPE_REGULAR);
	off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
					  false, false);
	if (off == InvalidOffsetNumber)
		elog(ERROR, "failed to add BRIN tuple to new page");
	MarkBufferDirty(*buffer);

	/* needed to update FSM below */
	if (extended)
		freespace = br_page_get_freespace(page);

	/* point the revmap entry at the newly-inserted tuple */
	ItemPointerSet(&tid, blk, off);
	brinSetHeapBlockItemptr(revmapbuf, pagesPerRange, heapBlk, tid);
	MarkBufferDirty(revmapbuf);

	/* XLOG stuff */
	if (RelationNeedsWAL(idxrel))
	{
		xl_brin_insert xlrec;
		XLogRecPtr	recptr;
		uint8		info;

		info = XLOG_BRIN_INSERT | (extended ? XLOG_BRIN_INIT_PAGE : 0);
		xlrec.heapBlk = heapBlk;
		xlrec.pagesPerRange = pagesPerRange;
		xlrec.offnum = off;

		XLogBeginInsert();
		XLogRegisterData(&xlrec, SizeOfBrinInsert);

		XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD | (extended ? REGBUF_WILL_INIT : 0));
		XLogRegisterBufData(0, tup, itemsz);

		XLogRegisterBuffer(1, revmapbuf, 0);

		recptr = XLogInsert(RM_BRIN_ID, info);

		PageSetLSN(page, recptr);
		PageSetLSN(BufferGetPage(revmapbuf), recptr);
	}

	END_CRIT_SECTION();

	/* Tuple is firmly on buffer; we can release our locks */
	LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
	LockBuffer(revmapbuf, BUFFER_LOCK_UNLOCK);

	BRIN_elog((DEBUG2, "inserted tuple (%u,%u) for range starting at %u",
			   blk, off, heapBlk));

	if (extended)
	{
		RecordPageWithFreeSpace(idxrel, blk, freespace);
		FreeSpaceMapVacuumRange(idxrel, blk, blk + 1);
	}

	return off;
}
468
469
/*
470
 * Initialize a page with the given type.
471
 *
472
 * Caller is responsible for marking it dirty, as appropriate.
473
 */
474
void
475
brin_page_init(Page page, uint16 type)
476
0
{
477
0
  PageInit(page, BLCKSZ, sizeof(BrinSpecialSpace));
478
479
0
  BrinPageType(page) = type;
480
0
}
481
482
/*
483
 * Initialize a new BRIN index's metapage.
484
 */
485
void
486
brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
487
0
{
488
0
  BrinMetaPageData *metadata;
489
490
0
  brin_page_init(page, BRIN_PAGETYPE_META);
491
492
0
  metadata = (BrinMetaPageData *) PageGetContents(page);
493
494
0
  metadata->brinMagic = BRIN_META_MAGIC;
495
0
  metadata->brinVersion = version;
496
0
  metadata->pagesPerRange = pagesPerRange;
497
498
  /*
499
   * Note we cheat here a little.  0 is not a valid revmap block number
500
   * (because it's the metapage buffer), but doing this enables the first
501
   * revmap page to be created when the index is.
502
   */
503
0
  metadata->lastRevmapPage = 0;
504
505
  /*
506
   * Set pd_lower just past the end of the metadata.  This is essential,
507
   * because without doing so, metadata will be lost if xlog.c compresses
508
   * the page.
509
   */
510
0
  ((PageHeader) page)->pd_lower =
511
0
    ((char *) metadata + sizeof(BrinMetaPageData)) - (char *) page;
512
0
}
513
514
/*
515
 * Initiate page evacuation protocol.
516
 *
517
 * The page must be locked in exclusive mode by the caller.
518
 *
519
 * If the page is not yet initialized or empty, return false without doing
520
 * anything; it can be used for revmap without any further changes.  If it
521
 * contains tuples, mark it for evacuation and return true.
522
 */
523
bool
524
brin_start_evacuating_page(Relation idxRel, Buffer buf)
525
0
{
526
0
  OffsetNumber off;
527
0
  OffsetNumber maxoff;
528
0
  Page    page;
529
530
0
  page = BufferGetPage(buf);
531
532
0
  if (PageIsNew(page))
533
0
    return false;
534
535
0
  maxoff = PageGetMaxOffsetNumber(page);
536
0
  for (off = FirstOffsetNumber; off <= maxoff; off++)
537
0
  {
538
0
    ItemId    lp;
539
540
0
    lp = PageGetItemId(page, off);
541
0
    if (ItemIdIsUsed(lp))
542
0
    {
543
      /*
544
       * Prevent other backends from adding more stuff to this page:
545
       * BRIN_EVACUATE_PAGE informs br_page_get_freespace that this page
546
       * can no longer be used to add new tuples.  Note that this flag
547
       * is not WAL-logged, except accidentally.
548
       */
549
0
      BrinPageFlags(page) |= BRIN_EVACUATE_PAGE;
550
0
      MarkBufferDirtyHint(buf, true);
551
552
0
      return true;
553
0
    }
554
0
  }
555
0
  return false;
556
0
}
557
558
/*
 * Move all tuples out of a page.
 *
 * The caller must hold lock on the page. The lock and pin are released.
 */
void
brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
				   BrinRevmap *revmap, Buffer buf)
{
	OffsetNumber off;
	OffsetNumber maxoff;
	Page		page;
	BrinTuple  *btup = NULL;
	Size		btupsz = 0;

	page = BufferGetPage(buf);

	Assert(BrinPageFlags(page) & BRIN_EVACUATE_PAGE);

	maxoff = PageGetMaxOffsetNumber(page);
	for (off = FirstOffsetNumber; off <= maxoff; off++)
	{
		BrinTuple  *tup;
		Size		sz;
		ItemId		lp;

		CHECK_FOR_INTERRUPTS();

		lp = PageGetItemId(page, off);
		if (ItemIdIsUsed(lp))
		{
			/*
			 * Copy the tuple before releasing the lock: brin_doupdate needs
			 * the page unlocked, and the on-page copy could move meanwhile.
			 */
			sz = ItemIdGetLength(lp);
			tup = (BrinTuple *) PageGetItem(page, lp);
			tup = brin_copy_tuple(tup, sz, btup, &btupsz);

			LockBuffer(buf, BUFFER_LOCK_UNLOCK);

			/* Move the tuple to some other page (samepage = false). */
			if (!brin_doupdate(idxRel, pagesPerRange, revmap, tup->bt_blkno,
							   buf, off, tup, sz, tup, sz, false))
				off--;			/* retry */

			LockBuffer(buf, BUFFER_LOCK_SHARE);

			/* It's possible that someone extended the revmap over this page */
			if (!BRIN_IS_REGULAR_PAGE(page))
				break;
		}
	}

	UnlockReleaseBuffer(buf);
}
609
610
/*
 * Given a BRIN index page, initialize it if necessary, and record its
 * current free space in the FSM.
 *
 * The main use for this is when, during vacuuming, an uninitialized page is
 * found, which could be the result of relation extension followed by a crash
 * before the page can be used.
 *
 * Here, we don't bother to update upper FSM pages, instead expecting that our
 * caller (brin_vacuum_scan) will fix them at the end of the scan.  Elsewhere
 * in this file, it's generally a good idea to propagate additions of free
 * space into the upper FSM pages immediately.
 */
void
brin_page_cleanup(Relation idxrel, Buffer buf)
{
	Page		page = BufferGetPage(buf);

	/*
	 * If a page was left uninitialized, initialize it now; also record it in
	 * FSM.
	 *
	 * Somebody else might be extending the relation concurrently.  To avoid
	 * re-initializing the page before they can grab the buffer lock, we
	 * acquire the extension lock momentarily.  Since they hold the extension
	 * lock from before getting the page and after its been initialized, we're
	 * sure to see their initialization.
	 */
	if (PageIsNew(page))
	{
		/* acquire-and-release: just wait out any concurrent extender */
		LockRelationForExtension(idxrel, ShareLock);
		UnlockRelationForExtension(idxrel, ShareLock);

		/* recheck under buffer lock; the page may have been initialized */
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		if (PageIsNew(page))
		{
			brin_initialize_empty_new_buffer(idxrel, buf);
			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
			return;
		}
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}

	/* Nothing to be done for non-regular index pages */
	if (BRIN_IS_META_PAGE(BufferGetPage(buf)) ||
		BRIN_IS_REVMAP_PAGE(BufferGetPage(buf)))
		return;

	/* Measure free space and record it */
	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buf),
							br_page_get_freespace(page));
}
662
663
/*
 * Return a pinned and exclusively locked buffer which can be used to insert an
 * index item of size itemsz (caller must ensure not to request sizes
 * impossible to fulfill).  If oldbuf is a valid buffer, it is also locked (in
 * an order determined to avoid deadlocks).
 *
 * If we find that the old page is no longer a regular index page (because
 * of a revmap extension), the old buffer is unlocked and we return
 * InvalidBuffer.
 *
 * If there's no existing page with enough free space to accommodate the new
 * item, the relation is extended.  If this happens, *extended is set to true,
 * and it is the caller's responsibility to initialize the page (and WAL-log
 * that fact) prior to use.  The caller should also update the FSM with the
 * page's remaining free space after the insertion.
 *
 * Note that the caller is not expected to update FSM unless *extended is set
 * true.  This policy means that we'll update FSM when a page is created, and
 * when it's found to have too little space for a desired tuple insertion,
 * but not every single time we add a tuple to the page.
 *
 * Note that in some corner cases it is possible for this routine to extend
 * the relation and then not return the new page.  It is this routine's
 * responsibility to WAL-log the page initialization and to record the page in
 * FSM if that happens, since the caller certainly can't do it.
 */
static Buffer
brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
					 bool *extended)
{
	BlockNumber oldblk;
	BlockNumber newblk;
	Page		page;
	Size		freespace;

	/* callers must have checked */
	Assert(itemsz <= BrinMaxItemSize);

	if (BufferIsValid(oldbuf))
		oldblk = BufferGetBlockNumber(oldbuf);
	else
		oldblk = InvalidBlockNumber;

	/* Choose initial target page, re-using existing target if known */
	newblk = RelationGetTargetBlock(irel);
	if (newblk == InvalidBlockNumber)
		newblk = GetPageWithFreeSpace(irel, itemsz);

	/*
	 * Loop until we find a page with sufficient free space.  By the time we
	 * return to caller out of this loop, both buffers are valid and locked;
	 * if we have to restart here, neither page is locked and newblk isn't
	 * pinned (if it's even valid).
	 */
	for (;;)
	{
		Buffer		buf;
		bool		extensionLockHeld = false;

		CHECK_FOR_INTERRUPTS();

		*extended = false;

		if (newblk == InvalidBlockNumber)
		{
			/*
			 * There's not enough free space in any existing index page,
			 * according to the FSM: extend the relation to obtain a shiny new
			 * page.
			 *
			 * XXX: It's likely possible to use RBM_ZERO_AND_LOCK here,
			 * which'd avoid the need to hold the extension lock during buffer
			 * reclaim.
			 */
			if (!RELATION_IS_LOCAL(irel))
			{
				LockRelationForExtension(irel, ExclusiveLock);
				extensionLockHeld = true;
			}
			buf = ReadBuffer(irel, P_NEW);
			newblk = BufferGetBlockNumber(buf);
			*extended = true;

			BRIN_elog((DEBUG2, "brin_getinsertbuffer: extending to page %u",
					   BufferGetBlockNumber(buf)));
		}
		else if (newblk == oldblk)
		{
			/*
			 * There's an odd corner-case here where the FSM is out-of-date,
			 * and gave us the old page.
			 */
			buf = oldbuf;
		}
		else
		{
			buf = ReadBuffer(irel, newblk);
		}

		/*
		 * We lock the old buffer first, if it's earlier than the new one; but
		 * then we need to check that it hasn't been turned into a revmap page
		 * concurrently.  If we detect that that happened, give up and tell
		 * caller to start over.
		 */
		if (BufferIsValid(oldbuf) && oldblk < newblk)
		{
			LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
			if (!BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
			{
				LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

				/*
				 * It is possible that the new page was obtained from
				 * extending the relation.  In that case, we must be sure to
				 * record it in the FSM before leaving, because otherwise the
				 * space would be lost forever.  However, we cannot let an
				 * uninitialized page get in the FSM, so we need to initialize
				 * it first.
				 */
				if (*extended)
					brin_initialize_empty_new_buffer(irel, buf);

				if (extensionLockHeld)
					UnlockRelationForExtension(irel, ExclusiveLock);

				ReleaseBuffer(buf);

				if (*extended)
				{
					FreeSpaceMapVacuumRange(irel, newblk, newblk + 1);
					/* shouldn't matter, but don't confuse caller */
					*extended = false;
				}

				return InvalidBuffer;
			}
		}

		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

		if (extensionLockHeld)
			UnlockRelationForExtension(irel, ExclusiveLock);

		page = BufferGetPage(buf);

		/*
		 * We have a new buffer to insert into.  Check that the new page has
		 * enough free space, and return it if it does; otherwise start over.
		 * (br_page_get_freespace also checks that the FSM didn't hand us a
		 * page that has since been repurposed for the revmap.)
		 */
		freespace = *extended ?
			BrinMaxItemSize : br_page_get_freespace(page);
		if (freespace >= itemsz)
		{
			RelationSetTargetBlock(irel, newblk);

			/*
			 * Lock the old buffer if not locked already.  Note that in this
			 * case we know for sure it's a regular page: it's later than the
			 * new page we just got, which is not a revmap page, and revmap
			 * pages are always consecutive.
			 */
			if (BufferIsValid(oldbuf) && oldblk > newblk)
			{
				LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
				Assert(BRIN_IS_REGULAR_PAGE(BufferGetPage(oldbuf)));
			}

			return buf;
		}

		/* This page is no good. */

		/*
		 * If an entirely new page does not contain enough free space for the
		 * new item, then surely that item is oversized.  Complain loudly; but
		 * first make sure we initialize the page and record it as free, for
		 * next time.
		 */
		if (*extended)
		{
			brin_initialize_empty_new_buffer(irel, buf);
			/* since this should not happen, skip FreeSpaceMapVacuum */

			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
							itemsz, freespace, RelationGetRelationName(irel))));
			return InvalidBuffer;	/* keep compiler quiet */
		}

		if (newblk != oldblk)
			UnlockReleaseBuffer(buf);
		if (BufferIsValid(oldbuf) && oldblk <= newblk)
			LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);

		/*
		 * Update the FSM with the new, presumably smaller, freespace value
		 * for this page, then search for a new target page.
		 */
		newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
	}
}
868
869
/*
 * Initialize a page as an empty regular BRIN page, WAL-log this, and record
 * the page in FSM.
 *
 * There are several corner situations in which we extend the relation to
 * obtain a new page and later find that we cannot use it immediately.  When
 * that happens, we don't want to leave the page go unrecorded in FSM, because
 * there is no mechanism to get the space back and the index would bloat.
 * Also, because we would not WAL-log the action that would initialize the
 * page, the page would go uninitialized in a standby (or after recovery).
 *
 * While we record the page in FSM here, caller is responsible for doing FSM
 * upper-page update if that seems appropriate.
 */
static void
brin_initialize_empty_new_buffer(Relation idxrel, Buffer buffer)
{
	Page		page;

	BRIN_elog((DEBUG2,
			   "brin_initialize_empty_new_buffer: initializing blank page %u",
			   BufferGetBlockNumber(buffer)));

	/* Init the page and WAL-log it as a full-page image. */
	START_CRIT_SECTION();
	page = BufferGetPage(buffer);
	brin_page_init(page, BRIN_PAGETYPE_REGULAR);
	MarkBufferDirty(buffer);
	log_newpage_buffer(buffer, true);
	END_CRIT_SECTION();

	/*
	 * We update the FSM for this page, but this is not WAL-logged.  This is
	 * acceptable because VACUUM will scan the index and update the FSM with
	 * pages whose FSM records were forgotten in a crash.
	 */
	RecordPageWithFreeSpace(idxrel, BufferGetBlockNumber(buffer),
							br_page_get_freespace(page));
}
907
908
909
/*
910
 * Return the amount of free space on a regular BRIN index page.
911
 *
912
 * If the page is not a regular page, or has been marked with the
913
 * BRIN_EVACUATE_PAGE flag, returns 0.
914
 */
915
static Size
916
br_page_get_freespace(Page page)
917
0
{
918
0
  if (!BRIN_IS_REGULAR_PAGE(page) ||
919
0
    (BrinPageFlags(page) & BRIN_EVACUATE_PAGE) != 0)
920
0
    return 0;
921
0
  else
922
0
    return PageGetFreeSpace(page);
923
0
}