Coverage Report

Created: 2025-09-27 06:52

/src/postgres/src/backend/access/nbtree/nbtpage.c
Line
Count
Source
1
/*-------------------------------------------------------------------------
2
 *
3
 * nbtpage.c
4
 *    BTree-specific page management code for the Postgres btree access
5
 *    method.
6
 *
7
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
8
 * Portions Copyright (c) 1994, Regents of the University of California
9
 *
10
 *
11
 * IDENTIFICATION
12
 *    src/backend/access/nbtree/nbtpage.c
13
 *
14
 *  NOTES
15
 *     Postgres btree pages look like ordinary relation pages.  The opaque
16
 *     data at high addresses includes pointers to left and right siblings
17
 *     and flag data describing page state.  The first page in a btree, page
18
 *     zero, is special -- it stores meta-information describing the tree.
19
 *     Pages one and higher store the actual tree data.
20
 *
21
 *-------------------------------------------------------------------------
22
 */
23
#include "postgres.h"
24
25
#include "access/nbtree.h"
26
#include "access/nbtxlog.h"
27
#include "access/tableam.h"
28
#include "access/transam.h"
29
#include "access/xlog.h"
30
#include "access/xloginsert.h"
31
#include "common/int.h"
32
#include "miscadmin.h"
33
#include "storage/indexfsm.h"
34
#include "storage/predicate.h"
35
#include "storage/procarray.h"
36
#include "utils/memdebug.h"
37
#include "utils/memutils.h"
38
#include "utils/snapmgr.h"
39
40
static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
41
static void _bt_delitems_delete(Relation rel, Buffer buf,
42
                TransactionId snapshotConflictHorizon,
43
                bool isCatalogRel,
44
                OffsetNumber *deletable, int ndeletable,
45
                BTVacuumPosting *updatable, int nupdatable);
46
static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
47
                 OffsetNumber *updatedoffsets,
48
                 Size *updatedbuflen, bool needswal);
49
static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel,
50
                   Buffer leafbuf, BTStack stack);
51
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
52
                   BlockNumber scanblkno,
53
                   bool *rightsib_empty,
54
                   BTVacState *vstate);
55
static bool _bt_lock_subtree_parent(Relation rel, Relation heaprel,
56
                  BlockNumber child, BTStack stack,
57
                  Buffer *subtreeparent, OffsetNumber *poffset,
58
                  BlockNumber *topparent,
59
                  BlockNumber *topparentrightsib);
60
static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target,
61
                 FullTransactionId safexid);
62
63
/*
64
 *  _bt_initmetapage() -- Fill a page buffer with a correct metapage image
65
 */
66
void
67
_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
68
         bool allequalimage)
69
0
{
70
0
  BTMetaPageData *metad;
71
0
  BTPageOpaque metaopaque;
72
73
0
  _bt_pageinit(page, BLCKSZ);
74
75
0
  metad = BTPageGetMeta(page);
76
0
  metad->btm_magic = BTREE_MAGIC;
77
0
  metad->btm_version = BTREE_VERSION;
78
0
  metad->btm_root = rootbknum;
79
0
  metad->btm_level = level;
80
0
  metad->btm_fastroot = rootbknum;
81
0
  metad->btm_fastlevel = level;
82
0
  metad->btm_last_cleanup_num_delpages = 0;
83
0
  metad->btm_last_cleanup_num_heap_tuples = -1.0;
84
0
  metad->btm_allequalimage = allequalimage;
85
86
0
  metaopaque = BTPageGetOpaque(page);
87
0
  metaopaque->btpo_flags = BTP_META;
88
89
  /*
90
   * Set pd_lower just past the end of the metadata.  This is essential,
91
   * because without doing so, metadata will be lost if xlog.c compresses
92
   * the page.
93
   */
94
0
  ((PageHeader) page)->pd_lower =
95
0
    ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
96
0
}
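The comment above notes that pd_lower must be set just past the metadata so that full-page-image "hole" compression cannot discard it. The following standalone sketch (not part of nbtpage.c) shows the same offset arithmetic in isolation; SketchPageHeader and SketchMeta are invented stand-ins for PageHeaderData and BTMetaPageData.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define SKETCH_BLCKSZ 8192

/* simplified stand-ins for PageHeaderData and BTMetaPageData (sketch only) */
typedef struct SketchPageHeader
{
    uint16_t    pd_lower;       /* offset to start of free space */
    uint16_t    pd_upper;       /* offset to end of free space */
} SketchPageHeader;

typedef struct SketchMeta
{
    uint32_t    magic;
    uint32_t    version;
} SketchMeta;

int
main(void)
{
    char       *page = calloc(1, SKETCH_BLCKSZ);
    SketchPageHeader *hdr = (SketchPageHeader *) page;
    SketchMeta *meta = (SketchMeta *) (page + sizeof(SketchPageHeader));

    meta->magic = 0x053162;     /* arbitrary magic number, for show only */
    meta->version = 4;

    /* pd_lower = byte offset of the first free byte after the metadata */
    hdr->pd_lower = (uint16_t) (((char *) meta + sizeof(SketchMeta)) - page);
    hdr->pd_upper = SKETCH_BLCKSZ;

    /* everything below pd_lower survives full-page-image hole compression */
    printf("pd_lower = %u\n", hdr->pd_lower);

    free(page);
    return 0;
}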
97
98
/*
99
 *  _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version
100
 *    3, the last version that can be updated without broadly affecting
101
 *    on-disk compatibility.  (A REINDEX is required to upgrade to v4.)
102
 *
103
 *    This routine does purely in-memory image upgrade.  Caller is
104
 *    responsible for locking, WAL-logging etc.
105
 */
106
void
107
_bt_upgrademetapage(Page page)
108
0
{
109
0
  BTMetaPageData *metad;
110
0
  BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;
111
112
0
  metad = BTPageGetMeta(page);
113
0
  metaopaque = BTPageGetOpaque(page);
114
115
  /* It must really be a meta page of an upgradable version */
116
0
  Assert(metaopaque->btpo_flags & BTP_META);
117
0
  Assert(metad->btm_version < BTREE_NOVAC_VERSION);
118
0
  Assert(metad->btm_version >= BTREE_MIN_VERSION);
119
120
  /* Set version number and fill extra fields added into version 3 */
121
0
  metad->btm_version = BTREE_NOVAC_VERSION;
122
0
  metad->btm_last_cleanup_num_delpages = 0;
123
0
  metad->btm_last_cleanup_num_heap_tuples = -1.0;
124
  /* Only a REINDEX can set this field */
125
0
  Assert(!metad->btm_allequalimage);
126
0
  metad->btm_allequalimage = false;
127
128
  /* Adjust pd_lower (see _bt_initmetapage() for details) */
129
0
  ((PageHeader) page)->pd_lower =
130
0
    ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
131
0
}
132
133
/*
134
 * Get metadata from share-locked buffer containing metapage, while performing
135
 * standard sanity checks.
136
 *
137
 * Callers that cache data returned here in local cache should note that an
138
 * on-the-fly upgrade using _bt_upgrademetapage() can change the version field
139
 * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.
140
 */
141
static BTMetaPageData *
142
_bt_getmeta(Relation rel, Buffer metabuf)
143
0
{
144
0
  Page    metapg;
145
0
  BTPageOpaque metaopaque;
146
0
  BTMetaPageData *metad;
147
148
0
  metapg = BufferGetPage(metabuf);
149
0
  metaopaque = BTPageGetOpaque(metapg);
150
0
  metad = BTPageGetMeta(metapg);
151
152
  /* sanity-check the metapage */
153
0
  if (!P_ISMETA(metaopaque) ||
154
0
    metad->btm_magic != BTREE_MAGIC)
155
0
    ereport(ERROR,
156
0
        (errcode(ERRCODE_INDEX_CORRUPTED),
157
0
         errmsg("index \"%s\" is not a btree",
158
0
            RelationGetRelationName(rel))));
159
160
0
  if (metad->btm_version < BTREE_MIN_VERSION ||
161
0
    metad->btm_version > BTREE_VERSION)
162
0
    ereport(ERROR,
163
0
        (errcode(ERRCODE_INDEX_CORRUPTED),
164
0
         errmsg("version mismatch in index \"%s\": file version %d, "
165
0
            "current version %d, minimal supported version %d",
166
0
            RelationGetRelationName(rel),
167
0
            metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
168
169
0
  return metad;
170
0
}
171
172
/*
173
 * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
174
 *
175
 * Called by btvacuumcleanup when btbulkdelete was never called because no
176
 * index tuples needed to be deleted.
177
 */
178
bool
179
_bt_vacuum_needs_cleanup(Relation rel)
180
0
{
181
0
  Buffer    metabuf;
182
0
  Page    metapg;
183
0
  BTMetaPageData *metad;
184
0
  uint32    btm_version;
185
0
  BlockNumber prev_num_delpages;
186
187
  /*
188
   * Copy details from metapage to local variables quickly.
189
   *
190
   * Note that we deliberately avoid using cached version of metapage here.
191
   */
192
0
  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
193
0
  metapg = BufferGetPage(metabuf);
194
0
  metad = BTPageGetMeta(metapg);
195
0
  btm_version = metad->btm_version;
196
197
0
  if (btm_version < BTREE_NOVAC_VERSION)
198
0
  {
199
    /*
200
     * Metapage needs to be dynamically upgraded to store fields that are
201
     * only present when btm_version >= BTREE_NOVAC_VERSION
202
     */
203
0
    _bt_relbuf(rel, metabuf);
204
0
    return true;
205
0
  }
206
207
0
  prev_num_delpages = metad->btm_last_cleanup_num_delpages;
208
0
  _bt_relbuf(rel, metabuf);
209
210
  /*
211
   * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
212
   * total size of the index.  We can reasonably expect (though are not
213
   * guaranteed) to be able to recycle this many pages if we decide to do a
214
   * btvacuumscan call during the ongoing btvacuumcleanup.  For further
215
   * details see the nbtree/README section on placing deleted pages in the
216
   * FSM.
217
   */
218
0
  if (prev_num_delpages > 0 &&
219
0
    prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)
220
0
    return true;
221
222
0
  return false;
223
0
}
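For reference, the 5% heuristic described above reduces to a single integer comparison: cleanup is worthwhile once the deleted-page count recorded by the previous VACUUM exceeds one twentieth of the index's current block count. The helper below is a minimal sketch of that check; cleanup_needed is an invented name, not a PostgreSQL function.

#include <stdbool.h>
#include <stdint.h>
#include <assert.h>

/* sketch of the threshold test in _bt_vacuum_needs_cleanup() */
static bool
cleanup_needed(uint32_t prev_num_delpages, uint32_t total_blocks)
{
    /* integer division mirrors RelationGetNumberOfBlocks(rel) / 20 */
    return prev_num_delpages > 0 && prev_num_delpages > total_blocks / 20;
}

int
main(void)
{
    assert(!cleanup_needed(0, 1000));   /* nothing deleted */
    assert(!cleanup_needed(50, 1000));  /* exactly 5%: not strictly greater */
    assert(cleanup_needed(51, 1000));   /* just over 5%: trigger cleanup */
    return 0;
}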
224
225
/*
226
 * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup.
227
 *
228
 * Called at the end of btvacuumcleanup, when num_delpages value has been
229
 * finalized.
230
 */
231
void
232
_bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)
233
0
{
234
0
  Buffer    metabuf;
235
0
  Page    metapg;
236
0
  BTMetaPageData *metad;
237
238
  /*
239
   * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
240
   * field started out as a TransactionId field called btm_oldest_btpo_xact.
241
   * Both "versions" are just uint32 fields.  It was convenient to repurpose
242
   * the field when we began to use 64-bit XIDs in deleted pages.
243
   *
244
   * It's possible that a pg_upgrade'd database will contain an XID value in
245
   * what is now recognized as the metapage's btm_last_cleanup_num_delpages
246
   * field.  _bt_vacuum_needs_cleanup() may even believe that this value
247
   * indicates that there are lots of pages that it needs to recycle, when
248
   * in reality there are only one or two.  The worst that can happen is
249
   * that there will be a call to btvacuumscan a little earlier, which will
250
   * set btm_last_cleanup_num_delpages to a sane value when we're called.
251
   *
252
   * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is
253
   * no longer used as of PostgreSQL 14.  We set it to -1.0 on rewrite, just
254
   * to be consistent.
255
   */
256
0
  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
257
0
  metapg = BufferGetPage(metabuf);
258
0
  metad = BTPageGetMeta(metapg);
259
260
  /* Don't miss the chance to upgrade the index/metapage when it is still at BTREE_MIN_VERSION */
261
0
  if (metad->btm_version >= BTREE_NOVAC_VERSION &&
262
0
    metad->btm_last_cleanup_num_delpages == num_delpages)
263
0
  {
264
    /* Usually means index continues to have num_delpages of 0 */
265
0
    _bt_relbuf(rel, metabuf);
266
0
    return;
267
0
  }
268
269
  /* trade in our read lock for a write lock */
270
0
  _bt_unlockbuf(rel, metabuf);
271
0
  _bt_lockbuf(rel, metabuf, BT_WRITE);
272
273
0
  START_CRIT_SECTION();
274
275
  /* upgrade meta-page if needed */
276
0
  if (metad->btm_version < BTREE_NOVAC_VERSION)
277
0
    _bt_upgrademetapage(metapg);
278
279
  /* update cleanup-related information */
280
0
  metad->btm_last_cleanup_num_delpages = num_delpages;
281
0
  metad->btm_last_cleanup_num_heap_tuples = -1.0;
282
0
  MarkBufferDirty(metabuf);
283
284
  /* write wal record if needed */
285
0
  if (RelationNeedsWAL(rel))
286
0
  {
287
0
    xl_btree_metadata md;
288
0
    XLogRecPtr  recptr;
289
290
0
    XLogBeginInsert();
291
0
    XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
292
293
0
    Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
294
0
    md.version = metad->btm_version;
295
0
    md.root = metad->btm_root;
296
0
    md.level = metad->btm_level;
297
0
    md.fastroot = metad->btm_fastroot;
298
0
    md.fastlevel = metad->btm_fastlevel;
299
0
    md.last_cleanup_num_delpages = num_delpages;
300
0
    md.allequalimage = metad->btm_allequalimage;
301
302
0
    XLogRegisterBufData(0, &md, sizeof(xl_btree_metadata));
303
304
0
    recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);
305
306
0
    PageSetLSN(metapg, recptr);
307
0
  }
308
309
0
  END_CRIT_SECTION();
310
311
0
  _bt_relbuf(rel, metabuf);
312
0
}
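The "trade in our read lock for a write lock" step above is a general pattern: the upgrade is not atomic, so either the protected state must be re-checked after the write lock is taken or the update must be idempotent, as it is here. A rough standalone sketch of that pattern, using a pthread rwlock as a stand-in for the buffer lock; all names are invented.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* stand-ins: meta_lock plays the role of the metapage buffer lock */
static pthread_rwlock_t meta_lock = PTHREAD_RWLOCK_INITIALIZER;
static uint32_t shared_num_delpages = 0;

static void
set_cleanup_info_sketch(uint32_t num_delpages)
{
    pthread_rwlock_rdlock(&meta_lock);
    if (shared_num_delpages == num_delpages)
    {
        /* common case: nothing changed, no write lock needed */
        pthread_rwlock_unlock(&meta_lock);
        return;
    }
    pthread_rwlock_unlock(&meta_lock);

    /*
     * Non-atomic upgrade: another thread may run between unlock and wrlock.
     * That is tolerable here because the assignment below is idempotent.
     */
    pthread_rwlock_wrlock(&meta_lock);
    shared_num_delpages = num_delpages;
    pthread_rwlock_unlock(&meta_lock);
}

int
main(void)
{
    set_cleanup_info_sketch(7);
    printf("num_delpages = %u\n", (unsigned) shared_num_delpages);
    return 0;
}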
313
314
/*
315
 *  _bt_getroot() -- Get the root page of the btree.
316
 *
317
 *    Since the root page can move around the btree file, we have to read
318
 *    its location from the metadata page, and then read the root page
319
 *    itself.  If no root page exists yet, we have to create one.
320
 *
321
 *    The access type parameter (BT_READ or BT_WRITE) controls whether
322
 *    a new root page will be created or not.  If access = BT_READ,
323
 *    and no root page exists, we just return InvalidBuffer.  For
324
 *    BT_WRITE, we try to create the root page if it doesn't exist.
325
 *    NOTE that the returned root page will have only a read lock set
326
 *    on it even if access = BT_WRITE!
327
 *
328
 *    If access = BT_WRITE, heaprel must be set; otherwise caller can just
329
 *    pass NULL.  See _bt_allocbuf for an explanation.
330
 *
331
 *    The returned page is not necessarily the true root --- it could be
332
 *    a "fast root" (a page that is alone in its level due to deletions).
333
 *    Also, if the root page is split while we are "in flight" to it,
334
 *    what we will return is the old root, which is now just the leftmost
335
 *    page on a probably-not-very-wide level.  For most purposes this is
336
 *    as good as or better than the true root, so we do not bother to
337
 *    insist on finding the true root.  We do, however, guarantee to
338
 *    return a live (not deleted or half-dead) page.
339
 *
340
 *    On successful return, the root page is pinned and read-locked.
341
 *    The metadata page is not locked or pinned on exit.
342
 */
343
Buffer
344
_bt_getroot(Relation rel, Relation heaprel, int access)
345
0
{
346
0
  Buffer    metabuf;
347
0
  Buffer    rootbuf;
348
0
  Page    rootpage;
349
0
  BTPageOpaque rootopaque;
350
0
  BlockNumber rootblkno;
351
0
  uint32    rootlevel;
352
0
  BTMetaPageData *metad;
353
354
0
  Assert(access == BT_READ || heaprel != NULL);
355
356
  /*
357
   * Try to use previously-cached metapage data to find the root.  This
358
   * normally saves one buffer access per index search, which is a very
359
   * helpful savings in bufmgr traffic and hence contention.
360
   */
361
0
  if (rel->rd_amcache != NULL)
362
0
  {
363
0
    metad = (BTMetaPageData *) rel->rd_amcache;
364
    /* We shouldn't have cached it if any of these fail */
365
0
    Assert(metad->btm_magic == BTREE_MAGIC);
366
0
    Assert(metad->btm_version >= BTREE_MIN_VERSION);
367
0
    Assert(metad->btm_version <= BTREE_VERSION);
368
0
    Assert(!metad->btm_allequalimage ||
369
0
         metad->btm_version > BTREE_NOVAC_VERSION);
370
0
    Assert(metad->btm_root != P_NONE);
371
372
0
    rootblkno = metad->btm_fastroot;
373
0
    Assert(rootblkno != P_NONE);
374
0
    rootlevel = metad->btm_fastlevel;
375
376
0
    rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);
377
0
    rootpage = BufferGetPage(rootbuf);
378
0
    rootopaque = BTPageGetOpaque(rootpage);
379
380
    /*
381
     * Since the cache might be stale, we check the page more carefully
382
     * here than normal.  We *must* check that it's not deleted. If it's
383
     * not alone on its level, then we reject too --- this may be overly
384
     * paranoid but better safe than sorry.  Note we don't check P_ISROOT,
385
     * because that's not set in a "fast root".
386
     */
387
0
    if (!P_IGNORE(rootopaque) &&
388
0
      rootopaque->btpo_level == rootlevel &&
389
0
      P_LEFTMOST(rootopaque) &&
390
0
      P_RIGHTMOST(rootopaque))
391
0
    {
392
      /* OK, accept cached page as the root */
393
0
      return rootbuf;
394
0
    }
395
0
    _bt_relbuf(rel, rootbuf);
396
    /* Cache is stale, throw it away */
397
0
    if (rel->rd_amcache)
398
0
      pfree(rel->rd_amcache);
399
0
    rel->rd_amcache = NULL;
400
0
  }
401
402
0
  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
403
0
  metad = _bt_getmeta(rel, metabuf);
404
405
  /* if no root page initialized yet, do it */
406
0
  if (metad->btm_root == P_NONE)
407
0
  {
408
0
    Page    metapg;
409
410
    /* If access = BT_READ, caller doesn't want us to create root yet */
411
0
    if (access == BT_READ)
412
0
    {
413
0
      _bt_relbuf(rel, metabuf);
414
0
      return InvalidBuffer;
415
0
    }
416
417
    /* trade in our read lock for a write lock */
418
0
    _bt_unlockbuf(rel, metabuf);
419
0
    _bt_lockbuf(rel, metabuf, BT_WRITE);
420
421
    /*
422
     * Race condition:  if someone else initialized the metadata between
423
     * the time we released the read lock and acquired the write lock, we
424
     * must avoid doing it again.
425
     */
426
0
    if (metad->btm_root != P_NONE)
427
0
    {
428
      /*
429
       * Metadata initialized by someone else.  In order to guarantee no
430
       * deadlocks, we have to release the metadata page and start all
431
       * over again.  (Is that really true? But it's hardly worth trying
432
       * to optimize this case.)
433
       */
434
0
      _bt_relbuf(rel, metabuf);
435
0
      return _bt_getroot(rel, heaprel, access);
436
0
    }
437
438
    /*
439
     * Get, initialize, write, and leave a lock of the appropriate type on
440
     * the new root page.  Since this is the first page in the tree, it's
441
     * a leaf as well as the root.
442
     */
443
0
    rootbuf = _bt_allocbuf(rel, heaprel);
444
0
    rootblkno = BufferGetBlockNumber(rootbuf);
445
0
    rootpage = BufferGetPage(rootbuf);
446
0
    rootopaque = BTPageGetOpaque(rootpage);
447
0
    rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
448
0
    rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
449
0
    rootopaque->btpo_level = 0;
450
0
    rootopaque->btpo_cycleid = 0;
451
    /* Get raw page pointer for metapage */
452
0
    metapg = BufferGetPage(metabuf);
453
454
    /* NO ELOG(ERROR) till meta is updated */
455
0
    START_CRIT_SECTION();
456
457
    /* upgrade metapage if needed */
458
0
    if (metad->btm_version < BTREE_NOVAC_VERSION)
459
0
      _bt_upgrademetapage(metapg);
460
461
0
    metad->btm_root = rootblkno;
462
0
    metad->btm_level = 0;
463
0
    metad->btm_fastroot = rootblkno;
464
0
    metad->btm_fastlevel = 0;
465
0
    metad->btm_last_cleanup_num_delpages = 0;
466
0
    metad->btm_last_cleanup_num_heap_tuples = -1.0;
467
468
0
    MarkBufferDirty(rootbuf);
469
0
    MarkBufferDirty(metabuf);
470
471
    /* XLOG stuff */
472
0
    if (RelationNeedsWAL(rel))
473
0
    {
474
0
      xl_btree_newroot xlrec;
475
0
      XLogRecPtr  recptr;
476
0
      xl_btree_metadata md;
477
478
0
      XLogBeginInsert();
479
0
      XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
480
0
      XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
481
482
0
      Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
483
0
      md.version = metad->btm_version;
484
0
      md.root = rootblkno;
485
0
      md.level = 0;
486
0
      md.fastroot = rootblkno;
487
0
      md.fastlevel = 0;
488
0
      md.last_cleanup_num_delpages = 0;
489
0
      md.allequalimage = metad->btm_allequalimage;
490
491
0
      XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
492
493
0
      xlrec.rootblk = rootblkno;
494
0
      xlrec.level = 0;
495
496
0
      XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
497
498
0
      recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
499
500
0
      PageSetLSN(rootpage, recptr);
501
0
      PageSetLSN(metapg, recptr);
502
0
    }
503
504
0
    END_CRIT_SECTION();
505
506
    /*
507
     * swap root write lock for read lock.  There is no danger of anyone
508
     * else accessing the new root page while it's unlocked, since no one
509
     * else knows where it is yet.
510
     */
511
0
    _bt_unlockbuf(rel, rootbuf);
512
0
    _bt_lockbuf(rel, rootbuf, BT_READ);
513
514
    /* okay, metadata is correct, release lock on it without caching */
515
0
    _bt_relbuf(rel, metabuf);
516
0
  }
517
0
  else
518
0
  {
519
0
    rootblkno = metad->btm_fastroot;
520
0
    Assert(rootblkno != P_NONE);
521
0
    rootlevel = metad->btm_fastlevel;
522
523
    /*
524
     * Cache the metapage data for next time
525
     */
526
0
    rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
527
0
                       sizeof(BTMetaPageData));
528
0
    memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
529
530
    /*
531
     * We are done with the metapage; arrange to release it via first
532
     * _bt_relandgetbuf call
533
     */
534
0
    rootbuf = metabuf;
535
536
0
    for (;;)
537
0
    {
538
0
      rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
539
0
      rootpage = BufferGetPage(rootbuf);
540
0
      rootopaque = BTPageGetOpaque(rootpage);
541
542
0
      if (!P_IGNORE(rootopaque))
543
0
        break;
544
545
      /* it's dead, Jim.  step right one page */
546
0
      if (P_RIGHTMOST(rootopaque))
547
0
        elog(ERROR, "no live root page found in index \"%s\"",
548
0
           RelationGetRelationName(rel));
549
0
      rootblkno = rootopaque->btpo_next;
550
0
    }
551
552
0
    if (rootopaque->btpo_level != rootlevel)
553
0
      elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
554
0
         rootblkno, RelationGetRelationName(rel),
555
0
         rootopaque->btpo_level, rootlevel);
556
0
  }
557
558
  /*
559
   * By here, we have a pin and read lock on the root page, and no lock set
560
   * on the metadata page.  Return the root page's buffer.
561
   */
562
0
  return rootbuf;
563
0
}
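The fast-root caching logic above follows a validate-or-discard pattern: use the cached metapage copy to jump straight to the remembered fast root, re-check the page once it is locked, and fall back to re-reading the metapage when the check fails. Below is a simplified, self-contained sketch of that control flow; every name and helper is invented, and this is not the real buffer-manager API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* invented stand-in for the cached BTMetaPageData copy in rd_amcache */
typedef struct CachedMeta
{
    uint32_t    fastroot;
    uint32_t    fastlevel;
} CachedMeta;

static bool page_still_matches(uint32_t blkno, uint32_t level);
static CachedMeta *read_metapage(void);

static uint32_t
get_root_sketch(CachedMeta **cache)
{
    if (*cache != NULL)
    {
        uint32_t    blkno = (*cache)->fastroot;

        if (page_still_matches(blkno, (*cache)->fastlevel))
            return blkno;       /* cache hit: metapage access saved */

        free(*cache);           /* cache is stale, throw it away */
        *cache = NULL;
    }
    *cache = read_metapage();   /* slow path: consult the metapage */
    return (*cache)->fastroot;
}

static bool
page_still_matches(uint32_t blkno, uint32_t level)
{
    /* a real check verifies the locked page's level and sibling links */
    (void) blkno;
    (void) level;
    return false;
}

static CachedMeta *
read_metapage(void)
{
    CachedMeta *m = malloc(sizeof(CachedMeta));

    m->fastroot = 3;
    m->fastlevel = 1;
    return m;
}

int
main(void)
{
    CachedMeta *cache = NULL;

    printf("root block %u\n", (unsigned) get_root_sketch(&cache)); /* no cache yet */
    printf("root block %u\n", (unsigned) get_root_sketch(&cache)); /* stale cache discarded, metapage re-read */
    free(cache);
    return 0;
}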
564
565
/*
566
 *  _bt_gettrueroot() -- Get the true root page of the btree.
567
 *
568
 *    This is the same as the BT_READ case of _bt_getroot(), except
569
 *    we follow the true-root link not the fast-root link.
570
 *
571
 * By the time we acquire lock on the root page, it might have been split and
572
 * not be the true root anymore.  This is okay for the present uses of this
573
 * routine; we only really need to be able to move up at least one tree level
574
 * from whatever non-root page we were at.  If we ever do need to lock the
575
 * one true root page, we could loop here, re-reading the metapage on each
576
 * failure.  (Note that it wouldn't do to hold the lock on the metapage while
577
 * moving to the root --- that'd deadlock against any concurrent root split.)
578
 */
579
Buffer
580
_bt_gettrueroot(Relation rel)
581
0
{
582
0
  Buffer    metabuf;
583
0
  Page    metapg;
584
0
  BTPageOpaque metaopaque;
585
0
  Buffer    rootbuf;
586
0
  Page    rootpage;
587
0
  BTPageOpaque rootopaque;
588
0
  BlockNumber rootblkno;
589
0
  uint32    rootlevel;
590
0
  BTMetaPageData *metad;
591
592
  /*
593
   * We don't try to use cached metapage data here, since (a) this path is
594
   * not performance-critical, and (b) if we are here it suggests our cache
595
   * is out-of-date anyway.  In light of point (b), it's probably safest to
596
   * actively flush any cached metapage info.
597
   */
598
0
  if (rel->rd_amcache)
599
0
    pfree(rel->rd_amcache);
600
0
  rel->rd_amcache = NULL;
601
602
0
  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
603
0
  metapg = BufferGetPage(metabuf);
604
0
  metaopaque = BTPageGetOpaque(metapg);
605
0
  metad = BTPageGetMeta(metapg);
606
607
0
  if (!P_ISMETA(metaopaque) ||
608
0
    metad->btm_magic != BTREE_MAGIC)
609
0
    ereport(ERROR,
610
0
        (errcode(ERRCODE_INDEX_CORRUPTED),
611
0
         errmsg("index \"%s\" is not a btree",
612
0
            RelationGetRelationName(rel))));
613
614
0
  if (metad->btm_version < BTREE_MIN_VERSION ||
615
0
    metad->btm_version > BTREE_VERSION)
616
0
    ereport(ERROR,
617
0
        (errcode(ERRCODE_INDEX_CORRUPTED),
618
0
         errmsg("version mismatch in index \"%s\": file version %d, "
619
0
            "current version %d, minimal supported version %d",
620
0
            RelationGetRelationName(rel),
621
0
            metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));
622
623
  /* if no root page initialized yet, fail */
624
0
  if (metad->btm_root == P_NONE)
625
0
  {
626
0
    _bt_relbuf(rel, metabuf);
627
0
    return InvalidBuffer;
628
0
  }
629
630
0
  rootblkno = metad->btm_root;
631
0
  rootlevel = metad->btm_level;
632
633
  /*
634
   * We are done with the metapage; arrange to release it via first
635
   * _bt_relandgetbuf call
636
   */
637
0
  rootbuf = metabuf;
638
639
0
  for (;;)
640
0
  {
641
0
    rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);
642
0
    rootpage = BufferGetPage(rootbuf);
643
0
    rootopaque = BTPageGetOpaque(rootpage);
644
645
0
    if (!P_IGNORE(rootopaque))
646
0
      break;
647
648
    /* it's dead, Jim.  step right one page */
649
0
    if (P_RIGHTMOST(rootopaque))
650
0
      elog(ERROR, "no live root page found in index \"%s\"",
651
0
         RelationGetRelationName(rel));
652
0
    rootblkno = rootopaque->btpo_next;
653
0
  }
654
655
0
  if (rootopaque->btpo_level != rootlevel)
656
0
    elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
657
0
       rootblkno, RelationGetRelationName(rel),
658
0
       rootopaque->btpo_level, rootlevel);
659
660
0
  return rootbuf;
661
0
}
662
663
/*
664
 *  _bt_getrootheight() -- Get the height of the btree search tree.
665
 *
666
 *    We return the level (counting from zero) of the current fast root.
667
 *    This represents the number of tree levels we'd have to descend through
668
 *    to start any btree index search.
669
 *
670
 *    This is used by the planner for cost-estimation purposes.  Since it's
671
 *    only an estimate, slightly-stale data is fine, hence we don't worry
672
 *    about updating previously cached data.
673
 */
674
int
675
_bt_getrootheight(Relation rel)
676
0
{
677
0
  BTMetaPageData *metad;
678
679
0
  if (rel->rd_amcache == NULL)
680
0
  {
681
0
    Buffer    metabuf;
682
683
0
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
684
0
    metad = _bt_getmeta(rel, metabuf);
685
686
    /*
687
     * If there's no root page yet, _bt_getroot() doesn't expect a cache
688
     * to be made, so just stop here and report the index height is zero.
689
     * (XXX perhaps _bt_getroot() should be changed to allow this case.)
690
     */
691
0
    if (metad->btm_root == P_NONE)
692
0
    {
693
0
      _bt_relbuf(rel, metabuf);
694
0
      return 0;
695
0
    }
696
697
    /*
698
     * Cache the metapage data for next time
699
     */
700
0
    rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
701
0
                       sizeof(BTMetaPageData));
702
0
    memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
703
0
    _bt_relbuf(rel, metabuf);
704
0
  }
705
706
  /* Get cached page */
707
0
  metad = (BTMetaPageData *) rel->rd_amcache;
708
  /* We shouldn't have cached it if any of these fail */
709
0
  Assert(metad->btm_magic == BTREE_MAGIC);
710
0
  Assert(metad->btm_version >= BTREE_MIN_VERSION);
711
0
  Assert(metad->btm_version <= BTREE_VERSION);
712
0
  Assert(!metad->btm_allequalimage ||
713
0
       metad->btm_version > BTREE_NOVAC_VERSION);
714
0
  Assert(metad->btm_fastroot != P_NONE);
715
716
0
  return metad->btm_fastlevel;
717
0
}
718
719
/*
720
 *  _bt_metaversion() -- Get version/status info from metapage.
721
 *
722
 *    Sets caller's *heapkeyspace and *allequalimage arguments using data
723
 *    from the B-Tree metapage (could be locally-cached version).  This
724
 *    information needs to be stashed in insertion scankey, so we provide a
725
 *    single function that fetches both at once.
726
 *
727
 *    This is used to determine the rules that must be used to descend a
728
 *    btree.  Version 4 indexes treat heap TID as a tiebreaker attribute.
729
 *    pg_upgrade'd version 3 indexes need extra steps to preserve reasonable
730
 *    performance when inserting a new BTScanInsert-wise duplicate tuple
731
 *    among many leaf pages already full of such duplicates.
732
 *
733
 *    Also sets allequalimage field, which indicates whether or not it is
734
 *    safe to apply deduplication.  We rely on the assumption that
735
 *    btm_allequalimage will be zero'ed on heapkeyspace indexes that were
736
 *    pg_upgrade'd from Postgres 12.
737
 */
738
void
739
_bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
740
0
{
741
0
  BTMetaPageData *metad;
742
743
0
  if (rel->rd_amcache == NULL)
744
0
  {
745
0
    Buffer    metabuf;
746
747
0
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
748
0
    metad = _bt_getmeta(rel, metabuf);
749
750
    /*
751
     * If there's no root page yet, _bt_getroot() doesn't expect a cache
752
     * to be made, so just stop here.  (XXX perhaps _bt_getroot() should
753
     * be changed to allow this case.)
754
     */
755
0
    if (metad->btm_root == P_NONE)
756
0
    {
757
0
      *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
758
0
      *allequalimage = metad->btm_allequalimage;
759
760
0
      _bt_relbuf(rel, metabuf);
761
0
      return;
762
0
    }
763
764
    /*
765
     * Cache the metapage data for next time
766
     *
767
     * An on-the-fly version upgrade performed by _bt_upgrademetapage()
768
     * can change the nbtree version for an index without invalidating any
769
     * local cache.  This is okay because it can only happen when moving
770
     * from version 2 to version 3, both of which are !heapkeyspace
771
     * versions.
772
     */
773
0
    rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
774
0
                       sizeof(BTMetaPageData));
775
0
    memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));
776
0
    _bt_relbuf(rel, metabuf);
777
0
  }
778
779
  /* Get cached page */
780
0
  metad = (BTMetaPageData *) rel->rd_amcache;
781
  /* We shouldn't have cached it if any of these fail */
782
0
  Assert(metad->btm_magic == BTREE_MAGIC);
783
0
  Assert(metad->btm_version >= BTREE_MIN_VERSION);
784
0
  Assert(metad->btm_version <= BTREE_VERSION);
785
0
  Assert(!metad->btm_allequalimage ||
786
0
       metad->btm_version > BTREE_NOVAC_VERSION);
787
0
  Assert(metad->btm_fastroot != P_NONE);
788
789
0
  *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;
790
0
  *allequalimage = metad->btm_allequalimage;
791
0
}
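As the comments above describe, both outputs are derived directly from metapage fields: heapkeyspace is simply "version greater than BTREE_NOVAC_VERSION", and allequalimage is passed through. A minimal sketch of that mapping follows; the helper name and constant are stand-ins for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <assert.h>

#define SKETCH_NOVAC_VERSION 3      /* mirrors BTREE_NOVAC_VERSION */

/* invented helper sketching the mapping applied by _bt_metaversion() */
static void
metaversion_sketch(uint32_t btm_version, bool btm_allequalimage,
                   bool *heapkeyspace, bool *allequalimage)
{
    *heapkeyspace = btm_version > SKETCH_NOVAC_VERSION;
    *allequalimage = btm_allequalimage;
}

int
main(void)
{
    bool        hks,
                aei;

    metaversion_sketch(4, true, &hks, &aei);
    assert(hks && aei);             /* modern index, dedup-capable */
    metaversion_sketch(3, false, &hks, &aei);
    assert(!hks && !aei);           /* pg_upgrade'd v3 index */
    return 0;
}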
792
793
/*
794
 *  _bt_checkpage() -- Verify that a freshly-read page looks sane.
795
 */
796
void
797
_bt_checkpage(Relation rel, Buffer buf)
798
0
{
799
0
  Page    page = BufferGetPage(buf);
800
801
  /*
802
   * ReadBuffer verifies that every newly-read page passes
803
   * PageHeaderIsValid, which means it either contains a reasonably sane
804
   * page header or is all-zero.  We have to defend against the all-zero
805
   * case, however.
806
   */
807
0
  if (PageIsNew(page))
808
0
    ereport(ERROR,
809
0
        (errcode(ERRCODE_INDEX_CORRUPTED),
810
0
         errmsg("index \"%s\" contains unexpected zero page at block %u",
811
0
            RelationGetRelationName(rel),
812
0
            BufferGetBlockNumber(buf)),
813
0
         errhint("Please REINDEX it.")));
814
815
  /*
816
   * Additionally check that the special area looks sane.
817
   */
818
0
  if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))
819
0
    ereport(ERROR,
820
0
        (errcode(ERRCODE_INDEX_CORRUPTED),
821
0
         errmsg("index \"%s\" contains corrupted page at block %u",
822
0
            RelationGetRelationName(rel),
823
0
            BufferGetBlockNumber(buf)),
824
0
         errhint("Please REINDEX it.")));
825
0
}
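_bt_checkpage() boils down to two tests: reject an all-zeroes page (PageIsNew, i.e. pd_upper is zero) and reject a page whose special space is not exactly the size of the btree opaque data. The sketch below reproduces those two checks against a simplified page header; it is illustrative only and does not use PostgreSQL's real page layout macros.

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>

#define SKETCH_BLCKSZ 8192

/* simplified stand-in for PostgreSQL's PageHeaderData */
typedef struct SketchPageHeader
{
    uint16_t    pd_lower;
    uint16_t    pd_upper;
    uint16_t    pd_special;     /* offset to start of special space */
} SketchPageHeader;

static bool
checkpage_sketch(const char *page, size_t opaque_size)
{
    const SketchPageHeader *hdr = (const SketchPageHeader *) page;

    /* all-zeroes page: an initialized page never has pd_upper == 0 */
    if (hdr->pd_upper == 0)
        return false;

    /* the special area must hold exactly one opaque struct */
    if ((size_t) (SKETCH_BLCKSZ - hdr->pd_special) != opaque_size)
        return false;

    return true;
}

int
main(void)
{
    char       *page = calloc(1, SKETCH_BLCKSZ);
    SketchPageHeader *hdr = (SketchPageHeader *) page;

    assert(!checkpage_sketch(page, 16));    /* zeroed page is rejected */

    hdr->pd_lower = sizeof(SketchPageHeader);
    hdr->pd_special = SKETCH_BLCKSZ - 16;
    hdr->pd_upper = hdr->pd_special;
    assert(checkpage_sketch(page, 16));     /* sane page is accepted */

    free(page);
    return 0;
}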
826
827
/*
828
 *  _bt_getbuf() -- Get an existing block in a buffer, for read or write.
829
 *
830
 *    The general rule in nbtree is that it's never okay to access a
831
 *    page without holding both a buffer pin and a buffer lock on
832
 *    the page's buffer.
833
 *
834
 *    When this routine returns, the appropriate lock is set on the
835
 *    requested buffer and its reference count has been incremented
836
 *    (ie, the buffer is "locked and pinned").  Also, we apply
837
 *    _bt_checkpage to sanity-check the page, and perform Valgrind
838
 *    client requests that help Valgrind detect unsafe page accesses.
839
 *
840
 *    Note: raw LockBuffer() calls are disallowed in nbtree; all
841
 *    buffer lock requests need to go through wrapper functions such
842
 *    as _bt_lockbuf().
843
 */
844
Buffer
845
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
846
0
{
847
0
  Buffer    buf;
848
849
0
  Assert(BlockNumberIsValid(blkno));
850
851
  /* Read an existing block of the relation */
852
0
  buf = ReadBuffer(rel, blkno);
853
0
  _bt_lockbuf(rel, buf, access);
854
0
  _bt_checkpage(rel, buf);
855
856
0
  return buf;
857
0
}
858
859
/*
860
 *  _bt_allocbuf() -- Allocate a new block/page.
861
 *
862
 * Returns a write-locked buffer containing an unallocated nbtree page.
863
 *
864
 * Callers are required to pass a valid heaprel.  We need heaprel so that we
865
 * can handle generating a snapshotConflictHorizon that makes reusing a page
866
 * from the FSM safe for queries that may be running on standbys.
867
 */
868
Buffer
869
_bt_allocbuf(Relation rel, Relation heaprel)
870
0
{
871
0
  Buffer    buf;
872
0
  BlockNumber blkno;
873
0
  Page    page;
874
875
0
  Assert(heaprel != NULL);
876
877
  /*
878
   * First see if the FSM knows of any free pages.
879
   *
880
   * We can't trust the FSM's report unreservedly; we have to check that the
881
   * page is still free.  (For example, an already-free page could have been
882
   * re-used between the time the last VACUUM scanned it and the time the
883
   * VACUUM made its FSM updates.)
884
   *
885
   * In fact, it's worse than that: we can't even assume that it's safe to
886
   * take a lock on the reported page.  If somebody else has a lock on it,
887
   * or even worse our own caller does, we could deadlock.  (The own-caller
888
   * scenario is actually not improbable. Consider an index on a serial or
889
   * timestamp column.  Nearly all splits will be at the rightmost page, so
890
   * it's entirely likely that _bt_split will call us while holding a lock
891
   * on the page most recently acquired from FSM. A VACUUM running
892
   * concurrently with the previous split could well have placed that page
893
   * back in FSM.)
894
   *
895
   * To get around that, we ask for only a conditional lock on the reported
896
   * page.  If we fail, then someone else is using the page, and we may
897
   * reasonably assume it's not free.  (If we happen to be wrong, the worst
898
   * consequence is the page will be lost to use till the next VACUUM, which
899
   * is no big problem.)
900
   */
901
0
  for (;;)
902
0
  {
903
0
    blkno = GetFreeIndexPage(rel);
904
0
    if (blkno == InvalidBlockNumber)
905
0
      break;
906
0
    buf = ReadBuffer(rel, blkno);
907
0
    if (_bt_conditionallockbuf(rel, buf))
908
0
    {
909
0
      page = BufferGetPage(buf);
910
911
      /*
912
       * It's possible to find an all-zeroes page in an index.  For
913
       * example, a backend might successfully extend the relation one
914
       * page and then crash before it is able to make a WAL entry for
915
       * adding the page.  If we find a zeroed page then reclaim it
916
       * immediately.
917
       */
918
0
      if (PageIsNew(page))
919
0
      {
920
        /* Okay to use page.  Initialize and return it. */
921
0
        _bt_pageinit(page, BufferGetPageSize(buf));
922
0
        return buf;
923
0
      }
924
925
0
      if (BTPageIsRecyclable(page, heaprel))
926
0
      {
927
        /*
928
         * If we are generating WAL for Hot Standby then create a WAL
929
         * record that will allow us to conflict with queries running
930
           * on standby, in case they have snapshots older than the safexid
931
         * value
932
         */
933
0
        if (RelationNeedsWAL(rel) && XLogStandbyInfoActive())
934
0
        {
935
0
          xl_btree_reuse_page xlrec_reuse;
936
937
          /*
938
           * Note that we don't register the buffer with the record,
939
           * because this operation doesn't modify the page (that
940
           * already happened, back when VACUUM deleted the page).
941
           * This record only exists to provide a conflict point for
942
           * Hot Standby.  See record REDO routine comments.
943
           */
944
0
          xlrec_reuse.locator = rel->rd_locator;
945
0
          xlrec_reuse.block = blkno;
946
0
          xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);
947
0
          xlrec_reuse.isCatalogRel =
948
0
            RelationIsAccessibleInLogicalDecoding(heaprel);
949
950
0
          XLogBeginInsert();
951
0
          XLogRegisterData(&xlrec_reuse, SizeOfBtreeReusePage);
952
953
0
          XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
954
0
        }
955
956
        /* Okay to use page.  Re-initialize and return it. */
957
0
        _bt_pageinit(page, BufferGetPageSize(buf));
958
0
        return buf;
959
0
      }
960
0
      elog(DEBUG2, "FSM returned nonrecyclable page");
961
0
      _bt_relbuf(rel, buf);
962
0
    }
963
0
    else
964
0
    {
965
0
      elog(DEBUG2, "FSM returned nonlockable page");
966
      /* couldn't get lock, so just drop pin */
967
0
      ReleaseBuffer(buf);
968
0
    }
969
0
  }
970
971
  /*
972
   * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we
973
   * risk a race condition against btvacuumscan --- see comments therein.
974
   * This forces us to repeat the valgrind request that _bt_lockbuf()
975
   * otherwise would make, as we can't use _bt_lockbuf() without introducing
976
   * a race.
977
   */
978
0
  buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
979
0
  if (!RelationUsesLocalBuffers(rel))
980
0
    VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
981
982
  /* Initialize the new page before returning it */
983
0
  page = BufferGetPage(buf);
984
0
  Assert(PageIsNew(page));
985
0
  _bt_pageinit(page, BufferGetPageSize(buf));
986
987
0
  return buf;
988
0
}
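The long comment at the top of _bt_allocbuf() explains why only a conditional lock may be taken on a page reported by the FSM. The standalone sketch below mirrors that loop: try each candidate with a non-blocking lock, keep it only if it is genuinely reusable, and extend the relation when no candidate works out. pthread_mutex_trylock stands in for _bt_conditionallockbuf(); every other name is invented.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NCANDIDATES 3

static pthread_mutex_t page_locks[NCANDIDATES] = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};
static bool page_recyclable[NCANDIDATES] = {false, false, true};
static int  next_candidate = 0;

static int
get_free_page_sketch(void)          /* stand-in for GetFreeIndexPage() */
{
    return next_candidate < NCANDIDATES ? next_candidate++ : -1;
}

static int
alloc_page_sketch(void)
{
    int         blkno;

    while ((blkno = get_free_page_sketch()) != -1)
    {
        if (pthread_mutex_trylock(&page_locks[blkno]) != 0)
            continue;               /* someone else holds it: assume not free */

        if (page_recyclable[blkno])
            return blkno;           /* success: return with the lock held */

        pthread_mutex_unlock(&page_locks[blkno]);   /* not reusable after all */
    }

    /* no reusable candidate: "extend the relation" by one new page */
    return NCANDIDATES;
}

int
main(void)
{
    /* simulate a candidate page already locked by someone else */
    pthread_mutex_lock(&page_locks[0]);

    printf("allocated block %d\n", alloc_page_sketch());   /* expected: 2 */
    return 0;
}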
989
990
/*
991
 *  _bt_relandgetbuf() -- release a locked buffer and get another one.
992
 *
993
 * This is equivalent to _bt_relbuf followed by _bt_getbuf.  Also, if obuf is
994
 * InvalidBuffer then it reduces to just _bt_getbuf; allowing this case
995
 * simplifies some callers.
996
 *
997
 * The original motivation for using this was to avoid two entries to the
998
 * bufmgr when one would do.  However, now it's mainly just a notational
999
 * convenience.  The only case where it saves work over _bt_relbuf/_bt_getbuf
1000
 * is when the target page is the same one already in the buffer.
1001
 */
1002
Buffer
1003
_bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
1004
0
{
1005
0
  Buffer    buf;
1006
1007
0
  Assert(BlockNumberIsValid(blkno));
1008
0
  if (BufferIsValid(obuf))
1009
0
    _bt_unlockbuf(rel, obuf);
1010
0
  buf = ReleaseAndReadBuffer(obuf, rel, blkno);
1011
0
  _bt_lockbuf(rel, buf, access);
1012
1013
0
  _bt_checkpage(rel, buf);
1014
0
  return buf;
1015
0
}
1016
1017
/*
1018
 *  _bt_relbuf() -- release a locked buffer.
1019
 *
1020
 * Lock and pin (refcount) are both dropped.
1021
 */
1022
void
1023
_bt_relbuf(Relation rel, Buffer buf)
1024
0
{
1025
0
  _bt_unlockbuf(rel, buf);
1026
0
  ReleaseBuffer(buf);
1027
0
}
1028
1029
/*
1030
 *  _bt_lockbuf() -- lock a pinned buffer.
1031
 *
1032
 * Lock is acquired without acquiring another pin.  This is like a raw
1033
 * LockBuffer() call, but performs extra steps needed by Valgrind.
1034
 *
1035
 * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
1036
 * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
1037
 */
1038
void
1039
_bt_lockbuf(Relation rel, Buffer buf, int access)
1040
0
{
1041
  /* LockBuffer() asserts that pin is held by this backend */
1042
0
  LockBuffer(buf, access);
1043
1044
  /*
1045
   * It doesn't matter that _bt_unlockbuf() won't get called in the event of
1046
   * an nbtree error (e.g. a unique violation error).  That won't cause
1047
   * Valgrind false positives.
1048
   *
1049
   * The nbtree client requests are superimposed on top of the bufmgr.c
1050
   * buffer pin client requests.  In the event of an nbtree error the buffer
1051
   * will certainly get marked as defined when the backend once again
1052
   * acquires its first pin on the buffer. (Of course, if the backend never
1053
   * touches the buffer again then it doesn't matter that it remains
1054
   * non-accessible to Valgrind.)
1055
   *
1056
   * Note: When an IndexTuple C pointer gets computed using an ItemId read
1057
   * from a page while a lock was held, the C pointer becomes unsafe to
1058
   * dereference forever as soon as the lock is released.  Valgrind can only
1059
   * detect cases where the pointer gets dereferenced with no _current_
1060
   * lock/pin held, though.
1061
   */
1062
0
  if (!RelationUsesLocalBuffers(rel))
1063
0
    VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1064
0
}
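The Valgrind client requests used by _bt_lockbuf() and _bt_unlockbuf() form a simple discipline: a page is marked defined while its buffer lock is held and no-access once the lock is released, so stale pointers into the page get caught by Memcheck. A self-contained sketch of that discipline follows; the fallback macros imitate the no-op definitions that utils/memdebug.h supplies when Valgrind support is not compiled in, and the lock itself is elided.

#include <stdlib.h>
#include <string.h>

#ifdef USE_VALGRIND
#include <valgrind/memcheck.h>
#else
/* no-op fallbacks, similar in spirit to PostgreSQL's utils/memdebug.h */
#define VALGRIND_MAKE_MEM_DEFINED(addr, len)      ((void) (addr), (void) (len))
#define VALGRIND_MAKE_MEM_NOACCESS(addr, len)     ((void) (addr), (void) (len))
#define VALGRIND_CHECK_MEM_IS_DEFINED(addr, len)  ((void) (addr), (void) (len))
#endif

#define SKETCH_BLCKSZ 8192

static void
lock_page_sketch(char *page)
{
    /* ...acquire the real buffer lock here... */
    VALGRIND_MAKE_MEM_DEFINED(page, SKETCH_BLCKSZ);
}

static void
unlock_page_sketch(char *page)
{
    /* a locked page is expected to be fully defined; check proactively */
    VALGRIND_CHECK_MEM_IS_DEFINED(page, SKETCH_BLCKSZ);
    /* ...release the real buffer lock here... */
    VALGRIND_MAKE_MEM_NOACCESS(page, SKETCH_BLCKSZ);
}

int
main(void)
{
    char       *page = malloc(SKETCH_BLCKSZ);

    memset(page, 0, SKETCH_BLCKSZ);
    lock_page_sketch(page);
    page[0] = 1;                /* legal: "lock" held, page accessible */
    unlock_page_sketch(page);
    /* dereferencing page[] here would be reported when run under Memcheck */
    free(page);
    return 0;
}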
1065
1066
/*
1067
 *  _bt_unlockbuf() -- unlock a pinned buffer.
1068
 */
1069
void
1070
_bt_unlockbuf(Relation rel, Buffer buf)
1071
0
{
1072
  /*
1073
   * Buffer is pinned and locked, which means that it is expected to be
1074
   * defined and addressable.  Check that proactively.
1075
   */
1076
0
  VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1077
1078
  /* LockBuffer() asserts that pin is held by this backend */
1079
0
  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1080
1081
0
  if (!RelationUsesLocalBuffers(rel))
1082
0
    VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
1083
0
}
1084
1085
/*
1086
 *  _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
1087
 *  buffer.
1088
 *
1089
 * Note: Caller may need to call _bt_checkpage() with buf when pin on buf
1090
 * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
1091
 */
1092
bool
1093
_bt_conditionallockbuf(Relation rel, Buffer buf)
1094
0
{
1095
  /* ConditionalLockBuffer() asserts that pin is held by this backend */
1096
0
  if (!ConditionalLockBuffer(buf))
1097
0
    return false;
1098
1099
0
  if (!RelationUsesLocalBuffers(rel))
1100
0
    VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1101
1102
0
  return true;
1103
0
}
1104
1105
/*
1106
 *  _bt_upgradelockbufcleanup() -- upgrade lock to a full cleanup lock.
1107
 */
1108
void
1109
_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
1110
0
{
1111
  /*
1112
   * Buffer is pinned and locked, which means that it is expected to be
1113
   * defined and addressable.  Check that proactively.
1114
   */
1115
0
  VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1116
1117
  /* LockBuffer() asserts that pin is held by this backend */
1118
0
  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1119
0
  LockBufferForCleanup(buf);
1120
0
}
1121
1122
/*
1123
 *  _bt_pageinit() -- Initialize a new page.
1124
 *
1125
 * On return, the page header is initialized; data space is empty;
1126
 * special space is zeroed out.
1127
 */
1128
void
1129
_bt_pageinit(Page page, Size size)
1130
0
{
1131
0
  PageInit(page, size, sizeof(BTPageOpaqueData));
1132
0
}
1133
1134
/*
1135
 * Delete item(s) from a btree leaf page during VACUUM.
1136
 *
1137
 * This routine assumes that the caller already has a full cleanup lock on
1138
 * the buffer.  Also, the given deletable and updatable arrays *must* be
1139
 * sorted in ascending order.
1140
 *
1141
 * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
1142
 * in an existing posting list item are to be removed.  This works by
1143
 * updating/overwriting an existing item with caller's new version of the item
1144
 * (a version that lacks the TIDs that are to be deleted).
1145
 *
1146
 * We record VACUUMs and b-tree deletes differently in WAL.  Deletes must
1147
 * generate their own snapshotConflictHorizon directly from the tableam,
1148
 * whereas VACUUMs rely on the initial VACUUM table scan performing
1149
 * WAL-logging that takes care of the issue for the table's indexes
1150
 * indirectly.  Also, we remove the VACUUM cycle ID from pages, which b-tree
1151
 * deletes don't do.
1152
 */
1153
void
1154
_bt_delitems_vacuum(Relation rel, Buffer buf,
1155
          OffsetNumber *deletable, int ndeletable,
1156
          BTVacuumPosting *updatable, int nupdatable)
1157
0
{
1158
0
  Page    page = BufferGetPage(buf);
1159
0
  BTPageOpaque opaque;
1160
0
  bool    needswal = RelationNeedsWAL(rel);
1161
0
  char     *updatedbuf = NULL;
1162
0
  Size    updatedbuflen = 0;
1163
0
  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1164
1165
  /* Shouldn't be called unless there's something to do */
1166
0
  Assert(ndeletable > 0 || nupdatable > 0);
1167
1168
  /* Generate new version of posting lists without deleted TIDs */
1169
0
  if (nupdatable > 0)
1170
0
    updatedbuf = _bt_delitems_update(updatable, nupdatable,
1171
0
                     updatedoffsets, &updatedbuflen,
1172
0
                     needswal);
1173
1174
  /* No ereport(ERROR) until changes are logged */
1175
0
  START_CRIT_SECTION();
1176
1177
  /*
1178
   * Handle posting tuple updates.
1179
   *
1180
   * Deliberately do this before handling simple deletes.  If we did it the
1181
   * other way around (i.e. WAL record order -- simple deletes before
1182
   * updates) then we'd have to make compensating changes to the 'updatable'
1183
   * array of offset numbers.
1184
   *
1185
   * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it
1186
   * happens to already be set.  It's important that we not interfere with
1187
   * any future simple index tuple deletion operations.
1188
   */
1189
0
  for (int i = 0; i < nupdatable; i++)
1190
0
  {
1191
0
    OffsetNumber updatedoffset = updatedoffsets[i];
1192
0
    IndexTuple  itup;
1193
0
    Size    itemsz;
1194
1195
0
    itup = updatable[i]->itup;
1196
0
    itemsz = MAXALIGN(IndexTupleSize(itup));
1197
0
    if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1198
0
                   itemsz))
1199
0
      elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1200
0
         BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1201
0
  }
1202
1203
  /* Now handle simple deletes of entire tuples */
1204
0
  if (ndeletable > 0)
1205
0
    PageIndexMultiDelete(page, deletable, ndeletable);
1206
1207
  /*
1208
   * We can clear the vacuum cycle ID since this page has certainly been
1209
   * processed by the current vacuum scan.
1210
   */
1211
0
  opaque = BTPageGetOpaque(page);
1212
0
  opaque->btpo_cycleid = 0;
1213
1214
  /*
1215
   * Clear the BTP_HAS_GARBAGE page flag.
1216
   *
1217
   * This flag indicates the presence of LP_DEAD items on the page (though
1218
   * not reliably).  Note that we only rely on it with pg_upgrade'd
1219
   * !heapkeyspace indexes.  That's why clearing it here won't usually
1220
   * interfere with simple index tuple deletion.
1221
   */
1222
0
  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1223
1224
0
  MarkBufferDirty(buf);
1225
1226
  /* XLOG stuff */
1227
0
  if (needswal)
1228
0
  {
1229
0
    XLogRecPtr  recptr;
1230
0
    xl_btree_vacuum xlrec_vacuum;
1231
1232
0
    xlrec_vacuum.ndeleted = ndeletable;
1233
0
    xlrec_vacuum.nupdated = nupdatable;
1234
1235
0
    XLogBeginInsert();
1236
0
    XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1237
0
    XLogRegisterData(&xlrec_vacuum, SizeOfBtreeVacuum);
1238
1239
0
    if (ndeletable > 0)
1240
0
      XLogRegisterBufData(0, deletable,
1241
0
                ndeletable * sizeof(OffsetNumber));
1242
1243
0
    if (nupdatable > 0)
1244
0
    {
1245
0
      XLogRegisterBufData(0, updatedoffsets,
1246
0
                nupdatable * sizeof(OffsetNumber));
1247
0
      XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1248
0
    }
1249
1250
0
    recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
1251
1252
0
    PageSetLSN(page, recptr);
1253
0
  }
1254
1255
0
  END_CRIT_SECTION();
1256
1257
  /* can't leak memory here */
1258
0
  if (updatedbuf != NULL)
1259
0
    pfree(updatedbuf);
1260
  /* free tuples allocated within _bt_delitems_update() */
1261
0
  for (int i = 0; i < nupdatable; i++)
1262
0
    pfree(updatable[i]->itup);
1263
0
}
1264
1265
/*
1266
 * Delete item(s) from a btree leaf page during single-page cleanup.
1267
 *
1268
 * This routine assumes that the caller has pinned and write locked the
1269
 * buffer.  Also, the given deletable and updatable arrays *must* be sorted in
1270
 * ascending order.
1271
 *
1272
 * Routine deals with deleting TIDs when some (but not all) of the heap TIDs
1273
 * in an existing posting list item are to be removed.  This works by
1274
 * updating/overwriting an existing item with caller's new version of the item
1275
 * (a version that lacks the TIDs that are to be deleted).
1276
 *
1277
 * This is nearly the same as _bt_delitems_vacuum as far as what it does to
1278
 * the page, but it needs its own snapshotConflictHorizon and isCatalogRel
1279
 * (from the tableam).  This is used by the REDO routine to generate recovery
1280
 * conflicts.  The other difference is that only _bt_delitems_vacuum will
1281
 * clear page's VACUUM cycle ID.
1282
 */
1283
static void
1284
_bt_delitems_delete(Relation rel, Buffer buf,
1285
          TransactionId snapshotConflictHorizon, bool isCatalogRel,
1286
          OffsetNumber *deletable, int ndeletable,
1287
          BTVacuumPosting *updatable, int nupdatable)
1288
0
{
1289
0
  Page    page = BufferGetPage(buf);
1290
0
  BTPageOpaque opaque;
1291
0
  bool    needswal = RelationNeedsWAL(rel);
1292
0
  char     *updatedbuf = NULL;
1293
0
  Size    updatedbuflen = 0;
1294
0
  OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];
1295
1296
  /* Shouldn't be called unless there's something to do */
1297
0
  Assert(ndeletable > 0 || nupdatable > 0);
1298
1299
  /* Generate new versions of posting lists without deleted TIDs */
1300
0
  if (nupdatable > 0)
1301
0
    updatedbuf = _bt_delitems_update(updatable, nupdatable,
1302
0
                     updatedoffsets, &updatedbuflen,
1303
0
                     needswal);
1304
1305
  /* No ereport(ERROR) until changes are logged */
1306
0
  START_CRIT_SECTION();
1307
1308
  /* Handle updates and deletes just like _bt_delitems_vacuum */
1309
0
  for (int i = 0; i < nupdatable; i++)
1310
0
  {
1311
0
    OffsetNumber updatedoffset = updatedoffsets[i];
1312
0
    IndexTuple  itup;
1313
0
    Size    itemsz;
1314
1315
0
    itup = updatable[i]->itup;
1316
0
    itemsz = MAXALIGN(IndexTupleSize(itup));
1317
0
    if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,
1318
0
                   itemsz))
1319
0
      elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",
1320
0
         BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1321
0
  }
1322
1323
0
  if (ndeletable > 0)
1324
0
    PageIndexMultiDelete(page, deletable, ndeletable);
1325
1326
  /*
1327
   * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at
1328
   * this point.  The VACUUM command alone controls vacuum cycle IDs.
1329
   */
1330
0
  opaque = BTPageGetOpaque(page);
1331
1332
  /*
1333
   * Clear the BTP_HAS_GARBAGE page flag.
1334
   *
1335
   * This flag indicates the presence of LP_DEAD items on the page (though
1336
   * not reliably).  Note that we only rely on it with pg_upgrade'd
1337
   * !heapkeyspace indexes.
1338
   */
1339
0
  opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
1340
1341
0
  MarkBufferDirty(buf);
1342
1343
  /* XLOG stuff */
1344
0
  if (needswal)
1345
0
  {
1346
0
    XLogRecPtr  recptr;
1347
0
    xl_btree_delete xlrec_delete;
1348
1349
0
    xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon;
1350
0
    xlrec_delete.ndeleted = ndeletable;
1351
0
    xlrec_delete.nupdated = nupdatable;
1352
0
    xlrec_delete.isCatalogRel = isCatalogRel;
1353
1354
0
    XLogBeginInsert();
1355
0
    XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1356
0
    XLogRegisterData(&xlrec_delete, SizeOfBtreeDelete);
1357
1358
0
    if (ndeletable > 0)
1359
0
      XLogRegisterBufData(0, deletable,
1360
0
                ndeletable * sizeof(OffsetNumber));
1361
1362
0
    if (nupdatable > 0)
1363
0
    {
1364
0
      XLogRegisterBufData(0, updatedoffsets,
1365
0
                nupdatable * sizeof(OffsetNumber));
1366
0
      XLogRegisterBufData(0, updatedbuf, updatedbuflen);
1367
0
    }
1368
1369
0
    recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);
1370
1371
0
    PageSetLSN(page, recptr);
1372
0
  }
1373
1374
0
  END_CRIT_SECTION();
1375
1376
  /* can't leak memory here */
1377
0
  if (updatedbuf != NULL)
1378
0
    pfree(updatedbuf);
1379
  /* free tuples allocated within _bt_delitems_update() */
1380
0
  for (int i = 0; i < nupdatable; i++)
1381
0
    pfree(updatable[i]->itup);
1382
0
}
1383
1384
/*
1385
 * Set up state needed to delete TIDs from posting list tuples via "updating"
1386
 * the tuple.  Performs steps common to both _bt_delitems_vacuum and
1387
 * _bt_delitems_delete.  These steps must take place before each function's
1388
 * critical section begins.
1389
 *
1390
 * updatable and nupdatable are inputs, though note that we will use
1391
 * _bt_update_posting() to replace the original itup with a pointer to a final
1392
 * version in palloc()'d memory.  Caller should free the tuples when it's done.
1393
 *
1394
 * The first nupdatable entries from updatedoffsets are set to the page offset
1395
 * number for posting list tuples that caller updates.  This is mostly useful
1396
 * because caller may need to WAL-log the page offsets (though we always do
1397
 * this for caller out of convenience).
1398
 *
1399
 * Returns buffer consisting of an array of xl_btree_update structs that
1400
 * describe the steps we perform here for caller (though only when needswal is
1401
 * true).  Also sets *updatedbuflen to the final size of the buffer.  This
1402
 * buffer is used by caller when WAL logging is required.
1403
 */
1404
static char *
1405
_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,
1406
          OffsetNumber *updatedoffsets, Size *updatedbuflen,
1407
          bool needswal)
1408
0
{
1409
0
  char     *updatedbuf = NULL;
1410
0
  Size    buflen = 0;
1411
1412
  /* Shouldn't be called unless there's something to do */
1413
0
  Assert(nupdatable > 0);
1414
1415
0
  for (int i = 0; i < nupdatable; i++)
1416
0
  {
1417
0
    BTVacuumPosting vacposting = updatable[i];
1418
0
    Size    itemsz;
1419
1420
    /* Replace work area IndexTuple with updated version */
1421
0
    _bt_update_posting(vacposting);
1422
1423
    /* Keep track of size of xl_btree_update for updatedbuf in passing */
1424
0
    itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);
1425
0
    buflen += itemsz;
1426
1427
    /* Build updatedoffsets buffer in passing */
1428
0
    updatedoffsets[i] = vacposting->updatedoffset;
1429
0
  }
1430
1431
  /* XLOG stuff */
1432
0
  if (needswal)
1433
0
  {
1434
0
    Size    offset = 0;
1435
1436
    /* Allocate, set final size for caller */
1437
0
    updatedbuf = palloc(buflen);
1438
0
    *updatedbuflen = buflen;
1439
0
    for (int i = 0; i < nupdatable; i++)
1440
0
    {
1441
0
      BTVacuumPosting vacposting = updatable[i];
1442
0
      Size    itemsz;
1443
0
      xl_btree_update update;
1444
1445
0
      update.ndeletedtids = vacposting->ndeletedtids;
1446
0
      memcpy(updatedbuf + offset, &update.ndeletedtids,
1447
0
           SizeOfBtreeUpdate);
1448
0
      offset += SizeOfBtreeUpdate;
1449
1450
0
      itemsz = update.ndeletedtids * sizeof(uint16);
1451
0
      memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);
1452
0
      offset += itemsz;
1453
0
    }
1454
0
  }
1455
1456
0
  return updatedbuf;
1457
0
}
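The WAL payload built above is a packed sequence of variable-length records: a fixed-size xl_btree_update header immediately followed by ndeletedtids uint16 entries, all sized in a first pass and memcpy'd in a second. The sketch below shows the same two-pass construction with invented types (UpdateSketch is not xl_btree_update) and plain malloc in place of palloc.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* invented stand-in for one posting-list update and its deleted TID slots */
typedef struct UpdateSketch
{
    uint16_t    ndeletedtids;
    uint16_t    deletetids[4];  /* up to 4 in this toy example */
} UpdateSketch;

static char *
pack_updates_sketch(const UpdateSketch *updates, int nupdates, size_t *buflen)
{
    size_t      len = 0;
    size_t      offset = 0;
    char       *buf;

    /* first pass: total size of all variable-length records */
    for (int i = 0; i < nupdates; i++)
        len += sizeof(uint16_t) + updates[i].ndeletedtids * sizeof(uint16_t);

    buf = malloc(len);

    /* second pass: copy header then payload, advancing a running offset */
    for (int i = 0; i < nupdates; i++)
    {
        memcpy(buf + offset, &updates[i].ndeletedtids, sizeof(uint16_t));
        offset += sizeof(uint16_t);
        memcpy(buf + offset, updates[i].deletetids,
               updates[i].ndeletedtids * sizeof(uint16_t));
        offset += updates[i].ndeletedtids * sizeof(uint16_t);
    }

    *buflen = len;
    return buf;
}

int
main(void)
{
    UpdateSketch updates[2] = {
        {2, {10, 11, 0, 0}},
        {1, {42, 0, 0, 0}},
    };
    size_t      buflen;
    char       *buf = pack_updates_sketch(updates, 2, &buflen);

    printf("packed %zu bytes\n", buflen);   /* 2+4 plus 2+2 = 10 bytes */
    free(buf);
    return 0;
}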
1458
1459
/*
1460
 * Comparator used by _bt_delitems_delete_check() to restore deltids array
1461
 * back to its original leaf-page-wise sort order
1462
 */
1463
static int
1464
_bt_delitems_cmp(const void *a, const void *b)
1465
0
{
1466
0
  TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a;
1467
0
  TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b;
1468
1469
0
  Assert(indexdelete1->id != indexdelete2->id);
1470
1471
0
  return pg_cmp_s16(indexdelete1->id, indexdelete2->id);
1472
0
}
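This comparator exists so that a single qsort() on the id field can restore the deltids array to its original leaf-page-wise order after the tableam has sorted it for its own purposes, as _bt_delitems_delete_check() does below. A toy demonstration of that round trip, with an invented stand-in for TM_IndexDelete:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* invented stand-in for TM_IndexDelete */
typedef struct DeleteSketch
{
    int16_t     id;             /* captures original leaf-page-wise position */
    uint32_t    blkno;          /* table block, used by the tableam's own sort */
} DeleteSketch;

static int
cmp_by_id(const void *a, const void *b)
{
    const DeleteSketch *d1 = a;
    const DeleteSketch *d2 = b;

    /* ids are unique, so this comparison never returns 0 in practice */
    return (d1->id > d2->id) - (d1->id < d2->id);
}

int
main(void)
{
    /* pretend the tableam left the array sorted by table block number */
    DeleteSketch deltids[] = {{2, 100}, {0, 205}, {1, 300}};

    qsort(deltids, 3, sizeof(DeleteSketch), cmp_by_id);

    for (int i = 0; i < 3; i++)
        printf("id=%d blkno=%u\n", deltids[i].id, (unsigned) deltids[i].blkno);
    return 0;
}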
1473
1474
/*
1475
 * Try to delete item(s) from a btree leaf page during single-page cleanup.
1476
 *
1477
 * nbtree interface to table_index_delete_tuples().  Deletes a subset of index
1478
 * tuples from caller's deltids array: those whose TIDs are found safe to
1479
 * delete by the tableam (or already marked LP_DEAD in index, and so already
1480
 * known to be deletable by our simple index deletion caller).  We physically
1481
 * delete index tuples from buf leaf page last of all (for index tuples where
1482
 * that is known to be safe following our table_index_delete_tuples() call).
1483
 *
1484
 * Simple index deletion caller only includes TIDs from index tuples marked
1485
 * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be
1486
 * included without increasing the total number of distinct table blocks for
1487
 * the deletion operation as a whole.  This approach often allows us to delete
1488
 * some extra index tuples that were practically free for tableam to check in
1489
 * passing (when they actually turn out to be safe to delete).  It probably
1490
 * only makes sense for the tableam to go ahead with these extra checks when
1491
 * it is block-oriented (otherwise the checks probably won't be practically
1492
 * free, which we rely on).  The tableam interface requires the tableam side
1493
 * to handle the problem, though, so this is okay (we as an index AM are free
1494
 * to make the simplifying assumption that all tableams must be block-based).
1495
 *
1496
 * Bottom-up index deletion caller provides all the TIDs from the leaf page,
1497
 * without expecting that tableam will check most of them.  The tableam has
1498
 * considerable discretion around which entries/blocks it checks.  Our role in
1499
 * costing the bottom-up deletion operation is strictly advisory.
1500
 *
1501
 * Note: Caller must have added deltids entries (i.e. entries that go in
1502
 * delstate's main array) in leaf-page-wise order: page offset number order,
1503
 * TID order among entries taken from the same posting list tuple (tiebreak on
1504
 * TID).  This order is convenient to work with here.
1505
 *
1506
 * Note: We also rely on the id field of each deltids element "capturing" this
1507
 * original leaf-page-wise order.  That is, we expect to be able to get back
1508
 * to the original leaf-page-wise order just by sorting deltids on the id
1509
 * field (tableam will sort deltids for its own reasons, so we'll need to put
1510
 * it back in leaf-page-wise order afterwards).
1511
 */
1512
void
1513
_bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel,
1514
              TM_IndexDeleteOp *delstate)
1515
0
{
1516
0
  Page    page = BufferGetPage(buf);
1517
0
  TransactionId snapshotConflictHorizon;
1518
0
  bool    isCatalogRel;
1519
0
  OffsetNumber postingidxoffnum = InvalidOffsetNumber;
1520
0
  int     ndeletable = 0,
1521
0
        nupdatable = 0;
1522
0
  OffsetNumber deletable[MaxIndexTuplesPerPage];
1523
0
  BTVacuumPosting updatable[MaxIndexTuplesPerPage];
1524
1525
  /* Use tableam interface to determine which tuples to delete first */
1526
0
  snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);
1527
0
  isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);
1528
1529
  /* Should not WAL-log snapshotConflictHorizon unless it's required */
1530
0
  if (!XLogStandbyInfoActive())
1531
0
    snapshotConflictHorizon = InvalidTransactionId;
1532
1533
  /*
1534
   * Construct a leaf-page-wise description of what _bt_delitems_delete()
1535
   * needs to do to physically delete index tuples from the page.
1536
   *
1537
   * Must sort deltids array to restore leaf-page-wise order (original order
1538
   * before call to tableam).  This is the order that the loop expects.
1539
   *
1540
   * Note that deltids array might be a lot smaller now.  It might even have
1541
   * no entries at all (with bottom-up deletion caller), in which case there
1542
   * is nothing left to do.
1543
   */
1544
0
  qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),
1545
0
      _bt_delitems_cmp);
1546
0
  if (delstate->ndeltids == 0)
1547
0
  {
1548
0
    Assert(delstate->bottomup);
1549
0
    return;
1550
0
  }
1551
1552
  /* We definitely have to delete at least one index tuple (or one TID) */
1553
0
  for (int i = 0; i < delstate->ndeltids; i++)
1554
0
  {
1555
0
    TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;
1556
0
    OffsetNumber idxoffnum = dstatus->idxoffnum;
1557
0
    ItemId    itemid = PageGetItemId(page, idxoffnum);
1558
0
    IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
1559
0
    int     nestedi,
1560
0
          nitem;
1561
0
    BTVacuumPosting vacposting;
1562
1563
0
    Assert(OffsetNumberIsValid(idxoffnum));
1564
1565
0
    if (idxoffnum == postingidxoffnum)
1566
0
    {
1567
      /*
1568
       * This deltid entry is a TID from a posting list tuple that has
1569
       * already been completely processed
1570
       */
1571
0
      Assert(BTreeTupleIsPosting(itup));
1572
0
      Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
1573
0
                    &delstate->deltids[i].tid) < 0);
1574
0
      Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),
1575
0
                    &delstate->deltids[i].tid) >= 0);
1576
0
      continue;
1577
0
    }
1578
1579
0
    if (!BTreeTupleIsPosting(itup))
1580
0
    {
1581
      /* Plain non-pivot tuple */
1582
0
      Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));
1583
0
      if (dstatus->knowndeletable)
1584
0
        deletable[ndeletable++] = idxoffnum;
1585
0
      continue;
1586
0
    }
1587
1588
    /*
1589
     * itup is a posting list tuple whose lowest deltids entry (which may
1590
     * or may not be for the first TID from itup) is considered here now.
1591
     * We should process all of the deltids entries for the posting list
1592
     * together now, though (not just the lowest).  Remember to skip over
1593
     * later itup-related entries during later iterations of outermost
1594
     * loop.
1595
     */
1596
0
    postingidxoffnum = idxoffnum; /* Remember work in outermost loop */
1597
0
    nestedi = i;      /* Initialize for first itup deltids entry */
1598
0
    vacposting = NULL;   /* Describes final action for itup */
1599
0
    nitem = BTreeTupleGetNPosting(itup);
1600
0
    for (int p = 0; p < nitem; p++)
1601
0
    {
1602
0
      ItemPointer ptid = BTreeTupleGetPostingN(itup, p);
1603
0
      int     ptidcmp = -1;
1604
1605
      /*
1606
       * This nested loop reuses work across ptid TIDs taken from itup.
1607
       * We take advantage of the fact that itup's TIDs and deltids
1608
       * entries (within a single itup/posting list grouping) must both
1609
       * be in ascending TID order.
1610
       */
1611
0
      for (; nestedi < delstate->ndeltids; nestedi++)
1612
0
      {
1613
0
        TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];
1614
0
        TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);
1615
1616
        /* Stop once we get past all itup related deltids entries */
1617
0
        Assert(tdstatus->idxoffnum >= idxoffnum);
1618
0
        if (tdstatus->idxoffnum != idxoffnum)
1619
0
          break;
1620
1621
        /* Skip past non-deletable itup related entries up front */
1622
0
        if (!tdstatus->knowndeletable)
1623
0
          continue;
1624
1625
        /* Entry is first partial ptid match (or an exact match)? */
1626
0
        ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);
1627
0
        if (ptidcmp >= 0)
1628
0
        {
1629
          /* Greater than or equal (partial or exact) match... */
1630
0
          break;
1631
0
        }
1632
0
      }
1633
1634
      /* ...exact ptid match to a deletable deltids entry? */
1635
0
      if (ptidcmp != 0)
1636
0
        continue;
1637
1638
      /* Exact match for deletable deltids entry -- ptid gets deleted */
1639
0
      if (vacposting == NULL)
1640
0
      {
1641
0
        vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
1642
0
                  nitem * sizeof(uint16));
1643
0
        vacposting->itup = itup;
1644
0
        vacposting->updatedoffset = idxoffnum;
1645
0
        vacposting->ndeletedtids = 0;
1646
0
      }
1647
0
      vacposting->deletetids[vacposting->ndeletedtids++] = p;
1648
0
    }
1649
1650
    /* Final decision on itup, a posting list tuple */
1651
1652
0
    if (vacposting == NULL)
1653
0
    {
1654
      /* No TIDs to delete from itup -- do nothing */
1655
0
    }
1656
0
    else if (vacposting->ndeletedtids == nitem)
1657
0
    {
1658
      /* Straight delete of itup (to delete all TIDs) */
1659
0
      deletable[ndeletable++] = idxoffnum;
1660
      /* Turns out we won't need granular information */
1661
0
      pfree(vacposting);
1662
0
    }
1663
0
    else
1664
0
    {
1665
      /* Delete some (but not all) TIDs from itup */
1666
0
      Assert(vacposting->ndeletedtids > 0 &&
1667
0
           vacposting->ndeletedtids < nitem);
1668
0
      updatable[nupdatable++] = vacposting;
1669
0
    }
1670
0
  }
1671
1672
  /* Physically delete tuples (or TIDs) using deletable (or updatable) */
1673
0
  _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,
1674
0
            deletable, ndeletable, updatable, nupdatable);
1675
1676
  /* be tidy */
1677
0
  for (int i = 0; i < nupdatable; i++)
1678
0
    pfree(updatable[i]);
1679
0
}
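
The posting-list handling above is essentially a two-pointer merge: the tuple's TIDs and the deltids entries for that tuple are both in ascending order, so one forward walk over each finds every exact, known-deletable match without rescanning.  The following self-contained sketch shows the same idea over plain integer arrays (merge_deletable and the sample data are illustrative only, and the real loop additionally skips non-deletable entries):

#include <stdio.h>

/*
 * Given two ascending arrays, report which elements of 'posting' also
 * appear in 'deletable'.  Mirrors the lockstep walk used for posting
 * list tuples: neither array is rescanned from the start.
 */
static int
merge_deletable(const int *posting, int nposting,
                const int *deletable, int ndeletable,
                int *hits)
{
    int         nhits = 0;
    int         j = 0;

    for (int p = 0; p < nposting; p++)
    {
        /* advance past deletable entries that sort before posting[p] */
        while (j < ndeletable && deletable[j] < posting[p])
            j++;
        if (j < ndeletable && deletable[j] == posting[p])
            hits[nhits++] = p;  /* exact match: slot p is deletable */
    }
    return nhits;
}

int
main(void)
{
    int         posting[] = {3, 7, 9, 12, 15};
    int         deletable[] = {7, 12, 20};
    int         hits[5];
    int         nhits = merge_deletable(posting, 5, deletable, 3, hits);

    for (int i = 0; i < nhits; i++)
        printf("delete posting slot %d\n", hits[i]);
    return 0;
}
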
1680
1681
/*
1682
 * Check that leftsib page (the btpo_prev of target page) is not marked with
1683
 * INCOMPLETE_SPLIT flag.  Used during page deletion.
1684
 *
1685
 * Returning true indicates that page flag is set in leftsib (which is
1686
 * definitely still the left sibling of target).  When that happens, the
1687
 * target doesn't have a downlink in parent, and the page deletion algorithm
1688
 * isn't prepared to handle that.  Deletion of the target page (or the whole
1689
 * subtree that contains the target page) cannot take place.
1690
 *
1691
 * Caller should not have a lock on the target page itself, since pages on the
1692
 * same level must always be locked left to right to avoid deadlocks.
1693
 */
1694
static bool
1695
_bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)
1696
0
{
1697
0
  Buffer    buf;
1698
0
  Page    page;
1699
0
  BTPageOpaque opaque;
1700
0
  bool    result;
1701
1702
  /* Easy case: No left sibling */
1703
0
  if (leftsib == P_NONE)
1704
0
    return false;
1705
1706
0
  buf = _bt_getbuf(rel, leftsib, BT_READ);
1707
0
  page = BufferGetPage(buf);
1708
0
  opaque = BTPageGetOpaque(page);
1709
1710
  /*
1711
   * If the left sibling was concurrently split, so that its next-pointer
1712
   * doesn't point to the current page anymore, the split that created
1713
   * target must be completed.  Caller can reasonably expect that there will
1714
   * be a downlink to the target page that it can relocate using its stack.
1715
   * (We don't allow splitting an incompletely split page again until the
1716
   * previous split has been completed.)
1717
   */
1718
0
  result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));
1719
0
  _bt_relbuf(rel, buf);
1720
1721
0
  return result;
1722
0
}
1723
1724
/*
1725
 * Check that leafrightsib page (the btpo_next of target leaf page) is not
1726
 * marked with ISHALFDEAD flag.  Used during page deletion.
1727
 *
1728
 * Returning true indicates that page flag is set in leafrightsib, so page
1729
 * deletion cannot go ahead.  Our caller is not prepared to deal with the case
1730
 * where the parent page does not have a pivot tuples whose downlink points to
1731
 * leafrightsib (due to an earlier interrupted VACUUM operation).  It doesn't
1732
 * seem worth going to the trouble of teaching our caller to deal with it.
1733
 * The situation will be resolved after VACUUM finishes the deletion of the
1734
 * half-dead page (when a future VACUUM operation reaches the target page
1735
 * again).
1736
 *
1737
 * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
1738
 * _bt_rightsib_halfdeadflag() is only called for leaf pages, though.  This is
1739
 * okay because of the restriction on deleting pages that are the rightmost
1740
 * page of their parent (i.e. that such deletions can only take place when the
1741
 * entire subtree must be deleted).  The leaf level check made here will apply
1742
 * to a right "cousin" leaf page rather than a simple right sibling leaf page
1743
 * in cases where caller actually goes on to attempt deleting pages that are
1744
 * above the leaf page.  The right cousin leaf page is representative of the
1745
 * left edge of the subtree to the right of the to-be-deleted subtree as a
1746
 * whole, which is exactly the condition that our caller cares about.
1747
 * (Besides, internal pages are never marked half-dead, so it isn't even
1748
 * possible to _directly_ assess if an internal page is part of some other
1749
 * to-be-deleted subtree.)
1750
 */
1751
static bool
1752
_bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)
1753
0
{
1754
0
  Buffer    buf;
1755
0
  Page    page;
1756
0
  BTPageOpaque opaque;
1757
0
  bool    result;
1758
1759
0
  Assert(leafrightsib != P_NONE);
1760
1761
0
  buf = _bt_getbuf(rel, leafrightsib, BT_READ);
1762
0
  page = BufferGetPage(buf);
1763
0
  opaque = BTPageGetOpaque(page);
1764
1765
0
  Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));
1766
0
  result = P_ISHALFDEAD(opaque);
1767
0
  _bt_relbuf(rel, buf);
1768
1769
0
  return result;
1770
0
}
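
Both sibling checks reduce to the same shape: pin and read-lock the sibling, test one bit of its status word, release, and report.  A toy version of that flag test is sketched below; ToyPageOpaque and the TOY_* flag values are invented for illustration and are not the nbtree definitions.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Made-up status flags in the spirit of BTP_HALF_DEAD and friends */
#define TOY_HALF_DEAD          (1 << 0)
#define TOY_INCOMPLETE_SPLIT   (1 << 1)

typedef struct
{
    uint32_t    next;       /* right sibling block number */
    uint16_t    flags;      /* status bits */
} ToyPageOpaque;

/*
 * Report whether 'leftsib' still points at 'target' and is flagged as the
 * left half of an unfinished split -- the condition that blocks deletion.
 */
static bool
leftsib_splitflag(const ToyPageOpaque *leftsib, uint32_t target)
{
    return leftsib->next == target &&
        (leftsib->flags & TOY_INCOMPLETE_SPLIT) != 0;
}

int
main(void)
{
    ToyPageOpaque left = {.next = 7, .flags = TOY_INCOMPLETE_SPLIT};

    printf("deletion blocked: %s\n",
           leftsib_splitflag(&left, 7) ? "yes" : "no");
    return 0;
}
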
1771
1772
/*
1773
 * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
1774
 *
1775
 * This action unlinks the leaf page from the b-tree structure, removing all
1776
 * pointers leading to it --- but not touching its own left and right links.
1777
 * The page cannot be physically reclaimed right away, since other processes
1778
 * may currently be trying to follow links leading to the page; they have to
1779
 * be allowed to use its right-link to recover.  See nbtree/README.
1780
 *
1781
 * On entry, the target buffer must be pinned and locked (either read or write
1782
 * lock is OK).  The page must be an empty leaf page, which may be half-dead
1783
 * already (a half-dead page should only be passed to us when an earlier
1784
 * VACUUM operation was interrupted, though).  Note in particular that caller
1785
 * should never pass a buffer containing an existing deleted page here.  The
1786
 * lock and pin on caller's buffer will be dropped before we return.
1787
 *
1788
 * Maintains bulk delete stats for caller, which are taken from vstate.  We
1789
 * need to cooperate closely with caller here so that whole VACUUM operation
1790
 * reliably avoids any double counting of subsidiary-to-leafbuf pages that we
1791
 * delete in passing.  If such pages happen to be from a block number that is
1792
 * ahead of the current scanblkno position, then caller is expected to count
1793
 * them directly later on.  It's simpler for us to understand caller's
1794
 * requirements than it would be for caller to understand when or how a
1795
 * deleted page became deleted after the fact.
1796
 *
1797
 * NOTE: this leaks memory.  Rather than trying to clean up everything
1798
 * carefully, it's better to run it in a temp context that can be reset
1799
 * frequently.
1800
 */
1801
void
1802
_bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)
1803
0
{
1804
0
  BlockNumber rightsib;
1805
0
  bool    rightsib_empty;
1806
0
  Page    page;
1807
0
  BTPageOpaque opaque;
1808
1809
  /*
1810
   * Save original leafbuf block number from caller.  Only deleted blocks
1811
   * that are <= scanblkno are added to bulk delete stat's pages_deleted
1812
   * count.
1813
   */
1814
0
  BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);
1815
1816
  /*
1817
   * "stack" is a search stack leading (approximately) to the target page.
1818
   * It is initially NULL, but when iterating, we keep it to avoid
1819
   * duplicated search effort.
1820
   *
1821
   * Also, when "stack" is not NULL, we have already checked that the
1822
   * current page is not the right half of an incomplete split, i.e. the
1823
   * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1824
   * when the current target page is to the right of caller's initial page
1825
   * (the scanblkno page).
1826
   */
1827
0
  BTStack   stack = NULL;
1828
1829
0
  for (;;)
1830
0
  {
1831
0
    page = BufferGetPage(leafbuf);
1832
0
    opaque = BTPageGetOpaque(page);
1833
1834
    /*
1835
     * Internal pages are never deleted directly, only as part of deleting
1836
     * the whole subtree all the way down to leaf level.
1837
     *
1838
     * Also check for deleted pages here.  Caller never passes us a fully
1839
     * deleted page.  Only VACUUM can delete pages, so there can't have
1840
     * been a concurrent deletion.  Assume that we reached any deleted
1841
     * page encountered here by following a sibling link, and that the
1842
     * index is corrupt.
1843
     */
1844
0
    Assert(!P_ISDELETED(opaque));
1845
0
    if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
1846
0
    {
1847
      /*
1848
       * Pre-9.4 page deletion only marked internal pages as half-dead,
1849
       * but now we only use that flag on leaf pages. The old algorithm
1850
       * was never supposed to leave half-dead pages in the tree; it was
1851
       * just a transient state, but it was nevertheless possible in
1852
       * error scenarios. We don't know how to deal with them here. They
1853
       * are harmless as far as searches are concerned, but inserts
1854
       * into the deleted keyspace could add out-of-order downlinks in
1855
       * the upper levels. Log a notice, hopefully the admin will notice
1856
       * and reindex.
1857
       */
1858
0
      if (P_ISHALFDEAD(opaque))
1859
0
        ereport(LOG,
1860
0
            (errcode(ERRCODE_INDEX_CORRUPTED),
1861
0
             errmsg("index \"%s\" contains a half-dead internal page",
1862
0
                RelationGetRelationName(rel)),
1863
0
             errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
1864
1865
0
      if (P_ISDELETED(opaque))
1866
0
        ereport(LOG,
1867
0
            (errcode(ERRCODE_INDEX_CORRUPTED),
1868
0
             errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",
1869
0
                     BufferGetBlockNumber(leafbuf),
1870
0
                     scanblkno,
1871
0
                     RelationGetRelationName(rel))));
1872
1873
0
      _bt_relbuf(rel, leafbuf);
1874
0
      return;
1875
0
    }
1876
1877
    /*
1878
     * We can never delete rightmost pages nor root pages.  While at it,
1879
     * check that page is empty, since it's possible that the leafbuf page
1880
     * was empty a moment ago, but has since had some inserts.
1881
     *
1882
     * To keep the algorithm simple, we also never delete an incompletely
1883
     * split page (they should be rare enough that this doesn't make any
1884
     * meaningful difference to disk usage):
1885
     *
1886
     * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1887
     * left half of an incomplete split, but ensuring that it's not the
1888
     * right half is more complicated.  For that, we have to check that
1889
     * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1890
     * _bt_leftsib_splitflag().  On the first iteration, we temporarily
1891
     * release the lock on scanblkno/leafbuf, check the left sibling, and
1892
     * construct a search stack to scanblkno.  On subsequent iterations,
1893
     * we know we stepped right from a page that passed these tests, so
1894
     * it's OK.
1895
     */
1896
0
    if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
1897
0
      P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
1898
0
      P_INCOMPLETE_SPLIT(opaque))
1899
0
    {
1900
      /* Should never fail to delete a half-dead page */
1901
0
      Assert(!P_ISHALFDEAD(opaque));
1902
1903
0
      _bt_relbuf(rel, leafbuf);
1904
0
      return;
1905
0
    }
1906
1907
    /*
1908
     * First, remove downlink pointing to the page (or a parent of the
1909
     * page, if we are going to delete a taller subtree), and mark the
1910
     * leafbuf page half-dead
1911
     */
1912
0
    if (!P_ISHALFDEAD(opaque))
1913
0
    {
1914
      /*
1915
       * We need an approximate pointer to the page's parent page.  We
1916
       * use a variant of the standard search mechanism to search for
1917
       * the page's high key; this will give us a link to either the
1918
       * current parent or someplace to its left (if there are multiple
1919
       * equal high keys, which is possible with !heapkeyspace indexes).
1920
       *
1921
       * Also check if this is the right-half of an incomplete split
1922
       * (see comment above).
1923
       */
1924
0
      if (!stack)
1925
0
      {
1926
0
        BTScanInsert itup_key;
1927
0
        ItemId    itemid;
1928
0
        IndexTuple  targetkey;
1929
0
        BlockNumber leftsib,
1930
0
              leafblkno;
1931
0
        Buffer    sleafbuf;
1932
1933
0
        itemid = PageGetItemId(page, P_HIKEY);
1934
0
        targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
1935
1936
0
        leftsib = opaque->btpo_prev;
1937
0
        leafblkno = BufferGetBlockNumber(leafbuf);
1938
1939
        /*
1940
         * To avoid deadlocks, we'd better drop the leaf page lock
1941
         * before going further.
1942
         */
1943
0
        _bt_unlockbuf(rel, leafbuf);
1944
1945
        /*
1946
         * Check that the left sibling of leafbuf (if any) is not
1947
         * marked with INCOMPLETE_SPLIT flag before proceeding
1948
         */
1949
0
        Assert(leafblkno == scanblkno);
1950
0
        if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))
1951
0
        {
1952
0
          ReleaseBuffer(leafbuf);
1953
0
          return;
1954
0
        }
1955
1956
        /*
1957
         * We need an insertion scan key, so build one.
1958
         *
1959
         * _bt_search searches for the leaf page that contains any
1960
         * matching non-pivot tuples, but we need it to "search" for
1961
         * the high key pivot from the page that we're set to delete.
1962
         * Compensate for the mismatch by having _bt_search locate the
1963
         * last position < equal-to-untruncated-prefix non-pivots.
1964
         */
1965
0
        itup_key = _bt_mkscankey(rel, targetkey);
1966
1967
        /* Set up a BTLessStrategyNumber-like insertion scan key */
1968
0
        itup_key->nextkey = false;
1969
0
        itup_key->backward = true;
1970
0
        stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ);
1971
        /* won't need a second lock or pin on leafbuf */
1972
0
        _bt_relbuf(rel, sleafbuf);
1973
1974
        /*
1975
         * Re-lock the leaf page, and start over to use our stack
1976
         * within _bt_mark_page_halfdead.  We must do it that way
1977
         * because it's possible that leafbuf can no longer be
1978
         * deleted.  We need to recheck.
1979
         *
1980
         * Note: We can't simply hold on to the sleafbuf lock instead,
1981
         * because it's barely possible that sleafbuf is not the same
1982
         * page as leafbuf.  This happens when leafbuf split after our
1983
         * original lock was dropped, but before _bt_search finished
1984
         * its descent.  We rely on the assumption that we'll find
1985
         * leafbuf isn't safe to delete anymore in this scenario.
1986
         * (Page deletion can cope with the stack being to the left of
1987
         * leafbuf, but not to the right of leafbuf.)
1988
         */
1989
0
        _bt_lockbuf(rel, leafbuf, BT_WRITE);
1990
0
        continue;
1991
0
      }
1992
1993
      /*
1994
       * See if it's safe to delete the leaf page, and determine how
1995
       * many parent/internal pages above the leaf level will be
1996
       * deleted.  If it's safe then _bt_mark_page_halfdead will also
1997
       * perform the first phase of deletion, which includes marking the
1998
       * leafbuf page half-dead.
1999
       */
2000
0
      Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
2001
0
      if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf,
2002
0
                    stack))
2003
0
      {
2004
0
        _bt_relbuf(rel, leafbuf);
2005
0
        return;
2006
0
      }
2007
0
    }
2008
2009
    /*
2010
     * Then unlink it from its siblings.  Each call to
2011
     * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,
2012
     * making it shallower.  Iterate until the leafbuf page is deleted.
2013
     */
2014
0
    rightsib_empty = false;
2015
0
    Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
2016
0
    while (P_ISHALFDEAD(opaque))
2017
0
    {
2018
      /* Check for interrupts in _bt_unlink_halfdead_page */
2019
0
      if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
2020
0
                      &rightsib_empty, vstate))
2021
0
      {
2022
        /*
2023
         * _bt_unlink_halfdead_page should never fail, since we
2024
         * established that deletion is generally safe in
2025
         * _bt_mark_page_halfdead -- index must be corrupt.
2026
         *
2027
         * Note that _bt_unlink_halfdead_page already released the
2028
         * lock and pin on leafbuf for us.
2029
         */
2030
0
        Assert(false);
2031
0
        return;
2032
0
      }
2033
0
    }
2034
2035
0
    Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
2036
2037
0
    rightsib = opaque->btpo_next;
2038
2039
0
    _bt_relbuf(rel, leafbuf);
2040
2041
    /*
2042
     * Check here, as calling loops will have locks held, preventing
2043
     * interrupts from being processed.
2044
     */
2045
0
    CHECK_FOR_INTERRUPTS();
2046
2047
    /*
2048
     * The page has now been deleted. If its right sibling is completely
2049
     * empty, it's possible that the reason we haven't deleted it earlier
2050
     * is that it was the rightmost child of the parent. Now that we
2051
     * removed the downlink for this page, the right sibling might now be
2052
     * the only child of the parent, and could be removed. It would be
2053
     * picked up by the next vacuum anyway, but might as well try to
2054
     * remove it now, so loop back to process the right sibling.
2055
     *
2056
     * Note: This relies on the assumption that _bt_getstackbuf() will be
2057
     * able to reuse our original descent stack with a different child
2058
     * block (provided that the child block is to the right of the
2059
     * original leaf page reached by _bt_search()). It will even update
2060
     * the descent stack each time we loop around, avoiding repeated work.
2061
     */
2062
0
    if (!rightsib_empty)
2063
0
      break;
2064
2065
0
    leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2066
0
  }
2067
0
}
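
Stripped of locking, WAL, and corruption handling, _bt_pagedel is a short driver loop: make the leaf half-dead if it is not already (stage one), call the unlink step until the leaf itself is deleted (stage two), then possibly loop around to an empty right sibling.  The sketch below models only that control flow; ToyPage, mark_halfdead, and unlink_step are invented names, and the toy "subtree" is always a single page.

#include <stdbool.h>
#include <stdio.h>

typedef enum { PAGE_LIVE, PAGE_HALFDEAD, PAGE_DELETED } ToyState;

typedef struct
{
    ToyState    state;
    int         rightsib;   /* right sibling block number, -1 if rightmost */
    bool        empty;      /* page holds no items, so it may be deletable */
} ToyPage;

/* First stage: refuse rightmost or non-empty pages, else mark half-dead */
static bool
mark_halfdead(ToyPage *p)
{
    if (p->rightsib == -1 || !p->empty)
        return false;
    p->state = PAGE_HALFDEAD;
    return true;
}

/* Second stage, one step: the toy "subtree" is just the page itself */
static bool
unlink_step(ToyPage *p)
{
    p->state = PAGE_DELETED;
    return true;
}

static void
pagedel(ToyPage *pages, int blkno)
{
    for (;;)
    {
        ToyPage    *p = &pages[blkno];

        if (p->state != PAGE_HALFDEAD && !mark_halfdead(p))
            return;             /* not safe to delete; give up quietly */

        while (p->state == PAGE_HALFDEAD)
        {
            if (!unlink_step(p))
                return;         /* "shouldn't happen" corruption bail-out */
        }

        printf("deleted block %d\n", blkno);

        /* An empty right sibling may be deletable now; loop around to it */
        if (!pages[p->rightsib].empty)
            break;
        blkno = p->rightsib;
    }
}

int
main(void)
{
    ToyPage     pages[3] = {
        {PAGE_LIVE, 1, true},   /* empty leaf, deleted first */
        {PAGE_LIVE, 2, true},   /* its empty right sibling, deleted next */
        {PAGE_LIVE, -1, false}, /* rightmost page, never deleted */
    };

    pagedel(pages, 0);
    return 0;
}
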
2068
2069
/*
2070
 * First stage of page deletion.
2071
 *
2072
 * Establish the height of the to-be-deleted subtree with leafbuf at its
2073
 * lowest level, remove the downlink to the subtree, and mark leafbuf
2074
 * half-dead.  The final to-be-deleted subtree is usually just leafbuf itself,
2075
 * but may include additional internal pages (at most one per level of the
2076
 * tree below the root).
2077
 *
2078
 * Caller must pass a valid heaprel, since it's just about possible that our
2079
 * call to _bt_lock_subtree_parent will need to allocate a new index page to
2080
 * complete a page split.  Every call to _bt_allocbuf needs to pass a heaprel.
2081
 *
2082
 * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is
2083
 * the rightmost child of its parent (and parent has more than one downlink).
2084
 * Returns 'true' when the first stage of page deletion completed
2085
 * successfully.
2086
 */
2087
static bool
2088
_bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf,
2089
             BTStack stack)
2090
0
{
2091
0
  BlockNumber leafblkno;
2092
0
  BlockNumber leafrightsib;
2093
0
  BlockNumber topparent;
2094
0
  BlockNumber topparentrightsib;
2095
0
  ItemId    itemid;
2096
0
  Page    page;
2097
0
  BTPageOpaque opaque;
2098
0
  Buffer    subtreeparent;
2099
0
  OffsetNumber poffset;
2100
0
  OffsetNumber nextoffset;
2101
0
  IndexTuple  itup;
2102
0
  IndexTupleData trunctuple;
2103
2104
0
  page = BufferGetPage(leafbuf);
2105
0
  opaque = BTPageGetOpaque(page);
2106
2107
0
  Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&
2108
0
       P_ISLEAF(opaque) && !P_IGNORE(opaque) &&
2109
0
       P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2110
0
  Assert(heaprel != NULL);
2111
2112
  /*
2113
   * Save info about the leaf page.
2114
   */
2115
0
  leafblkno = BufferGetBlockNumber(leafbuf);
2116
0
  leafrightsib = opaque->btpo_next;
2117
2118
  /*
2119
   * Before attempting to lock the parent page, check that the right sibling
2120
   * is not in half-dead state.  A half-dead right sibling would have no
2121
   * downlink in the parent, which would be highly confusing later when we
2122
   * delete the downlink.  It would fail the "right sibling of target page
2123
   * is also the next child in parent page" cross-check below.
2124
   */
2125
0
  if (_bt_rightsib_halfdeadflag(rel, leafrightsib))
2126
0
  {
2127
0
    elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",
2128
0
       leafblkno, leafrightsib);
2129
0
    return false;
2130
0
  }
2131
2132
  /*
2133
   * We cannot delete a page that is the rightmost child of its immediate
2134
   * parent, unless it is the only child --- in which case the parent has to
2135
   * be deleted too, and the same condition applies recursively to it. We
2136
   * have to check this condition all the way up before trying to delete,
2137
   * and lock the parent of the root of the to-be-deleted subtree (the
2138
   * "subtree parent").  _bt_lock_subtree_parent() locks the subtree parent
2139
   * for us.  We remove the downlink to the "top parent" page (subtree root
2140
   * page) from the subtree parent page below.
2141
   *
2142
   * Initialize topparent to be leafbuf page now.  The final to-be-deleted
2143
   * subtree is often a degenerate one page subtree consisting only of the
2144
   * leafbuf page.  When that happens, the leafbuf page is the final subtree
2145
   * root page/top parent page.
2146
   */
2147
0
  topparent = leafblkno;
2148
0
  topparentrightsib = leafrightsib;
2149
0
  if (!_bt_lock_subtree_parent(rel, heaprel, leafblkno, stack,
2150
0
                 &subtreeparent, &poffset,
2151
0
                 &topparent, &topparentrightsib))
2152
0
    return false;
2153
2154
0
  page = BufferGetPage(subtreeparent);
2155
0
  opaque = BTPageGetOpaque(page);
2156
2157
#ifdef USE_ASSERT_CHECKING
2158
2159
  /*
2160
   * This is just an assertion because _bt_lock_subtree_parent should have
2161
   * guaranteed tuple has the expected contents
2162
   */
2163
  itemid = PageGetItemId(page, poffset);
2164
  itup = (IndexTuple) PageGetItem(page, itemid);
2165
  Assert(BTreeTupleGetDownLink(itup) == topparent);
2166
#endif
2167
2168
0
  nextoffset = OffsetNumberNext(poffset);
2169
0
  itemid = PageGetItemId(page, nextoffset);
2170
0
  itup = (IndexTuple) PageGetItem(page, itemid);
2171
2172
  /*
2173
   * Check that the parent-page index items we're about to delete/overwrite
2174
   * in subtree parent page contain what we expect.  This can fail if the
2175
   * index has become corrupt for some reason.  When that happens we back
2176
   * out of deletion of the leafbuf subtree.  (This is just like the case
2177
   * where _bt_lock_subtree_parent() cannot "re-find" leafbuf's downlink.)
2178
   */
2179
0
  if (BTreeTupleGetDownLink(itup) != topparentrightsib)
2180
0
  {
2181
0
    ereport(LOG,
2182
0
        (errcode(ERRCODE_INDEX_CORRUPTED),
2183
0
         errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",
2184
0
                 topparentrightsib, topparent,
2185
0
                 BTreeTupleGetDownLink(itup),
2186
0
                 BufferGetBlockNumber(subtreeparent),
2187
0
                 RelationGetRelationName(rel))));
2188
2189
0
    _bt_relbuf(rel, subtreeparent);
2190
0
    Assert(false);
2191
0
    return false;
2192
0
  }
2193
2194
  /*
2195
   * Any insert which would have gone on the leaf block will now go to its
2196
   * right sibling.  In other words, the key space moves right.
2197
   */
2198
0
  PredicateLockPageCombine(rel, leafblkno, leafrightsib);
2199
2200
  /* No ereport(ERROR) until changes are logged */
2201
0
  START_CRIT_SECTION();
2202
2203
  /*
2204
   * Update parent of subtree.  We want to delete the downlink to the top
2205
   * parent page/root of the subtree, and the *following* key.  Easiest way
2206
   * is to copy the right sibling's downlink over the downlink that points
2207
   * to top parent page, and then delete the right sibling's original pivot
2208
   * tuple.
2209
   *
2210
   * Lanin and Shasha make the key space move left when deleting a page,
2211
   * whereas the key space moves right here.  That's why we cannot simply
2212
   * delete the pivot tuple with the downlink to the top parent page.  See
2213
   * nbtree/README.
2214
   */
2215
0
  page = BufferGetPage(subtreeparent);
2216
0
  opaque = BTPageGetOpaque(page);
2217
2218
0
  itemid = PageGetItemId(page, poffset);
2219
0
  itup = (IndexTuple) PageGetItem(page, itemid);
2220
0
  BTreeTupleSetDownLink(itup, topparentrightsib);
2221
2222
0
  nextoffset = OffsetNumberNext(poffset);
2223
0
  PageIndexTupleDelete(page, nextoffset);
2224
2225
  /*
2226
   * Mark the leaf page as half-dead, and stamp it with a link to the top
2227
   * parent page.  When the leaf page is also the top parent page, the link
2228
   * is set to InvalidBlockNumber.
2229
   */
2230
0
  page = BufferGetPage(leafbuf);
2231
0
  opaque = BTPageGetOpaque(page);
2232
0
  opaque->btpo_flags |= BTP_HALF_DEAD;
2233
2234
0
  Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);
2235
0
  MemSet(&trunctuple, 0, sizeof(IndexTupleData));
2236
0
  trunctuple.t_info = sizeof(IndexTupleData);
2237
0
  if (topparent != leafblkno)
2238
0
    BTreeTupleSetTopParent(&trunctuple, topparent);
2239
0
  else
2240
0
    BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);
2241
2242
0
  if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,
2243
0
                 IndexTupleSize(&trunctuple)))
2244
0
    elog(ERROR, "could not overwrite high key in half-dead page");
2245
2246
  /* Must mark buffers dirty before XLogInsert */
2247
0
  MarkBufferDirty(subtreeparent);
2248
0
  MarkBufferDirty(leafbuf);
2249
2250
  /* XLOG stuff */
2251
0
  if (RelationNeedsWAL(rel))
2252
0
  {
2253
0
    xl_btree_mark_page_halfdead xlrec;
2254
0
    XLogRecPtr  recptr;
2255
2256
0
    xlrec.poffset = poffset;
2257
0
    xlrec.leafblk = leafblkno;
2258
0
    if (topparent != leafblkno)
2259
0
      xlrec.topparent = topparent;
2260
0
    else
2261
0
      xlrec.topparent = InvalidBlockNumber;
2262
2263
0
    XLogBeginInsert();
2264
0
    XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);
2265
0
    XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);
2266
2267
0
    page = BufferGetPage(leafbuf);
2268
0
    opaque = BTPageGetOpaque(page);
2269
0
    xlrec.leftblk = opaque->btpo_prev;
2270
0
    xlrec.rightblk = opaque->btpo_next;
2271
2272
0
    XLogRegisterData(&xlrec, SizeOfBtreeMarkPageHalfDead);
2273
2274
0
    recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);
2275
2276
0
    page = BufferGetPage(subtreeparent);
2277
0
    PageSetLSN(page, recptr);
2278
0
    page = BufferGetPage(leafbuf);
2279
0
    PageSetLSN(page, recptr);
2280
0
  }
2281
2282
0
  END_CRIT_SECTION();
2283
2284
0
  _bt_relbuf(rel, subtreeparent);
2285
0
  return true;
2286
0
}
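
The parent-page edit is the subtle part of stage one: instead of deleting the pivot whose downlink points at the doomed subtree (which would move key space left), the code copies the next pivot's downlink over it and then deletes that next pivot, so key space moves right.  Here is the same manipulation on a flat array of (low key, downlink) pairs; ToyPivot and remove_downlink are illustrative names, and the low-key convention is a simplification of real pivot tuples.

#include <stdio.h>
#include <string.h>

typedef struct
{
    int         lowkey;     /* child holds keys >= lowkey (toy convention) */
    int         downlink;   /* child block number */
} ToyPivot;

/*
 * Remove the downlink at 'poffset' the way stage one does: copy the next
 * pivot's downlink over it, then delete that next pivot.  The separator
 * key at 'poffset' is kept, so the doomed child's key space drains into
 * its right sibling (key space moves right, not left).
 */
static int
remove_downlink(ToyPivot *pivots, int npivots, int poffset)
{
    pivots[poffset].downlink = pivots[poffset + 1].downlink;
    memmove(&pivots[poffset + 1], &pivots[poffset + 2],
            (size_t) (npivots - poffset - 2) * sizeof(ToyPivot));
    return npivots - 1;
}

int
main(void)
{
    /* Parent with four children; the child at offset 1 (block 22) goes */
    ToyPivot    pivots[] = {{0, 11}, {20, 22}, {30, 33}, {40, 44}};
    int         npivots = 4;

    npivots = remove_downlink(pivots, npivots, 1);

    for (int i = 0; i < npivots; i++)
        printf("keys >= %d -> block %d\n", pivots[i].lowkey, pivots[i].downlink);
    return 0;
}
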
2287
2288
/*
2289
 * Second stage of page deletion.
2290
 *
2291
 * Unlinks a single page (in the subtree undergoing deletion) from its
2292
 * siblings.  Also marks the page deleted.
2293
 *
2294
 * To get rid of the whole subtree, including the leaf page itself, call here
2295
 * until the leaf page is deleted.  The original "top parent" established in
2296
 * the first stage of deletion is deleted in the first call here, while the
2297
 * leaf page is deleted in the last call here.  Note that the leaf page itself
2298
 * is often the initial top parent page.
2299
 *
2300
 * Returns 'false' if the page could not be unlinked (shouldn't happen).  If
2301
 * the right sibling of the current target page is empty, *rightsib_empty is
2302
 * set to true, allowing caller to delete the target's right sibling page in
2303
 * passing.  Note that *rightsib_empty is only actually used by caller when
2304
 * target page is leafbuf, following last call here for leafbuf/the subtree
2305
 * containing leafbuf.  (We always set *rightsib_empty for caller, just to be
2306
 * consistent.)
2307
 *
2308
 * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
2309
 * On success exit, we'll be holding pin and write lock.  On failure exit,
2310
 * we'll release both pin and lock before returning (we define it that way
2311
 * to avoid having to reacquire a lock we already released).
2312
 */
2313
static bool
2314
_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
2315
             bool *rightsib_empty, BTVacState *vstate)
2316
0
{
2317
0
  BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
2318
0
  IndexBulkDeleteResult *stats = vstate->stats;
2319
0
  BlockNumber leafleftsib;
2320
0
  BlockNumber leafrightsib;
2321
0
  BlockNumber target;
2322
0
  BlockNumber leftsib;
2323
0
  BlockNumber rightsib;
2324
0
  Buffer    lbuf = InvalidBuffer;
2325
0
  Buffer    buf;
2326
0
  Buffer    rbuf;
2327
0
  Buffer    metabuf = InvalidBuffer;
2328
0
  Page    metapg = NULL;
2329
0
  BTMetaPageData *metad = NULL;
2330
0
  ItemId    itemid;
2331
0
  Page    page;
2332
0
  BTPageOpaque opaque;
2333
0
  FullTransactionId safexid;
2334
0
  bool    rightsib_is_rightmost;
2335
0
  uint32    targetlevel;
2336
0
  IndexTuple  leafhikey;
2337
0
  BlockNumber leaftopparent;
2338
2339
0
  page = BufferGetPage(leafbuf);
2340
0
  opaque = BTPageGetOpaque(page);
2341
2342
0
  Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));
2343
2344
  /*
2345
   * Remember some information about the leaf page.
2346
   */
2347
0
  itemid = PageGetItemId(page, P_HIKEY);
2348
0
  leafhikey = (IndexTuple) PageGetItem(page, itemid);
2349
0
  target = BTreeTupleGetTopParent(leafhikey);
2350
0
  leafleftsib = opaque->btpo_prev;
2351
0
  leafrightsib = opaque->btpo_next;
2352
2353
0
  _bt_unlockbuf(rel, leafbuf);
2354
2355
  /*
2356
   * Check here, as calling loops will have locks held, preventing
2357
   * interrupts from being processed.
2358
   */
2359
0
  CHECK_FOR_INTERRUPTS();
2360
2361
  /* Unlink the current top parent of the subtree */
2362
0
  if (!BlockNumberIsValid(target))
2363
0
  {
2364
    /* Target is leaf page (or leaf page is top parent, if you prefer) */
2365
0
    target = leafblkno;
2366
2367
0
    buf = leafbuf;
2368
0
    leftsib = leafleftsib;
2369
0
    targetlevel = 0;
2370
0
  }
2371
0
  else
2372
0
  {
2373
    /* Target is the internal page taken from leaf's top parent link */
2374
0
    Assert(target != leafblkno);
2375
2376
    /* Fetch the block number of the target's left sibling */
2377
0
    buf = _bt_getbuf(rel, target, BT_READ);
2378
0
    page = BufferGetPage(buf);
2379
0
    opaque = BTPageGetOpaque(page);
2380
0
    leftsib = opaque->btpo_prev;
2381
0
    targetlevel = opaque->btpo_level;
2382
0
    Assert(targetlevel > 0);
2383
2384
    /*
2385
     * To avoid deadlocks, we'd better drop the target page lock before
2386
     * going further.
2387
     */
2388
0
    _bt_unlockbuf(rel, buf);
2389
0
  }
2390
2391
  /*
2392
   * We have to lock the pages we need to modify in the standard order:
2393
   * moving right, then up.  Else we will deadlock against other writers.
2394
   *
2395
   * So, first lock the leaf page, if it's not the target.  Then find and
2396
   * write-lock the current left sibling of the target page.  The sibling
2397
   * that was current a moment ago could have split, so we may have to move
2398
   * right.
2399
   */
2400
0
  if (target != leafblkno)
2401
0
    _bt_lockbuf(rel, leafbuf, BT_WRITE);
2402
0
  if (leftsib != P_NONE)
2403
0
  {
2404
0
    lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2405
0
    page = BufferGetPage(lbuf);
2406
0
    opaque = BTPageGetOpaque(page);
2407
0
    while (P_ISDELETED(opaque) || opaque->btpo_next != target)
2408
0
    {
2409
0
      bool    leftsibvalid = true;
2410
2411
      /*
2412
       * Before we follow the link from the page that was the left
2413
       * sibling mere moments ago, validate its right link.  This
2414
       * reduces the opportunities for loop to fail to ever make any
2415
       * progress in the presence of index corruption.
2416
       *
2417
       * Note: we rely on the assumption that there can only be one
2418
       * vacuum process running at a time (against the same index).
2419
       */
2420
0
      if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||
2421
0
        leftsib == opaque->btpo_next)
2422
0
        leftsibvalid = false;
2423
2424
0
      leftsib = opaque->btpo_next;
2425
0
      _bt_relbuf(rel, lbuf);
2426
2427
0
      if (!leftsibvalid)
2428
0
      {
2429
        /*
2430
         * This is known to fail in the field; sibling link corruption
2431
         * is relatively common.  Press on with vacuuming rather than
2432
         * just throwing an ERROR.
2433
         */
2434
0
        ereport(LOG,
2435
0
            (errcode(ERRCODE_INDEX_CORRUPTED),
2436
0
             errmsg_internal("valid left sibling for deletion target could not be located: "
2437
0
                     "left sibling %u of target %u with leafblkno %u and scanblkno %u on level %u of index \"%s\"",
2438
0
                     leftsib, target, leafblkno, scanblkno,
2439
0
                     targetlevel, RelationGetRelationName(rel))));
2440
2441
        /* Must release all pins and locks on failure exit */
2442
0
        ReleaseBuffer(buf);
2443
0
        if (target != leafblkno)
2444
0
          _bt_relbuf(rel, leafbuf);
2445
2446
0
        return false;
2447
0
      }
2448
2449
0
      CHECK_FOR_INTERRUPTS();
2450
2451
      /* step right one page */
2452
0
      lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
2453
0
      page = BufferGetPage(lbuf);
2454
0
      opaque = BTPageGetOpaque(page);
2455
0
    }
2456
0
  }
2457
0
  else
2458
0
    lbuf = InvalidBuffer;
2459
2460
  /* Next write-lock the target page itself */
2461
0
  _bt_lockbuf(rel, buf, BT_WRITE);
2462
0
  page = BufferGetPage(buf);
2463
0
  opaque = BTPageGetOpaque(page);
2464
2465
  /*
2466
   * Check page is still empty etc, else abandon deletion.  This is just for
2467
   * paranoia's sake; a half-dead page cannot resurrect because there can be
2468
   * only one vacuum process running at a time.
2469
   */
2470
0
  if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
2471
0
    elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",
2472
0
       target, RelationGetRelationName(rel));
2473
2474
0
  if (opaque->btpo_prev != leftsib)
2475
0
    ereport(ERROR,
2476
0
        (errcode(ERRCODE_INDEX_CORRUPTED),
2477
0
         errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",
2478
0
                 leftsib, opaque->btpo_prev, target,
2479
0
                 RelationGetRelationName(rel))));
2480
2481
0
  if (target == leafblkno)
2482
0
  {
2483
0
    if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
2484
0
      !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
2485
0
      elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",
2486
0
         target, RelationGetRelationName(rel));
2487
2488
    /* Leaf page is also target page: don't set leaftopparent */
2489
0
    leaftopparent = InvalidBlockNumber;
2490
0
  }
2491
0
  else
2492
0
  {
2493
0
    IndexTuple  finaldataitem;
2494
2495
0
    if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
2496
0
      P_ISLEAF(opaque))
2497
0
      elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",
2498
0
         targetlevel, target, RelationGetRelationName(rel));
2499
2500
    /* Target is internal: set leaftopparent for next call here...  */
2501
0
    itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
2502
0
    finaldataitem = (IndexTuple) PageGetItem(page, itemid);
2503
0
    leaftopparent = BTreeTupleGetDownLink(finaldataitem);
2504
    /* ...except when it would be a redundant pointer-to-self */
2505
0
    if (leaftopparent == leafblkno)
2506
0
      leaftopparent = InvalidBlockNumber;
2507
0
  }
2508
2509
  /* No leaftopparent for level 0 (leaf page) or level 1 target */
2510
0
  Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);
2511
2512
  /*
2513
   * And next write-lock the (current) right sibling.
2514
   */
2515
0
  rightsib = opaque->btpo_next;
2516
0
  rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
2517
0
  page = BufferGetPage(rbuf);
2518
0
  opaque = BTPageGetOpaque(page);
2519
2520
  /*
2521
   * Validate target's right sibling page.  Its left link must point back to
2522
   * the target page.
2523
   */
2524
0
  if (opaque->btpo_prev != target)
2525
0
  {
2526
    /*
2527
     * This is known to fail in the field; sibling link corruption is
2528
     * relatively common.  Press on with vacuuming rather than just
2529
     * throwing an ERROR (same approach used for left-sibling's-right-link
2530
     * validation check a moment ago).
2531
     */
2532
0
    ereport(LOG,
2533
0
        (errcode(ERRCODE_INDEX_CORRUPTED),
2534
0
         errmsg_internal("right sibling's left-link doesn't match: "
2535
0
                 "right sibling %u of target %u with leafblkno %u "
2536
0
                 "and scanblkno %u spuriously links to non-target %u "
2537
0
                 "on level %u of index \"%s\"",
2538
0
                 rightsib, target, leafblkno,
2539
0
                 scanblkno, opaque->btpo_prev,
2540
0
                 targetlevel, RelationGetRelationName(rel))));
2541
2542
    /* Must release all pins and locks on failure exit */
2543
0
    if (BufferIsValid(lbuf))
2544
0
      _bt_relbuf(rel, lbuf);
2545
0
    _bt_relbuf(rel, rbuf);
2546
0
    _bt_relbuf(rel, buf);
2547
0
    if (target != leafblkno)
2548
0
      _bt_relbuf(rel, leafbuf);
2549
2550
0
    return false;
2551
0
  }
2552
2553
0
  rightsib_is_rightmost = P_RIGHTMOST(opaque);
2554
0
  *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
2555
2556
  /*
2557
   * If we are deleting the next-to-last page on the target's level, then
2558
   * the rightsib is a candidate to become the new fast root. (In theory, it
2559
   * might be possible to push the fast root even further down, but the odds
2560
   * of doing so are slim, and the locking considerations daunting.)
2561
   *
2562
   * We can safely acquire a lock on the metapage here --- see comments for
2563
   * _bt_newlevel().
2564
   */
2565
0
  if (leftsib == P_NONE && rightsib_is_rightmost)
2566
0
  {
2567
0
    page = BufferGetPage(rbuf);
2568
0
    opaque = BTPageGetOpaque(page);
2569
0
    if (P_RIGHTMOST(opaque))
2570
0
    {
2571
      /* rightsib will be the only one left on the level */
2572
0
      metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2573
0
      metapg = BufferGetPage(metabuf);
2574
0
      metad = BTPageGetMeta(metapg);
2575
2576
      /*
2577
       * The expected case here is btm_fastlevel == targetlevel+1; if
2578
       * the fastlevel is <= targetlevel, something is wrong, and we
2579
       * choose to overwrite it to fix it.
2580
       */
2581
0
      if (metad->btm_fastlevel > targetlevel + 1)
2582
0
      {
2583
        /* no update wanted */
2584
0
        _bt_relbuf(rel, metabuf);
2585
0
        metabuf = InvalidBuffer;
2586
0
      }
2587
0
    }
2588
0
  }
2589
2590
  /*
2591
   * Here we begin doing the deletion.
2592
   */
2593
2594
  /* No ereport(ERROR) until changes are logged */
2595
0
  START_CRIT_SECTION();
2596
2597
  /*
2598
   * Update siblings' side-links.  Note the target page's side-links will
2599
   * continue to point to the siblings.  Asserts here are just rechecking
2600
   * things we already verified above.
2601
   */
2602
0
  if (BufferIsValid(lbuf))
2603
0
  {
2604
0
    page = BufferGetPage(lbuf);
2605
0
    opaque = BTPageGetOpaque(page);
2606
0
    Assert(opaque->btpo_next == target);
2607
0
    opaque->btpo_next = rightsib;
2608
0
  }
2609
0
  page = BufferGetPage(rbuf);
2610
0
  opaque = BTPageGetOpaque(page);
2611
0
  Assert(opaque->btpo_prev == target);
2612
0
  opaque->btpo_prev = leftsib;
2613
2614
  /*
2615
   * If we deleted a parent of the targeted leaf page, instead of the leaf
2616
   * itself, update the leaf to point to the next remaining child in the
2617
   * subtree.
2618
   *
2619
   * Note: We rely on the fact that a buffer pin on the leaf page has been
2620
   * held since leafhikey was initialized.  This is safe, though only
2621
   * because the page was already half-dead at that point.  The leaf page
2622
   * cannot have been modified by any other backend during the period when
2623
   * no lock was held.
2624
   */
2625
0
  if (target != leafblkno)
2626
0
    BTreeTupleSetTopParent(leafhikey, leaftopparent);
2627
2628
  /*
2629
   * Mark the page itself deleted.  It can be recycled when all current
2630
   * transactions are gone.  Storing GetTopTransactionId() would work, but
2631
   * we're in VACUUM and would not otherwise have an XID.  Having already
2632
   * updated links to the target, ReadNextFullTransactionId() suffices as an
2633
   * upper bound.  Any scan having retained a now-stale link is advertising
2634
   * in its PGPROC an xmin less than or equal to the value we read here.  It
2635
   * will continue to do so, holding back the xmin horizon, for the duration
2636
   * of that scan.
2637
   */
2638
0
  page = BufferGetPage(buf);
2639
0
  opaque = BTPageGetOpaque(page);
2640
0
  Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
2641
2642
  /*
2643
   * Store upper bound XID that's used to determine when deleted page is no
2644
   * longer needed as a tombstone
2645
   */
2646
0
  safexid = ReadNextFullTransactionId();
2647
0
  BTPageSetDeleted(page, safexid);
2648
0
  opaque->btpo_cycleid = 0;
2649
2650
  /* And update the metapage, if needed */
2651
0
  if (BufferIsValid(metabuf))
2652
0
  {
2653
    /* upgrade metapage if needed */
2654
0
    if (metad->btm_version < BTREE_NOVAC_VERSION)
2655
0
      _bt_upgrademetapage(metapg);
2656
0
    metad->btm_fastroot = rightsib;
2657
0
    metad->btm_fastlevel = targetlevel;
2658
0
    MarkBufferDirty(metabuf);
2659
0
  }
2660
2661
  /* Must mark buffers dirty before XLogInsert */
2662
0
  MarkBufferDirty(rbuf);
2663
0
  MarkBufferDirty(buf);
2664
0
  if (BufferIsValid(lbuf))
2665
0
    MarkBufferDirty(lbuf);
2666
0
  if (target != leafblkno)
2667
0
    MarkBufferDirty(leafbuf);
2668
2669
  /* XLOG stuff */
2670
0
  if (RelationNeedsWAL(rel))
2671
0
  {
2672
0
    xl_btree_unlink_page xlrec;
2673
0
    xl_btree_metadata xlmeta;
2674
0
    uint8   xlinfo;
2675
0
    XLogRecPtr  recptr;
2676
2677
0
    XLogBeginInsert();
2678
2679
0
    XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);
2680
0
    if (BufferIsValid(lbuf))
2681
0
      XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2682
0
    XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);
2683
0
    if (target != leafblkno)
2684
0
      XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
2685
2686
    /* information stored on the target/to-be-unlinked block */
2687
0
    xlrec.leftsib = leftsib;
2688
0
    xlrec.rightsib = rightsib;
2689
0
    xlrec.level = targetlevel;
2690
0
    xlrec.safexid = safexid;
2691
2692
    /* information needed to recreate the leaf block (if not the target) */
2693
0
    xlrec.leafleftsib = leafleftsib;
2694
0
    xlrec.leafrightsib = leafrightsib;
2695
0
    xlrec.leaftopparent = leaftopparent;
2696
2697
0
    XLogRegisterData(&xlrec, SizeOfBtreeUnlinkPage);
2698
2699
0
    if (BufferIsValid(metabuf))
2700
0
    {
2701
0
      XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2702
2703
0
      Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2704
0
      xlmeta.version = metad->btm_version;
2705
0
      xlmeta.root = metad->btm_root;
2706
0
      xlmeta.level = metad->btm_level;
2707
0
      xlmeta.fastroot = metad->btm_fastroot;
2708
0
      xlmeta.fastlevel = metad->btm_fastlevel;
2709
0
      xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2710
0
      xlmeta.allequalimage = metad->btm_allequalimage;
2711
2712
0
      XLogRegisterBufData(4, &xlmeta, sizeof(xl_btree_metadata));
2713
0
      xlinfo = XLOG_BTREE_UNLINK_PAGE_META;
2714
0
    }
2715
0
    else
2716
0
      xlinfo = XLOG_BTREE_UNLINK_PAGE;
2717
2718
0
    recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2719
2720
0
    if (BufferIsValid(metabuf))
2721
0
    {
2722
0
      PageSetLSN(metapg, recptr);
2723
0
    }
2724
0
    page = BufferGetPage(rbuf);
2725
0
    PageSetLSN(page, recptr);
2726
0
    page = BufferGetPage(buf);
2727
0
    PageSetLSN(page, recptr);
2728
0
    if (BufferIsValid(lbuf))
2729
0
    {
2730
0
      page = BufferGetPage(lbuf);
2731
0
      PageSetLSN(page, recptr);
2732
0
    }
2733
0
    if (target != leafblkno)
2734
0
    {
2735
0
      page = BufferGetPage(leafbuf);
2736
0
      PageSetLSN(page, recptr);
2737
0
    }
2738
0
  }
2739
2740
0
  END_CRIT_SECTION();
2741
2742
  /* release metapage */
2743
0
  if (BufferIsValid(metabuf))
2744
0
    _bt_relbuf(rel, metabuf);
2745
2746
  /* release siblings */
2747
0
  if (BufferIsValid(lbuf))
2748
0
    _bt_relbuf(rel, lbuf);
2749
0
  _bt_relbuf(rel, rbuf);
2750
2751
  /* If the target is not leafbuf, we're done with it now -- release it */
2752
0
  if (target != leafblkno)
2753
0
    _bt_relbuf(rel, buf);
2754
2755
  /*
2756
   * Maintain pages_newly_deleted, which is simply the number of pages
2757
   * deleted by the ongoing VACUUM operation.
2758
   *
2759
   * Maintain pages_deleted in a way that takes into account how
2760
   * btvacuumpage() will count deleted pages that have yet to become
2761
   * scanblkno -- only count page when it's not going to get that treatment
2762
   * later on.
2763
   */
2764
0
  stats->pages_newly_deleted++;
2765
0
  if (target <= scanblkno)
2766
0
    stats->pages_deleted++;
2767
2768
  /*
2769
   * Remember information about the target page (now a newly deleted page)
2770
   * in dedicated vstate space for later.  The page will be considered as a
2771
   * candidate to place in the FSM at the end of the current btvacuumscan()
2772
   * call.
2773
   */
2774
0
  _bt_pendingfsm_add(vstate, target, safexid);
2775
2776
  /* Success - hold on to lock on leafbuf (might also have been target) */
2777
0
  return true;
2778
0
}
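
Seen purely in terms of sibling links, stage two is a doubly-linked-list unlink with one twist the nbtree README depends on: the deleted page keeps its own btpo_prev/btpo_next, so a scan holding a stale pointer to it can still step right.  The sketch below shows just that (locking order, WAL logging, the safexid tombstone, and the fast-root update are all omitted; ToyPage is illustrative):

#include <stdio.h>

#define P_NONE  (-1)

typedef struct
{
    int         prev;       /* left sibling block, or P_NONE */
    int         next;       /* right sibling block, or P_NONE */
    int         deleted;    /* tombstone flag */
} ToyPage;

/*
 * Unlink 'target' from its level: left sibling points past it, right
 * sibling points back past it.  The target's own links are left intact,
 * so a stale pointer to it can still recover by following 'next'.
 */
static void
unlink_page(ToyPage *pages, int target)
{
    int         leftsib = pages[target].prev;
    int         rightsib = pages[target].next;

    if (leftsib != P_NONE)
        pages[leftsib].next = rightsib;
    if (rightsib != P_NONE)
        pages[rightsib].prev = leftsib;
    pages[target].deleted = 1;  /* real code also stamps a safexid here */
}

int
main(void)
{
    /* Three pages chained 0 <-> 1 <-> 2; delete the middle one */
    ToyPage     pages[3] = {
        {P_NONE, 1, 0},
        {0, 2, 0},
        {1, P_NONE, 0},
    };

    unlink_page(pages, 1);

    printf("page 0 next=%d, page 2 prev=%d\n", pages[0].next, pages[2].prev);
    printf("deleted page 1 still points next=%d\n", pages[1].next);
    return 0;
}
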
2779
2780 |       | /*
2781 |       |  * Establish how tall the to-be-deleted subtree will be during the first stage
2782 |       |  * of page deletion.
2783 |       |  *
2784 |       |  * Caller's child argument is the block number of the page caller wants to
2785 |       |  * delete (this is leafbuf's block number, except when we're called
2786 |       |  * recursively).  stack is a search stack leading to it.  Note that we will
2787 |       |  * update the stack entry(s) to reflect current downlink positions --- this is
2788 |       |  * similar to the corresponding point in page split handling.
2789 |       |  *
2790 |       |  * If "first stage" caller cannot go ahead with deleting _any_ pages, returns
2791 |       |  * false.  Returns true on success, in which case caller can use certain
2792 |       |  * details established here to perform the first stage of deletion.  This
2793 |       |  * function is the last point at which page deletion may be deemed unsafe
2794 |       |  * (barring index corruption, or unexpected concurrent page deletions).
2795 |       |  *
2796 |       |  * We write lock the parent of the root of the to-be-deleted subtree for
2797 |       |  * caller on success (i.e. we leave our lock on the *subtreeparent buffer for
2798 |       |  * caller).  Caller will have to remove a downlink from *subtreeparent.  We
2799 |       |  * also set a *subtreeparent offset number in *poffset, to indicate the
2800 |       |  * location of the pivot tuple that contains the relevant downlink.
2801 |       |  *
2802 |       |  * The root of the to-be-deleted subtree is called the "top parent".  Note
2803 |       |  * that the leafbuf page is often the final "top parent" page (you can think
2804 |       |  * of the leafbuf page as a degenerate single page subtree when that happens).
2805 |       |  * Caller should initialize *topparent to the target leafbuf page block number
2806 |       |  * (while *topparentrightsib should be set to leafbuf's right sibling block
2807 |       |  * number).  We will update *topparent (and *topparentrightsib) for caller
2808 |       |  * here, though only when it turns out that caller will delete at least one
2809 |       |  * internal page (i.e. only when caller needs to store a valid link to the top
2810 |       |  * parent block in the leafbuf page using BTreeTupleSetTopParent()).
2811 |       |  */
2812 |       | static bool
2813 |       | _bt_lock_subtree_parent(Relation rel, Relation heaprel, BlockNumber child,
2814 |       |             BTStack stack, Buffer *subtreeparent,
2815 |       |             OffsetNumber *poffset, BlockNumber *topparent,
2816 |       |             BlockNumber *topparentrightsib)
2817 |     0 | {
2818 |     0 |   BlockNumber parent,
2819 |     0 |         leftsibparent;
2820 |     0 |   OffsetNumber parentoffset,
2821 |     0 |         maxoff;
2822 |     0 |   Buffer    pbuf;
2823 |     0 |   Page    page;
2824 |     0 |   BTPageOpaque opaque;
2825 |       |
2826 |       |   /*
2827 |       |    * Locate the pivot tuple whose downlink points to "child".  Write lock
2828 |       |    * the parent page itself.
2829 |       |    */
2830 |     0 |   pbuf = _bt_getstackbuf(rel, heaprel, stack, child);
2831 |     0 |   if (pbuf == InvalidBuffer)
2832 |     0 |   {
2833 |       |     /*
2834 |       |      * Failed to "re-find" a pivot tuple whose downlink matched our child
2835 |       |      * block number on the parent level -- the index must be corrupt.
2836 |       |      * Don't even try to delete the leafbuf subtree.  Just report the
2837 |       |      * issue and press on with vacuuming the index.
2838 |       |      *
2839 |       |      * Note: _bt_getstackbuf() recovers from concurrent page splits that
2840 |       |      * take place on the parent level.  Its approach is a near-exhaustive
2841 |       |      * linear search.  This also gives it a surprisingly good chance of
2842 |       |      * recovering in the event of a buggy or inconsistent opclass.  But we
2843 |       |      * don't rely on that here.
2844 |       |      */
2845 |     0 |     ereport(LOG,
2846 |     0 |         (errcode(ERRCODE_INDEX_CORRUPTED),
2847 |     0 |          errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",
2848 |     0 |                  RelationGetRelationName(rel), child)));
2849 |     0 |     Assert(false);
2850 |     0 |     return false;
2851 |     0 |   }
2852 |       |
2853 |     0 |   parent = stack->bts_blkno;
2854 |     0 |   parentoffset = stack->bts_offset;
2855 |       |
2856 |     0 |   page = BufferGetPage(pbuf);
2857 |     0 |   opaque = BTPageGetOpaque(page);
2858 |     0 |   maxoff = PageGetMaxOffsetNumber(page);
2859 |     0 |   leftsibparent = opaque->btpo_prev;
2860 |       |
2861 |       |   /*
2862 |       |    * _bt_getstackbuf() completes page splits on returned parent buffer when
2863 |       |    * required.
2864 |       |    *
2865 |       |    * In general it's a bad idea for VACUUM to use up more disk space, which
2866 |       |    * is why page deletion does not finish incomplete page splits most of the
2867 |       |    * time.  We allow this limited exception because the risk is much lower,
2868 |       |    * and the potential downside of not proceeding is much higher:  A single
2869 |       |    * internal page with the INCOMPLETE_SPLIT flag set might otherwise
2870 |       |    * prevent us from deleting hundreds of empty leaf pages from one level
2871 |       |    * down.
2872 |       |    */
2873 |     0 |   Assert(!P_INCOMPLETE_SPLIT(opaque));
2874 |       |
2875 |     0 |   if (parentoffset < maxoff)
2876 |     0 |   {
2877 |       |     /*
2878 |       |      * Child is not the rightmost child in parent, so it's safe to delete
2879 |       |      * the subtree whose root/topparent is child page
2880 |       |      */
2881 |     0 |     *subtreeparent = pbuf;
2882 |     0 |     *poffset = parentoffset;
2883 |     0 |     return true;
2884 |     0 |   }
2885 |       |
2886 |       |   /*
2887 |       |    * Child is the rightmost child of parent.
2888 |       |    *
2889 |       |    * Since it's the rightmost child of parent, deleting the child (or
2890 |       |    * deleting the subtree whose root/topparent is the child page) is only
2891 |       |    * safe when it's also possible to delete the parent.
2892 |       |    */
2893 |     0 |   Assert(parentoffset == maxoff);
2894 |     0 |   if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))
2895 |     0 |   {
2896 |       |     /*
2897 |       |      * Child isn't parent's only child, or parent is rightmost on its
2898 |       |      * entire level.  Definitely cannot delete any pages.
2899 |       |      */
2900 |     0 |     _bt_relbuf(rel, pbuf);
2901 |     0 |     return false;
2902 |     0 |   }
2903 |       |
2904 |       |   /*
2905 |       |    * Now make sure that the parent deletion is itself safe by examining the
2906 |       |    * child's grandparent page.  Recurse, passing the parent page as the
2907 |       |    * child page (child's grandparent is the parent on the next level up). If
2908 |       |    * parent deletion is unsafe, then child deletion must also be unsafe (in
2909 |       |    * which case caller cannot delete any pages at all).
2910 |       |    */
2911 |     0 |   *topparent = parent;
2912 |     0 |   *topparentrightsib = opaque->btpo_next;
2913 |       |
2914 |       |   /*
2915 |       |    * Release lock on parent before recursing.
2916 |       |    *
2917 |       |    * It's OK to release page locks on parent before recursive call locks
2918 |       |    * grandparent.  An internal page can only acquire an entry if the child
2919 |       |    * is split, but that cannot happen as long as we still hold a lock on the
2920 |       |    * leafbuf page.
2921 |       |    */
2922 |     0 |   _bt_relbuf(rel, pbuf);
2923 |       |
2924 |       |   /*
2925 |       |    * Before recursing, check that the left sibling of parent (if any) is not
2926 |       |    * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the
2927 |       |    * parent lock).
2928 |       |    *
2929 |       |    * Note: We deliberately avoid completing incomplete splits here.
2930 |       |    */
2931 |     0 |   if (_bt_leftsib_splitflag(rel, leftsibparent, parent))
2932 |     0 |     return false;
2933 |       |
2934 |       |   /* Recurse to examine child page's grandparent page */
2935 |     0 |   return _bt_lock_subtree_parent(rel, heaprel, parent, stack->bts_parent,
2936 |     0 |                    subtreeparent, poffset,
2937 |     0 |                    topparent, topparentrightsib);
2938 |     0 | }
2939 |       |
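Editor's note: the recursion in _bt_lock_subtree_parent() amounts to an upward walk that keeps absorbing the parent into the to-be-deleted subtree only while the current top of the subtree is its parent's sole (and therefore rightmost) child and that parent is not the rightmost page of its level. The following is a toy, self-contained sketch of that decision rule only; the toy_page struct and every name in it are hypothetical stand-ins rather than the on-disk nbtree structures, and locking, stack handling, and corruption recovery are deliberately omitted.

#include <stdbool.h>
#include <stddef.h>

struct toy_page
{
    struct toy_page *parent;        /* NULL for the root */
    int         nchildren;          /* number of downlinks on this page */
    int         my_position;        /* 0-based slot among parent's children */
    bool        rightmost;          /* rightmost page of its level? */
};

/*
 * Returns the root ("top parent") of the deletable subtree containing the
 * empty leaf, or NULL when no page can be deleted safely.
 */
struct toy_page *
find_top_parent(struct toy_page *leaf)
{
    struct toy_page *top = leaf;

    for (;;)
    {
        struct toy_page *parent = top->parent;

        if (parent == NULL)
            return NULL;            /* reached the root: give up */

        if (top->my_position < parent->nchildren - 1)
            return top;             /* not parent's rightmost child: safe */

        /* top is the rightmost child, so parent must be deletable as well */
        if (parent->nchildren != 1 || parent->rightmost)
            return NULL;            /* parent stays: nothing can be deleted */

        top = parent;               /* absorb parent, examine grandparent next */
    }
}
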
2940 |       | /*
2941 |       |  * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize
2942 |       |  * optimization.
2943 |       |  *
2944 |       |  * Called at the start of a btvacuumscan().  Caller's cleanuponly argument
2945 |       |  * indicates if ongoing VACUUM has not (and will not) call btbulkdelete().
2946 |       |  *
2947 |       |  * We expect to allocate memory inside VACUUM's top-level memory context here.
2948 |       |  * The working buffer is subject to a limit based on work_mem.  Our strategy
2949 |       |  * when the array can no longer grow within the bounds of that limit is to
2950 |       |  * stop saving additional newly deleted pages, while proceeding as usual with
2951 |       |  * the pages that we can fit.
2952 |       |  */
2953 |       | void
2954 |       | _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)
2955 |     0 | {
2956 |     0 |   Size    maxbufsize;
2957 |       |
2958 |       |   /*
2959 |       |    * Don't bother with optimization in cleanup-only case -- we don't expect
2960 |       |    * any newly deleted pages.  Besides, cleanup-only calls to btvacuumscan()
2961 |       |    * can only take place because this optimization didn't work out during
2962 |       |    * the last VACUUM.
2963 |       |    */
2964 |     0 |   if (cleanuponly)
2965 |     0 |     return;
2966 |       |
2967 |       |   /*
2968 |       |    * Cap maximum size of array so that we always respect work_mem.  Avoid
2969 |       |    * int overflow here.
2970 |       |    */
2971 |     0 |   vstate->bufsize = 256;
2972 |     0 |   maxbufsize = (work_mem * (Size) 1024) / sizeof(BTPendingFSM);
2973 |     0 |   maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));
2974 |       |   /* BTVacState.maxbufsize has type int */
2975 |     0 |   maxbufsize = Min(maxbufsize, INT_MAX);
2976 |       |   /* Stay sane with small work_mem */
2977 |     0 |   maxbufsize = Max(maxbufsize, vstate->bufsize);
2978 |     0 |   vstate->maxbufsize = (int) maxbufsize;
2979 |       |
2980 |       |   /* Allocate buffer, indicate that there are currently 0 pending pages */
2981 |     0 |   vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);
2982 |     0 |   vstate->npendingpages = 0;
2983 |     0 | }
2984 |       |
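Editor's note: to make the capping arithmetic above concrete, here is a back-of-the-envelope sketch computed outside PostgreSQL. The 16-byte entry size (a BlockNumber plus a FullTransactionId, with padding) and the 4096 kB work_mem value (the default) are assumptions for illustration only; the real figures come from sizeof(BTPendingFSM) and the work_mem GUC at run time. With those numbers the array is capped at 262144 entries, well below the MaxAllocSize and INT_MAX ceilings.

#include <limits.h>
#include <stdio.h>

int
main(void)
{
    size_t      entry_size = 16;        /* assumed sizeof(BTPendingFSM) */
    size_t      work_mem_kb = 4096;     /* assumed work_mem (default, in kB) */
    size_t      max_alloc = 0x3fffffff; /* MaxAllocSize, just under 1 GB */
    size_t      bufsize = 256;          /* initial array size */
    size_t      maxbufsize;

    maxbufsize = (work_mem_kb * 1024) / entry_size;     /* 262144 entries */
    if (maxbufsize > max_alloc / entry_size)
        maxbufsize = max_alloc / entry_size;            /* allocation ceiling */
    if (maxbufsize > INT_MAX)
        maxbufsize = INT_MAX;                           /* field is an int */
    if (maxbufsize < bufsize)
        maxbufsize = bufsize;                           /* tiny work_mem */

    printf("cap: %zu pending pages (%zu kB of array)\n",
           maxbufsize, maxbufsize * entry_size / 1024);
    return 0;
}
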
2985 |       | /*
2986 |       |  * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during
2987 |       |  * the ongoing VACUUM operation) into the free space map -- though only when
2988 |       |  * it is actually safe to do so by now.
2989 |       |  *
2990 |       |  * Called at the end of a btvacuumscan(), just before free space map vacuuming
2991 |       |  * takes place.
2992 |       |  *
2993 |       |  * Frees memory allocated by _bt_pendingfsm_init(), if any.
2994 |       |  */
2995 |       | void
2996 |       | _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)
2997 |     0 | {
2998 |     0 |   IndexBulkDeleteResult *stats = vstate->stats;
2999 |     0 |   Relation  heaprel = vstate->info->heaprel;
3000 |       |
3001 |     0 |   Assert(stats->pages_newly_deleted >= vstate->npendingpages);
3002 |     0 |   Assert(heaprel != NULL);
3003 |       |
3004 |     0 |   if (vstate->npendingpages == 0)
3005 |     0 |   {
3006 |       |     /* Just free memory when nothing to do */
3007 |     0 |     if (vstate->pendingpages)
3008 |     0 |       pfree(vstate->pendingpages);
3009 |       |
3010 |     0 |     return;
3011 |     0 |   }
3012 |       |
3013 |       | #ifdef DEBUG_BTREE_PENDING_FSM
3014 |       |
3015 |       |   /*
3016 |       |    * Debugging aid: Sleep for 5 seconds to greatly increase the chances of
3017 |       |    * placing pending pages in the FSM.  Note that the optimization will
3018 |       |    * never be effective without some other backend concurrently consuming an
3019 |       |    * XID.
3020 |       |    */
3021 |       |   pg_usleep(5000000L);
3022 |       | #endif
3023 |       |
3024 |       |   /*
3025 |       |    * Recompute VACUUM XID boundaries.
3026 |       |    *
3027 |       |    * We don't actually care about the oldest non-removable XID.  Computing
3028 |       |    * the oldest such XID has a useful side-effect that we rely on: it
3029 |       |    * forcibly updates the XID horizon state for this backend.  This step is
3030 |       |    * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize
3031 |       |    * that it is now safe to recycle newly deleted pages without this step.
3032 |       |    */
3033 |     0 |   GetOldestNonRemovableTransactionId(heaprel);
3034 |       |
3035 |     0 |   for (int i = 0; i < vstate->npendingpages; i++)
3036 |     0 |   {
3037 |     0 |     BlockNumber target = vstate->pendingpages[i].target;
3038 |     0 |     FullTransactionId safexid = vstate->pendingpages[i].safexid;
3039 |       |
3040 |       |     /*
3041 |       |      * Do the equivalent of checking BTPageIsRecyclable(), but without
3042 |       |      * accessing the page again a second time.
3043 |       |      *
3044 |       |      * Give up on finding the first non-recyclable page -- all later pages
3045 |       |      * must be non-recyclable too, since _bt_pendingfsm_add() adds pages
3046 |       |      * to the array in safexid order.
3047 |       |      */
3048 |     0 |     if (!GlobalVisCheckRemovableFullXid(heaprel, safexid))
3049 |     0 |       break;
3050 |       |
3051 |     0 |     RecordFreeIndexPage(rel, target);
3052 |     0 |     stats->pages_free++;
3053 |     0 |   }
3054 |       |
3055 |     0 |   pfree(vstate->pendingpages);
3056 |     0 | }
3057 |       |
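Editor's note: the loop above relies on the pending array being ordered by safexid, so the first entry that is not yet safe to recycle ends the scan -- every later entry is at least as new. The following sketch shows that early-exit pattern in isolation, with hypothetical types and a printf() standing in for RecordFreeIndexPage(); it is not the backend's visibility machinery.

#include <stdint.h>
#include <stdio.h>

struct pending_page
{
    uint32_t    target;             /* block number to hand to the FSM */
    uint64_t    safexid;            /* XID that must be out of reach first */
};

/* Returns how many leading entries were safe to recycle. */
int
recycle_pending(const struct pending_page *pages, int npages,
                uint64_t oldest_visible_xid)
{
    int         nrecycled = 0;

    for (int i = 0; i < npages; i++)
    {
        if (pages[i].safexid >= oldest_visible_xid)
            break;                  /* not safe yet; later entries are newer */

        /* stand-in for RecordFreeIndexPage(rel, target) */
        printf("recycling block %u\n", (unsigned) pages[i].target);
        nrecycled++;
    }
    return nrecycled;
}
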
3058 |       | /*
3059 |       |  * Maintain array of pages that were deleted during current btvacuumscan()
3060 |       |  * call, for use in _bt_pendingfsm_finalize()
3061 |       |  */
3062 |       | static void
3063 |       | _bt_pendingfsm_add(BTVacState *vstate,
3064 |       |            BlockNumber target,
3065 |       |            FullTransactionId safexid)
3066 |     0 | {
3067 |     0 |   Assert(vstate->npendingpages <= vstate->bufsize);
3068 |     0 |   Assert(vstate->bufsize <= vstate->maxbufsize);
3069 |       |
3070 |       | #ifdef USE_ASSERT_CHECKING
3071 |       |
3072 |       |   /*
3073 |       |    * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the
3074 |       |    * array will always be in safexid order (since that is the order that we
3075 |       |    * save them in here)
3076 |       |    */
3077 |       |   if (vstate->npendingpages > 0)
3078 |       |   {
3079 |       |     FullTransactionId lastsafexid =
3080 |       |       vstate->pendingpages[vstate->npendingpages - 1].safexid;
3081 |       |
3082 |       |     Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));
3083 |       |   }
3084 |       | #endif
3085 |       |
3086 |       |   /*
3087 |       |    * If temp buffer reaches maxbufsize/work_mem capacity then we discard
3088 |       |    * information about this page.
3089 |       |    *
3090 |       |    * Note that this also covers the case where we opted to not use the
3091 |       |    * optimization in _bt_pendingfsm_init().
3092 |       |    */
3093 |     0 |   if (vstate->npendingpages == vstate->maxbufsize)
3094 |     0 |     return;
3095 |       |
3096 |       |   /* Consider enlarging buffer */
3097 |     0 |   if (vstate->npendingpages == vstate->bufsize)
3098 |     0 |   {
3099 |     0 |     int     newbufsize = vstate->bufsize * 2;
3100 |       |
3101 |       |     /* Respect work_mem */
3102 |     0 |     if (newbufsize > vstate->maxbufsize)
3103 |     0 |       newbufsize = vstate->maxbufsize;
3104 |       |
3105 |     0 |     vstate->bufsize = newbufsize;
3106 |     0 |     vstate->pendingpages =
3107 |     0 |       repalloc(vstate->pendingpages,
3108 |     0 |            sizeof(BTPendingFSM) * vstate->bufsize);
3109 |     0 |   }
3110 |       |
3111 |       |   /* Save metadata for newly deleted page */
3112 |     0 |   vstate->pendingpages[vstate->npendingpages].target = target;
3113 |     0 |   vstate->pendingpages[vstate->npendingpages].safexid = safexid;
3114 |     0 |   vstate->npendingpages++;
3115 |     0 | }
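Editor's note: the growth policy above -- geometric doubling, clamped to a precomputed cap, with new entries silently dropped once the cap is hit rather than failing -- is a generic pattern. The sketch below restates it with hypothetical names, plain realloc() in place of repalloc(), and error handling omitted; it is an illustration, not the nbtree code path.

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

struct capped_array
{
    void       *items;
    size_t      itemsz;             /* size of one element */
    int         nitems;             /* elements stored so far */
    int         bufsize;            /* currently allocated slots */
    int         maxbufsize;         /* hard cap (e.g. derived from work_mem) */
};

/* Returns true if the item was saved, false if it had to be dropped. */
bool
capped_array_append(struct capped_array *a, const void *item)
{
    if (a->nitems == a->maxbufsize)
        return false;               /* full: drop the entry, as above */

    if (a->nitems == a->bufsize)
    {
        int         newbufsize = a->bufsize * 2;

        if (newbufsize > a->maxbufsize)
            newbufsize = a->maxbufsize;
        /* error handling omitted; PostgreSQL's repalloc() throws on failure */
        a->items = realloc(a->items, a->itemsz * (size_t) newbufsize);
        a->bufsize = newbufsize;
    }

    memcpy((char *) a->items + a->itemsz * (size_t) a->nitems, item, a->itemsz);
    a->nitems++;
    return true;
}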