Coverage Report

Created: 2025-09-27 06:52

/src/postgres/src/backend/access/nbtree/nbtinsert.c
Line  Count  Source
1
/*-------------------------------------------------------------------------
2
 *
3
 * nbtinsert.c
4
 *    Item insertion in Lehman and Yao btrees for Postgres.
5
 *
6
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7
 * Portions Copyright (c) 1994, Regents of the University of California
8
 *
9
 *
10
 * IDENTIFICATION
11
 *    src/backend/access/nbtree/nbtinsert.c
12
 *
13
 *-------------------------------------------------------------------------
14
 */
15
16
#include "postgres.h"
17
18
#include "access/nbtree.h"
19
#include "access/nbtxlog.h"
20
#include "access/tableam.h"
21
#include "access/transam.h"
22
#include "access/xloginsert.h"
23
#include "common/int.h"
24
#include "common/pg_prng.h"
25
#include "lib/qunique.h"
26
#include "miscadmin.h"
27
#include "storage/lmgr.h"
28
#include "storage/predicate.h"
29
30
/* Minimum tree height for application of fastpath optimization */
31
0
#define BTREE_FASTPATH_MIN_LEVEL  2
32
33
34
static BTStack _bt_search_insert(Relation rel, Relation heaprel,
35
                 BTInsertState insertstate);
36
static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate,
37
                    Relation heapRel,
38
                    IndexUniqueCheck checkUnique, bool *is_unique,
39
                    uint32 *speculativeToken);
40
static OffsetNumber _bt_findinsertloc(Relation rel,
41
                    BTInsertState insertstate,
42
                    bool checkingunique,
43
                    bool indexUnchanged,
44
                    BTStack stack,
45
                    Relation heapRel);
46
static void _bt_stepright(Relation rel, Relation heaprel,
47
              BTInsertState insertstate, BTStack stack);
48
static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key,
49
               Buffer buf,
50
               Buffer cbuf,
51
               BTStack stack,
52
               IndexTuple itup,
53
               Size itemsz,
54
               OffsetNumber newitemoff,
55
               int postingoff,
56
               bool split_only_page);
57
static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key,
58
            Buffer buf, Buffer cbuf, OffsetNumber newitemoff,
59
            Size newitemsz, IndexTuple newitem, IndexTuple orignewitem,
60
            IndexTuple nposting, uint16 postingoff);
61
static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf,
62
                Buffer rbuf, BTStack stack, bool isroot, bool isonly);
63
static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf);
64
static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
65
                OffsetNumber itup_off, bool newfirstdataitem);
66
static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
67
                     BTInsertState insertstate,
68
                     bool simpleonly, bool checkingunique,
69
                     bool uniquedup, bool indexUnchanged);
70
static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
71
                 OffsetNumber *deletable, int ndeletable,
72
                 IndexTuple newitem, OffsetNumber minoff,
73
                 OffsetNumber maxoff);
74
static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable,
75
                   int ndeletable, IndexTuple newitem,
76
                   int *nblocks);
77
static inline int _bt_blk_cmp(const void *arg1, const void *arg2);
78
79
/*
80
 *  _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
81
 *
82
 *    This routine is called by the public interface routine, btinsert.
83
 *    By here, itup is filled in, including the TID.
84
 *
85
 *    If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
86
 *    will allow duplicates.  Otherwise (UNIQUE_CHECK_YES or
87
 *    UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
88
 *    For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
89
 *    don't actually insert.
90
 *
91
 *    indexUnchanged executor hint indicates if itup is from an
92
 *    UPDATE that didn't logically change the indexed value, but
93
 *    must nevertheless have a new entry to point to a successor
94
 *    version.
95
 *
96
 *    The result value is only significant for UNIQUE_CHECK_PARTIAL:
97
 *    it must be true if the entry is known unique, else false.
98
 *    (In the current implementation we'll also return true after a
99
 *    successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but
100
 *    that's just a coding artifact.)
101
 */
102
bool
103
_bt_doinsert(Relation rel, IndexTuple itup,
104
       IndexUniqueCheck checkUnique, bool indexUnchanged,
105
       Relation heapRel)
106
0
{
107
0
  bool    is_unique = false;
108
0
  BTInsertStateData insertstate;
109
0
  BTScanInsert itup_key;
110
0
  BTStack   stack;
111
0
  bool    checkingunique = (checkUnique != UNIQUE_CHECK_NO);
112
113
  /* we need an insertion scan key to do our search, so build one */
114
0
  itup_key = _bt_mkscankey(rel, itup);
115
116
0
  if (checkingunique)
117
0
  {
118
0
    if (!itup_key->anynullkeys)
119
0
    {
120
      /* No (heapkeyspace) scantid until uniqueness established */
121
0
      itup_key->scantid = NULL;
122
0
    }
123
0
    else
124
0
    {
125
      /*
126
       * Scan key for new tuple contains NULL key values.  Bypass
127
       * checkingunique steps.  They are unnecessary because core code
128
       * considers NULL unequal to every value, including NULL.
129
       *
130
       * This optimization avoids O(N^2) behavior within the
131
       * _bt_findinsertloc() heapkeyspace path when a unique index has a
132
       * large number of "duplicates" with NULL key values.
133
       */
134
0
      checkingunique = false;
135
      /* Tuple is unique in the sense that core code cares about */
136
0
      Assert(checkUnique != UNIQUE_CHECK_EXISTING);
137
0
      is_unique = true;
138
0
    }
139
0
  }
140
141
  /*
142
   * Fill in the BTInsertState working area, to track the current page and
143
   * position within the page to insert on.
144
   *
145
   * Note that itemsz is passed down to lower level code that deals with
146
   * inserting the item.  It must be MAXALIGN()'d.  This ensures that space
147
   * accounting code consistently considers the alignment overhead that we
148
   * expect PageAddItem() will add later.  (Actually, index_form_tuple() is
149
   * already conservative about alignment, but we don't rely on that from
150
   * this distance.  Besides, preserving the "true" tuple size in index
151
   * tuple headers for the benefit of nbtsplitloc.c might happen someday.
152
   * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.)
153
   */
154
0
  insertstate.itup = itup;
155
0
  insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
156
0
  insertstate.itup_key = itup_key;
157
0
  insertstate.bounds_valid = false;
158
0
  insertstate.buf = InvalidBuffer;
159
0
  insertstate.postingoff = 0;
160
161
0
search:
162
163
  /*
164
   * Find and lock the leaf page that the tuple should be added to by
165
   * searching from the root page.  insertstate.buf will hold a buffer that
166
   * is locked in exclusive mode afterwards.
167
   */
168
0
  stack = _bt_search_insert(rel, heapRel, &insertstate);
169
170
  /*
171
   * checkingunique inserts are not allowed to go ahead when two tuples with
172
   * equal key attribute values would be visible to new MVCC snapshots once
173
   * the xact commits.  Check for conflicts in the locked page/buffer (if
174
   * needed) here.
175
   *
176
   * It might be necessary to check a page to the right in _bt_check_unique,
177
   * though that should be very rare.  In practice the first page the value
178
   * could be on (with scantid omitted) is almost always also the only page
179
   * that a matching tuple might be found on.  This is due to the behavior
180
   * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can
181
   * only be allowed to cross a page boundary when there is no candidate
182
   * leaf page split point that avoids it.  Also, _bt_check_unique can use
183
   * the leaf page high key to determine that there will be no duplicates on
184
   * the right sibling without actually visiting it (it uses the high key in
185
   * cases where the new item happens to belong at the far right of the leaf
186
   * page).
187
   *
188
   * NOTE: obviously, _bt_check_unique can only detect keys that are already
189
   * in the index; so it cannot defend against concurrent insertions of the
190
   * same key.  We protect against that by means of holding a write lock on
191
   * the first page the value could be on, with omitted/-inf value for the
192
   * implicit heap TID tiebreaker attribute.  Any other would-be inserter of
193
   * the same key must acquire a write lock on the same page, so only one
194
   * would-be inserter can be making the check at one time.  Furthermore,
195
   * once we are past the check we hold write locks continuously until we
196
   * have performed our insertion, so no later inserter can fail to see our
197
   * insertion.  (This requires some care in _bt_findinsertloc.)
198
   *
199
   * If we must wait for another xact, we release the lock while waiting,
200
   * and then must perform a new search.
201
   *
202
   * For a partial uniqueness check, we don't wait for the other xact. Just
203
   * let the tuple in and return false for possibly non-unique, or true for
204
   * definitely unique.
205
   */
206
0
  if (checkingunique)
207
0
  {
208
0
    TransactionId xwait;
209
0
    uint32    speculativeToken;
210
211
0
    xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique,
212
0
                 &is_unique, &speculativeToken);
213
214
0
    if (unlikely(TransactionIdIsValid(xwait)))
215
0
    {
216
      /* Have to wait for the other guy ... */
217
0
      _bt_relbuf(rel, insertstate.buf);
218
0
      insertstate.buf = InvalidBuffer;
219
220
      /*
221
       * If it's a speculative insertion, wait for it to finish (ie. to
222
       * go ahead with the insertion, or kill the tuple).  Otherwise
223
       * wait for the transaction to finish as usual.
224
       */
225
0
      if (speculativeToken)
226
0
        SpeculativeInsertionWait(xwait, speculativeToken);
227
0
      else
228
0
        XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex);
229
230
      /* start over... */
231
0
      if (stack)
232
0
        _bt_freestack(stack);
233
0
      goto search;
234
0
    }
235
236
    /* Uniqueness is established -- restore heap tid as scantid */
237
0
    if (itup_key->heapkeyspace)
238
0
      itup_key->scantid = &itup->t_tid;
239
0
  }
240
241
0
  if (checkUnique != UNIQUE_CHECK_EXISTING)
242
0
  {
243
0
    OffsetNumber newitemoff;
244
245
    /*
246
     * The only conflict predicate locking cares about for indexes is when
247
     * an index tuple insert conflicts with an existing lock.  We don't
248
     * know the actual page we're going to insert on for sure just yet in
249
     * checkingunique and !heapkeyspace cases, but it's okay to use the
250
     * first page the value could be on (with scantid omitted) instead.
251
     */
252
0
    CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf));
253
254
    /*
255
     * Do the insertion.  Note that insertstate contains cached binary
256
     * search bounds established within _bt_check_unique when insertion is
257
     * checkingunique.
258
     */
259
0
    newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
260
0
                     indexUnchanged, stack, heapRel);
261
0
    _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
262
0
             stack, itup, insertstate.itemsz, newitemoff,
263
0
             insertstate.postingoff, false);
264
0
  }
265
0
  else
266
0
  {
267
    /* just release the buffer */
268
0
    _bt_relbuf(rel, insertstate.buf);
269
0
  }
270
271
  /* be tidy */
272
0
  if (stack)
273
0
    _bt_freestack(stack);
274
0
  pfree(itup_key);
275
276
0
  return is_unique;
277
0
}
278
279
/*
280
 *  _bt_search_insert() -- _bt_search() wrapper for inserts
281
 *
282
 * Search the tree for a particular scankey, or more precisely for the first
283
 * leaf page it could be on.  Try to make use of the fastpath optimization's
284
 * rightmost leaf page cache before actually searching the tree from the root
285
 * page, though.
286
 *
287
 * Return value is a stack of parent-page pointers (though see notes about
288
 * fastpath optimization and page splits below).  insertstate->buf is set to
289
 * the address of the leaf-page buffer, which is write-locked and pinned in
290
 * all cases (if necessary by creating a new empty root page for caller).
291
 *
292
 * The fastpath optimization avoids most of the work of searching the tree
293
 * repeatedly when a single backend inserts successive new tuples on the
294
 * rightmost leaf page of an index.  A backend cache of the rightmost leaf
295
 * page is maintained within _bt_insertonpg(), and used here.  The cache is
296
 * invalidated here when an insert of a non-pivot tuple must take place on a
297
 * non-rightmost leaf page.
298
 *
299
 * The optimization helps with indexes on an auto-incremented field.  It also
300
 * helps with indexes on datetime columns, as well as indexes with lots of
301
 * NULL values.  (NULLs usually get inserted in the rightmost page for single
302
 * column indexes, since they usually get treated as coming after everything
303
 * else in the key space.  Individual NULL tuples will generally be placed on
304
 * the rightmost leaf page due to the influence of the heap TID column.)
305
 *
306
 * Note that we avoid applying the optimization when there is insufficient
307
 * space on the rightmost page to fit caller's new item.  This is necessary
308
 * because we'll need to return a real descent stack when a page split is
309
 * expected (actually, caller can cope with a leaf page split that uses a NULL
310
 * stack, but that's very slow and so must be avoided).  Note also that the
311
 * fastpath optimization acquires the lock on the page conditionally as a way
312
 * of reducing extra contention when there are concurrent insertions into the
313
 * rightmost page (we give up if we'd have to wait for the lock).  We assume
314
 * that it isn't useful to apply the optimization when there is contention,
315
 * since each per-backend cache won't stay valid for long.
316
 */
317
static BTStack
318
_bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate)
319
0
{
320
0
  Assert(insertstate->buf == InvalidBuffer);
321
0
  Assert(!insertstate->bounds_valid);
322
0
  Assert(insertstate->postingoff == 0);
323
324
0
  if (RelationGetTargetBlock(rel) != InvalidBlockNumber)
325
0
  {
326
    /* Simulate a _bt_getbuf() call with conditional locking */
327
0
    insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
328
0
    if (_bt_conditionallockbuf(rel, insertstate->buf))
329
0
    {
330
0
      Page    page;
331
0
      BTPageOpaque opaque;
332
333
0
      _bt_checkpage(rel, insertstate->buf);
334
0
      page = BufferGetPage(insertstate->buf);
335
0
      opaque = BTPageGetOpaque(page);
336
337
      /*
338
       * Check if the page is still the rightmost leaf page and has
339
       * enough free space to accommodate the new tuple.  Also check
340
       * that the insertion scan key is strictly greater than the first
341
       * non-pivot tuple on the page.  (Note that we expect itup_key's
342
       * scantid to be unset when our caller is a checkingunique
343
       * inserter.)
344
       */
345
0
      if (P_RIGHTMOST(opaque) &&
346
0
        P_ISLEAF(opaque) &&
347
0
        !P_IGNORE(opaque) &&
348
0
        PageGetFreeSpace(page) > insertstate->itemsz &&
349
0
        PageGetMaxOffsetNumber(page) >= P_HIKEY &&
350
0
        _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0)
351
0
      {
352
        /*
353
         * Caller can use the fastpath optimization because cached
354
         * block is still rightmost leaf page, which can fit caller's
355
         * new tuple without splitting.  Keep block in local cache for
356
         * next insert, and have caller use NULL stack.
357
         *
358
         * Note that _bt_insert_parent() has an assertion that catches
359
         * leaf page splits that somehow follow from a fastpath insert
360
         * (it should only be passed a NULL stack when it must deal
361
         * with a concurrent root page split, and never because a NULL
362
         * stack was returned here).
363
         */
364
0
        return NULL;
365
0
      }
366
367
      /* Page unsuitable for caller, drop lock and pin */
368
0
      _bt_relbuf(rel, insertstate->buf);
369
0
    }
370
0
    else
371
0
    {
372
      /* Lock unavailable, drop pin */
373
0
      ReleaseBuffer(insertstate->buf);
374
0
    }
375
376
    /* Forget block, since cache doesn't appear to be useful */
377
0
    RelationSetTargetBlock(rel, InvalidBlockNumber);
378
0
  }
379
380
  /* Cannot use optimization -- descend tree, return proper descent stack */
381
0
  return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf,
382
0
            BT_WRITE);
383
0
}
384
385
/*
386
 *  _bt_check_unique() -- Check for violation of unique index constraint
387
 *
388
 * Returns InvalidTransactionId if there is no conflict, else an xact ID
389
 * we must wait for to see if it commits a conflicting tuple.   If an actual
390
 * conflict is detected, no return --- just ereport().  If an xact ID is
391
 * returned, and the conflicting tuple still has a speculative insertion in
392
 * progress, *speculativeToken is set to non-zero, and the caller can wait for
393
 * the verdict on the insertion using SpeculativeInsertionWait().
394
 *
395
 * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
396
 * InvalidTransactionId because we don't want to wait.  In this case we
397
 * set *is_unique to false if there is a potential conflict, and the
398
 * core code must redo the uniqueness check later.
399
 *
400
 * As a side-effect, sets state in insertstate that can later be used by
401
 * _bt_findinsertloc() to reuse most of the binary search work we do
402
 * here.
403
 *
404
 * This code treats NULLs as equal, unlike the default semantics for unique
405
 * indexes.  So do not call here when there are NULL values in scan key and
406
 * the index uses the default NULLS DISTINCT mode.
407
 */
408
static TransactionId
409
_bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
410
         IndexUniqueCheck checkUnique, bool *is_unique,
411
         uint32 *speculativeToken)
412
0
{
413
0
  IndexTuple  itup = insertstate->itup;
414
0
  IndexTuple  curitup = NULL;
415
0
  ItemId    curitemid = NULL;
416
0
  BTScanInsert itup_key = insertstate->itup_key;
417
0
  SnapshotData SnapshotDirty;
418
0
  OffsetNumber offset;
419
0
  OffsetNumber maxoff;
420
0
  Page    page;
421
0
  BTPageOpaque opaque;
422
0
  Buffer    nbuf = InvalidBuffer;
423
0
  bool    found = false;
424
0
  bool    inposting = false;
425
0
  bool    prevalldead = true;
426
0
  int     curposti = 0;
427
428
  /* Assume unique until we find a duplicate */
429
0
  *is_unique = true;
430
431
0
  InitDirtySnapshot(SnapshotDirty);
432
433
0
  page = BufferGetPage(insertstate->buf);
434
0
  opaque = BTPageGetOpaque(page);
435
0
  maxoff = PageGetMaxOffsetNumber(page);
436
437
  /*
438
   * Find the first tuple with the same key.
439
   *
440
   * This also saves the binary search bounds in insertstate.  We use them
441
   * in the fastpath below, but also in the _bt_findinsertloc() call later.
442
   */
443
0
  Assert(!insertstate->bounds_valid);
444
0
  offset = _bt_binsrch_insert(rel, insertstate);
445
446
  /*
447
   * Scan over all equal tuples, looking for live conflicts.
448
   */
449
0
  Assert(!insertstate->bounds_valid || insertstate->low == offset);
450
0
  Assert(!itup_key->anynullkeys);
451
0
  Assert(itup_key->scantid == NULL);
452
0
  for (;;)
453
0
  {
454
    /*
455
     * Each iteration of the loop processes one heap TID, not one index
456
     * tuple.  Current offset number for page isn't usually advanced on
457
     * iterations that process heap TIDs from posting list tuples.
458
     *
459
     * "inposting" state is set when _inside_ a posting list --- not when
460
     * we're at the start (or end) of a posting list.  We advance curposti
461
     * at the end of the iteration when inside a posting list tuple.  In
462
     * general, every loop iteration either advances the page offset or
463
     * advances curposti --- an iteration that handles the rightmost/max
464
     * heap TID in a posting list finally advances the page offset (and
465
     * unsets "inposting").
466
     *
467
     * Make sure the offset points to an actual index tuple before trying
468
     * to examine it...
469
     */
470
0
    if (offset <= maxoff)
471
0
    {
472
      /*
473
       * Fastpath: In most cases, we can use cached search bounds to
474
       * limit our consideration to items that are definitely
475
       * duplicates.  This fastpath doesn't apply when the original page
476
       * is empty, or when initial offset is past the end of the
477
       * original page, which may indicate that we need to examine a
478
       * second or subsequent page.
479
       *
480
       * Note that this optimization allows us to avoid calling
481
       * _bt_compare() directly when there are no duplicates, as long as
482
       * the offset where the key will go is not at the end of the page.
483
       */
484
0
      if (nbuf == InvalidBuffer && offset == insertstate->stricthigh)
485
0
      {
486
0
        Assert(insertstate->bounds_valid);
487
0
        Assert(insertstate->low >= P_FIRSTDATAKEY(opaque));
488
0
        Assert(insertstate->low <= insertstate->stricthigh);
489
0
        Assert(_bt_compare(rel, itup_key, page, offset) < 0);
490
0
        break;
491
0
      }
492
493
      /*
494
       * We can skip items that are already marked killed.
495
       *
496
       * In the presence of heavy update activity an index may contain
497
       * many killed items with the same key; running _bt_compare() on
498
       * each killed item gets expensive.  Just advance over killed
499
       * items as quickly as we can.  We only apply _bt_compare() when
500
       * we get to a non-killed item.  We could reuse the bounds to
501
       * avoid _bt_compare() calls for known equal tuples, but it
502
       * doesn't seem worth it.
503
       */
504
0
      if (!inposting)
505
0
        curitemid = PageGetItemId(page, offset);
506
0
      if (inposting || !ItemIdIsDead(curitemid))
507
0
      {
508
0
        ItemPointerData htid;
509
0
        bool    all_dead = false;
510
511
0
        if (!inposting)
512
0
        {
513
          /* Plain tuple, or first TID in posting list tuple */
514
0
          if (_bt_compare(rel, itup_key, page, offset) != 0)
515
0
            break; /* we're past all the equal tuples */
516
517
          /* Advanced curitup */
518
0
          curitup = (IndexTuple) PageGetItem(page, curitemid);
519
0
          Assert(!BTreeTupleIsPivot(curitup));
520
0
        }
521
522
        /* okay, we gotta fetch the heap tuple using htid ... */
523
0
        if (!BTreeTupleIsPosting(curitup))
524
0
        {
525
          /* ... htid is from simple non-pivot tuple */
526
0
          Assert(!inposting);
527
0
          htid = curitup->t_tid;
528
0
        }
529
0
        else if (!inposting)
530
0
        {
531
          /* ... htid is first TID in new posting list */
532
0
          inposting = true;
533
0
          prevalldead = true;
534
0
          curposti = 0;
535
0
          htid = *BTreeTupleGetPostingN(curitup, 0);
536
0
        }
537
0
        else
538
0
        {
539
          /* ... htid is second or subsequent TID in posting list */
540
0
          Assert(curposti > 0);
541
0
          htid = *BTreeTupleGetPostingN(curitup, curposti);
542
0
        }
543
544
        /*
545
         * If we are doing a recheck, we expect to find the tuple we
546
         * are rechecking.  It's not a duplicate, but we have to keep
547
         * scanning.
548
         */
549
0
        if (checkUnique == UNIQUE_CHECK_EXISTING &&
550
0
          ItemPointerCompare(&htid, &itup->t_tid) == 0)
551
0
        {
552
0
          found = true;
553
0
        }
554
555
        /*
556
         * Check if there's any table tuples for this index entry
557
         * satisfying SnapshotDirty. This is necessary because for AMs
558
         * with optimizations like heap's HOT, we have just a single
559
         * index entry for the entire chain.
560
         */
561
0
        else if (table_index_fetch_tuple_check(heapRel, &htid,
562
0
                             &SnapshotDirty,
563
0
                             &all_dead))
564
0
        {
565
0
          TransactionId xwait;
566
567
          /*
568
           * It is a duplicate. If we are only doing a partial
569
           * check, then don't bother checking if the tuple is being
570
           * updated in another transaction. Just return the fact
571
           * that it is a potential conflict and leave the full
572
           * check till later. Don't invalidate binary search
573
           * bounds.
574
           */
575
0
          if (checkUnique == UNIQUE_CHECK_PARTIAL)
576
0
          {
577
0
            if (nbuf != InvalidBuffer)
578
0
              _bt_relbuf(rel, nbuf);
579
0
            *is_unique = false;
580
0
            return InvalidTransactionId;
581
0
          }
582
583
          /*
584
           * If this tuple is being updated by other transaction
585
           * then we have to wait for its commit/abort.
586
           */
587
0
          xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ?
588
0
            SnapshotDirty.xmin : SnapshotDirty.xmax;
589
590
0
          if (TransactionIdIsValid(xwait))
591
0
          {
592
0
            if (nbuf != InvalidBuffer)
593
0
              _bt_relbuf(rel, nbuf);
594
            /* Tell _bt_doinsert to wait... */
595
0
            *speculativeToken = SnapshotDirty.speculativeToken;
596
            /* Caller releases lock on buf immediately */
597
0
            insertstate->bounds_valid = false;
598
0
            return xwait;
599
0
          }
600
601
          /*
602
           * Otherwise we have a definite conflict.  But before
603
           * complaining, look to see if the tuple we want to insert
604
           * is itself now committed dead --- if so, don't complain.
605
           * This is a waste of time in normal scenarios but we must
606
           * do it to support CREATE INDEX CONCURRENTLY.
607
           *
608
           * We must follow HOT-chains here because during
609
           * concurrent index build, we insert the root TID though
610
           * the actual tuple may be somewhere in the HOT-chain.
611
           * While following the chain we might not stop at the
612
           * exact tuple which triggered the insert, but that's OK
613
           * because if we find a live tuple anywhere in this chain,
614
           * we have a unique key conflict.  The other live tuple is
615
           * not part of this chain because it had a different index
616
           * entry.
617
           */
618
0
          htid = itup->t_tid;
619
0
          if (table_index_fetch_tuple_check(heapRel, &htid,
620
0
                            SnapshotSelf, NULL))
621
0
          {
622
            /* Normal case --- it's still live */
623
0
          }
624
0
          else
625
0
          {
626
            /*
627
             * It's been deleted, so no error, and no need to
628
             * continue searching
629
             */
630
0
            break;
631
0
          }
632
633
          /*
634
           * Check for a conflict-in as we would if we were going to
635
           * write to this page.  We aren't actually going to write,
636
           * but we want a chance to report SSI conflicts that would
637
           * otherwise be masked by this unique constraint
638
           * violation.
639
           */
640
0
          CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf));
641
642
          /*
643
           * This is a definite conflict.  Break the tuple down into
644
           * datums and report the error.  But first, make sure we
645
           * release the buffer locks we're holding ---
646
           * BuildIndexValueDescription could make catalog accesses,
647
           * which in the worst case might touch this same index and
648
           * cause deadlocks.
649
           */
650
0
          if (nbuf != InvalidBuffer)
651
0
            _bt_relbuf(rel, nbuf);
652
0
          _bt_relbuf(rel, insertstate->buf);
653
0
          insertstate->buf = InvalidBuffer;
654
0
          insertstate->bounds_valid = false;
655
656
0
          {
657
0
            Datum   values[INDEX_MAX_KEYS];
658
0
            bool    isnull[INDEX_MAX_KEYS];
659
0
            char     *key_desc;
660
661
0
            index_deform_tuple(itup, RelationGetDescr(rel),
662
0
                       values, isnull);
663
664
0
            key_desc = BuildIndexValueDescription(rel, values,
665
0
                                isnull);
666
667
0
            ereport(ERROR,
668
0
                (errcode(ERRCODE_UNIQUE_VIOLATION),
669
0
                 errmsg("duplicate key value violates unique constraint \"%s\"",
670
0
                    RelationGetRelationName(rel)),
671
0
                 key_desc ? errdetail("Key %s already exists.",
672
0
                            key_desc) : 0,
673
0
                 errtableconstraint(heapRel,
674
0
                          RelationGetRelationName(rel))));
675
0
          }
676
0
        }
677
0
        else if (all_dead && (!inposting ||
678
0
                    (prevalldead &&
679
0
                     curposti == BTreeTupleGetNPosting(curitup) - 1)))
680
0
        {
681
          /*
682
           * The conflicting tuple (or all HOT chains pointed to by
683
           * all posting list TIDs) is dead to everyone, so mark the
684
           * index entry killed.
685
           */
686
0
          ItemIdMarkDead(curitemid);
687
0
          opaque->btpo_flags |= BTP_HAS_GARBAGE;
688
689
          /*
690
           * Mark buffer with a dirty hint, since state is not
691
           * crucial. Be sure to mark the proper buffer dirty.
692
           */
693
0
          if (nbuf != InvalidBuffer)
694
0
            MarkBufferDirtyHint(nbuf, true);
695
0
          else
696
0
            MarkBufferDirtyHint(insertstate->buf, true);
697
0
        }
698
699
        /*
700
         * Remember if posting list tuple has even a single HOT chain
701
         * whose members are not all dead
702
         */
703
0
        if (!all_dead && inposting)
704
0
          prevalldead = false;
705
0
      }
706
0
    }
707
708
0
    if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1)
709
0
    {
710
      /* Advance to next TID in same posting list */
711
0
      curposti++;
712
0
      continue;
713
0
    }
714
0
    else if (offset < maxoff)
715
0
    {
716
      /* Advance to next tuple */
717
0
      curposti = 0;
718
0
      inposting = false;
719
0
      offset = OffsetNumberNext(offset);
720
0
    }
721
0
    else
722
0
    {
723
0
      int     highkeycmp;
724
725
      /* If scankey == hikey we gotta check the next page too */
726
0
      if (P_RIGHTMOST(opaque))
727
0
        break;
728
0
      highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY);
729
0
      Assert(highkeycmp <= 0);
730
0
      if (highkeycmp != 0)
731
0
        break;
732
      /* Advance to next non-dead page --- there must be one */
733
0
      for (;;)
734
0
      {
735
0
        BlockNumber nblkno = opaque->btpo_next;
736
737
0
        nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ);
738
0
        page = BufferGetPage(nbuf);
739
0
        opaque = BTPageGetOpaque(page);
740
0
        if (!P_IGNORE(opaque))
741
0
          break;
742
0
        if (P_RIGHTMOST(opaque))
743
0
          elog(ERROR, "fell off the end of index \"%s\"",
744
0
             RelationGetRelationName(rel));
745
0
      }
746
      /* Will also advance to next tuple */
747
0
      curposti = 0;
748
0
      inposting = false;
749
0
      maxoff = PageGetMaxOffsetNumber(page);
750
0
      offset = P_FIRSTDATAKEY(opaque);
751
      /* Don't invalidate binary search bounds */
752
0
    }
753
0
  }
754
755
  /*
756
   * If we are doing a recheck then we should have found the tuple we are
757
   * checking.  Otherwise there's something very wrong --- probably, the
758
   * index is on a non-immutable expression.
759
   */
760
0
  if (checkUnique == UNIQUE_CHECK_EXISTING && !found)
761
0
    ereport(ERROR,
762
0
        (errcode(ERRCODE_INTERNAL_ERROR),
763
0
         errmsg("failed to re-find tuple within index \"%s\"",
764
0
            RelationGetRelationName(rel)),
765
0
         errhint("This may be because of a non-immutable index expression."),
766
0
         errtableconstraint(heapRel,
767
0
                  RelationGetRelationName(rel))));
768
769
0
  if (nbuf != InvalidBuffer)
770
0
    _bt_relbuf(rel, nbuf);
771
772
0
  return InvalidTransactionId;
773
0
}
774
775
776
/*
777
 *  _bt_findinsertloc() -- Finds an insert location for a tuple
778
 *
779
 *    On entry, insertstate buffer contains the page the new tuple belongs
780
 *    on.  It is exclusive-locked and pinned by the caller.
781
 *
782
 *    If 'checkingunique' is true, the buffer on entry is the first page
783
 *    that contains duplicates of the new key.  If there are duplicates on
784
 *    multiple pages, the correct insertion position might be some page to
785
 *    the right, rather than the first page.  In that case, this function
786
 *    moves right to the correct target page.
787
 *
788
 *    (In a !heapkeyspace index, there can be multiple pages with the same
789
 *    high key, where the new tuple could legitimately be placed on.  In
790
 *    that case, the caller passes the first page containing duplicates,
791
 *    just like when checkingunique=true.  If that page doesn't have enough
792
 *    room for the new tuple, this function moves right, trying to find a
793
 *    legal page that does.)
794
 *
795
 *    If 'indexUnchanged' is true, this is for an UPDATE that didn't
796
 *    logically change the indexed value, but must nevertheless have a new
797
 *    entry to point to a successor version.  This hint from the executor
798
 *    will influence our behavior when the page might have to be split and
799
 *    we must consider our options.  Bottom-up index deletion can avoid
800
 *    pathological version-driven page splits, but we only want to go to the
801
 *    trouble of trying it when we already have moderate confidence that
802
 *    it's appropriate.  The hint should not significantly affect our
803
 *    behavior over time unless practically all inserts on to the leaf page
804
 *    get the hint.
805
 *
806
 *    On exit, insertstate buffer contains the chosen insertion page, and
807
 *    the offset within that page is returned.  If _bt_findinsertloc needed
808
 *    to move right, the lock and pin on the original page are released, and
809
 *    the new buffer is exclusively locked and pinned instead.
810
 *
811
 *    If insertstate contains cached binary search bounds, we will take
812
 *    advantage of them.  This avoids repeating comparisons that we made in
813
 *    _bt_check_unique() already.
814
 */
815
static OffsetNumber
816
_bt_findinsertloc(Relation rel,
817
          BTInsertState insertstate,
818
          bool checkingunique,
819
          bool indexUnchanged,
820
          BTStack stack,
821
          Relation heapRel)
822
0
{
823
0
  BTScanInsert itup_key = insertstate->itup_key;
824
0
  Page    page = BufferGetPage(insertstate->buf);
825
0
  BTPageOpaque opaque;
826
0
  OffsetNumber newitemoff;
827
828
0
  opaque = BTPageGetOpaque(page);
829
830
  /* Check 1/3 of a page restriction */
831
0
  if (unlikely(insertstate->itemsz > BTMaxItemSize))
832
0
    _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page,
833
0
               insertstate->itup);
834
835
0
  Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque));
836
0
  Assert(!insertstate->bounds_valid || checkingunique);
837
0
  Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL);
838
0
  Assert(itup_key->heapkeyspace || itup_key->scantid == NULL);
839
0
  Assert(!itup_key->allequalimage || itup_key->heapkeyspace);
840
841
0
  if (itup_key->heapkeyspace)
842
0
  {
843
    /* Keep track of whether checkingunique duplicate seen */
844
0
    bool    uniquedup = indexUnchanged;
845
846
    /*
847
     * If we're inserting into a unique index, we may have to walk right
848
     * through leaf pages to find the one leaf page that we must insert on
849
     * to.
850
     *
851
     * This is needed for checkingunique callers because a scantid was not
852
     * used when we called _bt_search().  scantid can only be set after
853
     * _bt_check_unique() has checked for duplicates.  The buffer
854
     * initially stored in insertstate->buf has the page where the first
855
     * duplicate key might be found, which isn't always the page that new
856
     * tuple belongs on.  The heap TID attribute for new tuple (scantid)
857
     * could force us to insert on a sibling page, though that should be
858
     * very rare in practice.
859
     */
860
0
    if (checkingunique)
861
0
    {
862
0
      if (insertstate->low < insertstate->stricthigh)
863
0
      {
864
        /* Encountered a duplicate in _bt_check_unique() */
865
0
        Assert(insertstate->bounds_valid);
866
0
        uniquedup = true;
867
0
      }
868
869
0
      for (;;)
870
0
      {
871
        /*
872
         * Does the new tuple belong on this page?
873
         *
874
         * The earlier _bt_check_unique() call may well have
875
         * established a strict upper bound on the offset for the new
876
         * item.  If it's not the last item of the page (i.e. if there
877
         * is at least one tuple on the page that goes after the tuple
878
         * we're inserting) then we know that the tuple belongs on
879
         * this page.  We can skip the high key check.
880
         */
881
0
        if (insertstate->bounds_valid &&
882
0
          insertstate->low <= insertstate->stricthigh &&
883
0
          insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
884
0
          break;
885
886
        /* Test '<=', not '!=', since scantid is set now */
887
0
        if (P_RIGHTMOST(opaque) ||
888
0
          _bt_compare(rel, itup_key, page, P_HIKEY) <= 0)
889
0
          break;
890
891
0
        _bt_stepright(rel, heapRel, insertstate, stack);
892
        /* Update local state after stepping right */
893
0
        page = BufferGetPage(insertstate->buf);
894
0
        opaque = BTPageGetOpaque(page);
895
        /* Assume duplicates (if checkingunique) */
896
0
        uniquedup = true;
897
0
      }
898
0
    }
899
900
    /*
901
     * If the target page cannot fit newitem, try to avoid splitting the
902
     * page on insert by performing deletion or deduplication now
903
     */
904
0
    if (PageGetFreeSpace(page) < insertstate->itemsz)
905
0
      _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false,
906
0
                     checkingunique, uniquedup,
907
0
                     indexUnchanged);
908
0
  }
909
0
  else
910
0
  {
911
    /*----------
912
     * This is a !heapkeyspace (version 2 or 3) index.  The current page
913
     * is the first page that we could insert the new tuple to, but there
914
     * may be other pages to the right that we could opt to use instead.
915
     *
916
     * If the new key is equal to one or more existing keys, we can
917
     * legitimately place it anywhere in the series of equal keys.  In
918
     * fact, if the new key is equal to the page's "high key" we can place
919
     * it on the next page.  If it is equal to the high key, and there's
920
     * not room to insert the new tuple on the current page without
921
     * splitting, then we move right hoping to find more free space and
922
     * avoid a split.
923
     *
924
     * Keep scanning right until we
925
     *    (a) find a page with enough free space,
926
     *    (b) reach the last page where the tuple can legally go, or
927
     *    (c) get tired of searching.
928
     * (c) is not flippant; it is important because if there are many
929
     * pages' worth of equal keys, it's better to split one of the early
930
     * pages than to scan all the way to the end of the run of equal keys
931
     * on every insert.  We implement "get tired" as a random choice,
932
     * since stopping after scanning a fixed number of pages wouldn't work
933
     * well (we'd never reach the right-hand side of previously split
934
     * pages).  The probability of moving right is set at 0.99, which may
935
     * seem too high to change the behavior much, but it does an excellent
936
     * job of preventing O(N^2) behavior with many equal keys.
937
     *----------
938
     */
939
0
    while (PageGetFreeSpace(page) < insertstate->itemsz)
940
0
    {
941
      /*
942
       * Before considering moving right, see if we can obtain enough
943
       * space by erasing LP_DEAD items
944
       */
945
0
      if (P_HAS_GARBAGE(opaque))
946
0
      {
947
        /* Perform simple deletion */
948
0
        _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
949
0
                       false, false, false);
950
951
0
        if (PageGetFreeSpace(page) >= insertstate->itemsz)
952
0
          break;   /* OK, now we have enough space */
953
0
      }
954
955
      /*
956
       * Nope, so check conditions (b) and (c) enumerated above
957
       *
958
       * The earlier _bt_check_unique() call may well have established a
959
       * strict upper bound on the offset for the new item.  If it's not
960
       * the last item of the page (i.e. if there is at least one tuple
961
       * on the page that's greater than the tuple we're inserting to)
962
       * then we know that the tuple belongs on this page.  We can skip
963
       * the high key check.
964
       */
965
0
      if (insertstate->bounds_valid &&
966
0
        insertstate->low <= insertstate->stricthigh &&
967
0
        insertstate->stricthigh <= PageGetMaxOffsetNumber(page))
968
0
        break;
969
970
0
      if (P_RIGHTMOST(opaque) ||
971
0
        _bt_compare(rel, itup_key, page, P_HIKEY) != 0 ||
972
0
        pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100))
973
0
        break;
974
975
0
      _bt_stepright(rel, heapRel, insertstate, stack);
976
      /* Update local state after stepping right */
977
0
      page = BufferGetPage(insertstate->buf);
978
0
      opaque = BTPageGetOpaque(page);
979
0
    }
980
0
  }
981
982
  /*
983
   * We should now be on the correct page.  Find the offset within the page
984
   * for the new tuple. (Possibly reusing earlier search bounds.)
985
   */
986
0
  Assert(P_RIGHTMOST(opaque) ||
987
0
       _bt_compare(rel, itup_key, page, P_HIKEY) <= 0);
988
989
0
  newitemoff = _bt_binsrch_insert(rel, insertstate);
990
991
0
  if (insertstate->postingoff == -1)
992
0
  {
993
    /*
994
     * There is an overlapping posting list tuple with its LP_DEAD bit
995
     * set.  We don't want to unnecessarily unset its LP_DEAD bit while
996
     * performing a posting list split, so perform simple index tuple
997
     * deletion early.
998
     */
999
0
    _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true,
1000
0
                   false, false, false);
1001
1002
    /*
1003
     * Do new binary search.  New insert location cannot overlap with any
1004
     * posting list now.
1005
     */
1006
0
    Assert(!insertstate->bounds_valid);
1007
0
    insertstate->postingoff = 0;
1008
0
    newitemoff = _bt_binsrch_insert(rel, insertstate);
1009
0
    Assert(insertstate->postingoff == 0);
1010
0
  }
1011
1012
0
  return newitemoff;
1013
0
}
1014
1015
/*
1016
 * Step right to next non-dead page, during insertion.
1017
 *
1018
 * This is a bit more complicated than moving right in a search.  We must
1019
 * write-lock the target page before releasing write lock on current page;
1020
 * else someone else's _bt_check_unique scan could fail to see our insertion.
1021
 * Write locks on intermediate dead pages won't do because we don't know when
1022
 * they will get de-linked from the tree.
1023
 *
1024
 * This is more aggressive than it needs to be for non-unique !heapkeyspace
1025
 * indexes.
1026
 */
1027
static void
1028
_bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate,
1029
        BTStack stack)
1030
0
{
1031
0
  Page    page;
1032
0
  BTPageOpaque opaque;
1033
0
  Buffer    rbuf;
1034
0
  BlockNumber rblkno;
1035
1036
0
  Assert(heaprel != NULL);
1037
0
  page = BufferGetPage(insertstate->buf);
1038
0
  opaque = BTPageGetOpaque(page);
1039
1040
0
  rbuf = InvalidBuffer;
1041
0
  rblkno = opaque->btpo_next;
1042
0
  for (;;)
1043
0
  {
1044
0
    rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
1045
0
    page = BufferGetPage(rbuf);
1046
0
    opaque = BTPageGetOpaque(page);
1047
1048
    /*
1049
     * If this page was incompletely split, finish the split now.  We do
1050
     * this while holding a lock on the left sibling, which is not good
1051
     * because finishing the split could be a fairly lengthy operation.
1052
     * But this should happen very seldom.
1053
     */
1054
0
    if (P_INCOMPLETE_SPLIT(opaque))
1055
0
    {
1056
0
      _bt_finish_split(rel, heaprel, rbuf, stack);
1057
0
      rbuf = InvalidBuffer;
1058
0
      continue;
1059
0
    }
1060
1061
0
    if (!P_IGNORE(opaque))
1062
0
      break;
1063
0
    if (P_RIGHTMOST(opaque))
1064
0
      elog(ERROR, "fell off the end of index \"%s\"",
1065
0
         RelationGetRelationName(rel));
1066
1067
0
    rblkno = opaque->btpo_next;
1068
0
  }
1069
  /* rbuf locked; unlock buf, update state for caller */
1070
0
  _bt_relbuf(rel, insertstate->buf);
1071
0
  insertstate->buf = rbuf;
1072
0
  insertstate->bounds_valid = false;
1073
0
}
1074
1075
/*----------
1076
 *  _bt_insertonpg() -- Insert a tuple on a particular page in the index.
1077
 *
1078
 *    This recursive procedure does the following things:
1079
 *
1080
 *      +  if postingoff != 0, splits existing posting list tuple
1081
 *         (since it overlaps with new 'itup' tuple).
1082
 *      +  if necessary, splits the target page, using 'itup_key' for
1083
 *         suffix truncation on leaf pages (caller passes NULL for
1084
 *         non-leaf pages).
1085
 *      +  inserts the new tuple (might be split from posting list).
1086
 *      +  if the page was split, pops the parent stack, and finds the
1087
 *         right place to insert the new child pointer (by walking
1088
 *         right using information stored in the parent stack).
1089
 *      +  invokes itself with the appropriate tuple for the right
1090
 *         child page on the parent.
1091
 *      +  updates the metapage if a true root or fast root is split.
1092
 *
1093
 *    On entry, we must have the correct buffer in which to do the
1094
 *    insertion, and the buffer must be pinned and write-locked.  On return,
1095
 *    we will have dropped both the pin and the lock on the buffer.
1096
 *
1097
 *    This routine only performs retail tuple insertions.  'itup' should
1098
 *    always be either a non-highkey leaf item, or a downlink (new high
1099
 *    key items are created indirectly, when a page is split).  When
1100
 *    inserting to a non-leaf page, 'cbuf' is the left-sibling of the page
1101
 *    we're inserting the downlink for.  This function will clear the
1102
 *    INCOMPLETE_SPLIT flag on it, and release the buffer.
1103
 *----------
1104
 */
1105
static void
1106
_bt_insertonpg(Relation rel,
1107
         Relation heaprel,
1108
         BTScanInsert itup_key,
1109
         Buffer buf,
1110
         Buffer cbuf,
1111
         BTStack stack,
1112
         IndexTuple itup,
1113
         Size itemsz,
1114
         OffsetNumber newitemoff,
1115
         int postingoff,
1116
         bool split_only_page)
1117
0
{
1118
0
  Page    page;
1119
0
  BTPageOpaque opaque;
1120
0
  bool    isleaf,
1121
0
        isroot,
1122
0
        isrightmost,
1123
0
        isonly;
1124
0
  IndexTuple  oposting = NULL;
1125
0
  IndexTuple  origitup = NULL;
1126
0
  IndexTuple  nposting = NULL;
1127
1128
0
  page = BufferGetPage(buf);
1129
0
  opaque = BTPageGetOpaque(page);
1130
0
  isleaf = P_ISLEAF(opaque);
1131
0
  isroot = P_ISROOT(opaque);
1132
0
  isrightmost = P_RIGHTMOST(opaque);
1133
0
  isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque);
1134
1135
  /* child buffer must be given iff inserting on an internal page */
1136
0
  Assert(isleaf == !BufferIsValid(cbuf));
1137
  /* tuple must have appropriate number of attributes */
1138
0
  Assert(!isleaf ||
1139
0
       BTreeTupleGetNAtts(itup, rel) ==
1140
0
       IndexRelationGetNumberOfAttributes(rel));
1141
0
  Assert(isleaf ||
1142
0
       BTreeTupleGetNAtts(itup, rel) <=
1143
0
       IndexRelationGetNumberOfKeyAttributes(rel));
1144
0
  Assert(!BTreeTupleIsPosting(itup));
1145
0
  Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz);
1146
  /* Caller must always finish incomplete split for us */
1147
0
  Assert(!P_INCOMPLETE_SPLIT(opaque));
1148
1149
  /*
1150
   * Every internal page should have exactly one negative infinity item at
1151
   * all times.  Only _bt_split() and _bt_newlevel() should add items that
1152
   * become negative infinity items through truncation, since they're the
1153
   * only routines that allocate new internal pages.
1154
   */
1155
0
  Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque));
1156
1157
  /*
1158
   * Do we need to split an existing posting list item?
1159
   */
1160
0
  if (postingoff != 0)
1161
0
  {
1162
0
    ItemId    itemid = PageGetItemId(page, newitemoff);
1163
1164
    /*
1165
     * The new tuple is a duplicate with a heap TID that falls inside the
1166
     * range of an existing posting list tuple on a leaf page.  Prepare to
1167
     * split an existing posting list.  Overwriting the posting list with
1168
     * its post-split version is treated as an extra step in either the
1169
     * insert or page split critical section.
1170
     */
1171
0
    Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage);
1172
0
    oposting = (IndexTuple) PageGetItem(page, itemid);
1173
1174
    /*
1175
     * postingoff value comes from earlier call to _bt_binsrch_posting().
1176
     * Its binary search might think that a plain tuple must be a posting
1177
     * list tuple that needs to be split.  This can happen with corruption
1178
     * involving an existing plain tuple that is a duplicate of the new
1179
     * item, up to and including its table TID.  Check for that here in
1180
     * passing.
1181
     *
1182
     * Also verify that our caller has made sure that the existing posting
1183
     * list tuple does not have its LP_DEAD bit set.
1184
     */
1185
0
    if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid))
1186
0
      ereport(ERROR,
1187
0
          (errcode(ERRCODE_INDEX_CORRUPTED),
1188
0
           errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"",
1189
0
                   ItemPointerGetBlockNumber(&itup->t_tid),
1190
0
                   ItemPointerGetOffsetNumber(&itup->t_tid),
1191
0
                   newitemoff, BufferGetBlockNumber(buf),
1192
0
                   RelationGetRelationName(rel))));
1193
1194
    /* use a mutable copy of itup as our itup from here on */
1195
0
    origitup = itup;
1196
0
    itup = CopyIndexTuple(origitup);
1197
0
    nposting = _bt_swap_posting(itup, oposting, postingoff);
1198
    /* itup now contains rightmost/max TID from oposting */
1199
1200
    /* Alter offset so that newitem goes after posting list */
1201
0
    newitemoff = OffsetNumberNext(newitemoff);
1202
0
  }
1203
1204
  /*
1205
   * Do we need to split the page to fit the item on it?
1206
   *
1207
   * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
1208
   * so this comparison is correct even though we appear to be accounting
1209
   * only for the item and not for its line pointer.
1210
   */
1211
0
  if (PageGetFreeSpace(page) < itemsz)
1212
0
  {
1213
0
    Buffer    rbuf;
1214
1215
0
    Assert(!split_only_page);
1216
1217
    /* split the buffer into left and right halves */
1218
0
    rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz,
1219
0
             itup, origitup, nposting, postingoff);
1220
0
    PredicateLockPageSplit(rel,
1221
0
                 BufferGetBlockNumber(buf),
1222
0
                 BufferGetBlockNumber(rbuf));
1223
1224
    /*----------
1225
     * By here,
1226
     *
1227
     *    +  our target page has been split;
1228
     *    +  the original tuple has been inserted;
1229
     *    +  we have write locks on both the old (left half)
1230
     *       and new (right half) buffers, after the split; and
1231
     *    +  we know the key we want to insert into the parent
1232
     *       (it's the "high key" on the left child page).
1233
     *
1234
     * We're ready to do the parent insertion.  We need to hold onto the
1235
     * locks for the child pages until we locate the parent, but we can
1236
     * at least release the lock on the right child before doing the
1237
     * actual insertion.  The lock on the left child will be released
1238
     * last of all by parent insertion, where it is the 'cbuf' of parent
1239
     * page.
1240
     *----------
1241
     */
1242
0
    _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly);
1243
0
  }
1244
0
  else
1245
0
  {
1246
0
    Buffer    metabuf = InvalidBuffer;
1247
0
    Page    metapg = NULL;
1248
0
    BTMetaPageData *metad = NULL;
1249
0
    BlockNumber blockcache;
1250
1251
    /*
1252
     * If we are doing this insert because we split a page that was the
1253
     * only one on its tree level, but was not the root, it may have been
1254
     * the "fast root".  We need to ensure that the fast root link points
1255
     * at or above the current page.  We can safely acquire a lock on the
1256
     * metapage here --- see comments for _bt_newlevel().
1257
     */
1258
0
    if (unlikely(split_only_page))
1259
0
    {
1260
0
      Assert(!isleaf);
1261
0
      Assert(BufferIsValid(cbuf));
1262
1263
0
      metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
1264
0
      metapg = BufferGetPage(metabuf);
1265
0
      metad = BTPageGetMeta(metapg);
1266
1267
0
      if (metad->btm_fastlevel >= opaque->btpo_level)
1268
0
      {
1269
        /* no update wanted */
1270
0
        _bt_relbuf(rel, metabuf);
1271
0
        metabuf = InvalidBuffer;
1272
0
      }
1273
0
    }
1274
1275
    /* Do the update.  No ereport(ERROR) until changes are logged */
1276
0
    START_CRIT_SECTION();
1277
1278
0
    if (postingoff != 0)
1279
0
      memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting)));
1280
1281
0
    if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false,
1282
0
            false) == InvalidOffsetNumber)
1283
0
      elog(PANIC, "failed to add new item to block %u in index \"%s\"",
1284
0
         BufferGetBlockNumber(buf), RelationGetRelationName(rel));
1285
1286
0
    MarkBufferDirty(buf);
1287
1288
0
    if (BufferIsValid(metabuf))
1289
0
    {
1290
      /* upgrade meta-page if needed */
1291
0
      if (metad->btm_version < BTREE_NOVAC_VERSION)
1292
0
        _bt_upgrademetapage(metapg);
1293
0
      metad->btm_fastroot = BufferGetBlockNumber(buf);
1294
0
      metad->btm_fastlevel = opaque->btpo_level;
1295
0
      MarkBufferDirty(metabuf);
1296
0
    }
1297
1298
    /*
1299
     * Clear INCOMPLETE_SPLIT flag on child if inserting the new item
1300
     * finishes a split
1301
     */
1302
0
    if (!isleaf)
1303
0
    {
1304
0
      Page    cpage = BufferGetPage(cbuf);
1305
0
      BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1306
1307
0
      Assert(P_INCOMPLETE_SPLIT(cpageop));
1308
0
      cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1309
0
      MarkBufferDirty(cbuf);
1310
0
    }
1311
1312
    /* XLOG stuff */
1313
0
    if (RelationNeedsWAL(rel))
1314
0
    {
1315
0
      xl_btree_insert xlrec;
1316
0
      xl_btree_metadata xlmeta;
1317
0
      uint8   xlinfo;
1318
0
      XLogRecPtr  recptr;
1319
0
      uint16    upostingoff;
1320
1321
0
      xlrec.offnum = newitemoff;
1322
1323
0
      XLogBeginInsert();
1324
0
      XLogRegisterData(&xlrec, SizeOfBtreeInsert);
1325
1326
0
      if (isleaf && postingoff == 0)
1327
0
      {
1328
        /* Simple leaf insert */
1329
0
        xlinfo = XLOG_BTREE_INSERT_LEAF;
1330
0
      }
1331
0
      else if (postingoff != 0)
1332
0
      {
1333
        /*
1334
         * Leaf insert with posting list split.  Must include
1335
         * postingoff field before newitem/orignewitem.
1336
         */
1337
0
        Assert(isleaf);
1338
0
        xlinfo = XLOG_BTREE_INSERT_POST;
1339
0
      }
1340
0
      else
1341
0
      {
1342
        /* Internal page insert, which finishes a split on cbuf */
1343
0
        xlinfo = XLOG_BTREE_INSERT_UPPER;
1344
0
        XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
1345
1346
0
        if (BufferIsValid(metabuf))
1347
0
        {
1348
          /* Actually, it's an internal page insert + meta update */
1349
0
          xlinfo = XLOG_BTREE_INSERT_META;
1350
1351
0
          Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
1352
0
          xlmeta.version = metad->btm_version;
1353
0
          xlmeta.root = metad->btm_root;
1354
0
          xlmeta.level = metad->btm_level;
1355
0
          xlmeta.fastroot = metad->btm_fastroot;
1356
0
          xlmeta.fastlevel = metad->btm_fastlevel;
1357
0
          xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
1358
0
          xlmeta.allequalimage = metad->btm_allequalimage;
1359
1360
0
          XLogRegisterBuffer(2, metabuf,
1361
0
                     REGBUF_WILL_INIT | REGBUF_STANDARD);
1362
0
          XLogRegisterBufData(2, &xlmeta,
1363
0
                    sizeof(xl_btree_metadata));
1364
0
        }
1365
0
      }
1366
1367
0
      XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1368
0
      if (postingoff == 0)
1369
0
      {
1370
        /* Just log itup from caller */
1371
0
        XLogRegisterBufData(0, itup, IndexTupleSize(itup));
1372
0
      }
1373
0
      else
1374
0
      {
1375
        /*
1376
         * Insert with posting list split (XLOG_BTREE_INSERT_POST
1377
         * record) case.
1378
         *
1379
         * Log postingoff.  Also log origitup, not itup.  REDO routine
1380
         * must reconstruct final itup (as well as nposting) using
1381
         * _bt_swap_posting().
1382
         */
1383
0
        upostingoff = postingoff;
1384
1385
0
        XLogRegisterBufData(0, &upostingoff, sizeof(uint16));
1386
0
        XLogRegisterBufData(0, origitup,
1387
0
                  IndexTupleSize(origitup));
1388
0
      }
1389
1390
0
      recptr = XLogInsert(RM_BTREE_ID, xlinfo);
1391
1392
0
      if (BufferIsValid(metabuf))
1393
0
        PageSetLSN(metapg, recptr);
1394
0
      if (!isleaf)
1395
0
        PageSetLSN(BufferGetPage(cbuf), recptr);
1396
1397
0
      PageSetLSN(page, recptr);
1398
0
    }
1399
1400
0
    END_CRIT_SECTION();
1401
1402
    /* Release subsidiary buffers */
1403
0
    if (BufferIsValid(metabuf))
1404
0
      _bt_relbuf(rel, metabuf);
1405
0
    if (!isleaf)
1406
0
      _bt_relbuf(rel, cbuf);
1407
1408
    /*
1409
     * Cache the block number if this is the rightmost leaf page.  Cache
1410
     * may be used by a future inserter within _bt_search_insert().
1411
     */
1412
0
    blockcache = InvalidBlockNumber;
1413
0
    if (isrightmost && isleaf && !isroot)
1414
0
      blockcache = BufferGetBlockNumber(buf);
1415
1416
    /* Release buffer for insertion target block */
1417
0
    _bt_relbuf(rel, buf);
1418
1419
    /*
1420
     * If we decided to cache the insertion target block before releasing
1421
     * its buffer lock, then cache it now.  Check the height of the tree
1422
     * first, though.  We don't go for the optimization with small
1423
     * indexes.  Defer final check to this point to ensure that we don't
1424
     * call _bt_getrootheight while holding a buffer lock.
1425
     */
1426
0
    if (BlockNumberIsValid(blockcache) &&
1427
0
      _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
1428
0
      RelationSetTargetBlock(rel, blockcache);
1429
0
  }
1430
1431
  /* be tidy */
1432
0
  if (postingoff != 0)
1433
0
  {
1434
    /* itup is actually a modified copy of caller's original */
1435
0
    pfree(nposting);
1436
0
    pfree(itup);
1437
0
  }
1438
0
}
1439
1440
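
The fastpath caching rule at the end of _bt_insertonpg() above is small enough to model on its own. The following is a minimal standalone C sketch, not the nbtree code itself: the names maybe_cache_rightmost_leaf, cached_target, INVALID_BLOCK and FASTPATH_MIN_LEVEL are hypothetical stand-ins for the relcache's RelationSetTargetBlock machinery and BTREE_FASTPATH_MIN_LEVEL. It shows the decision only: a rightmost, non-root leaf qualifies, and the tree-height check is deferred until after the buffer lock has been released.

#include <stdbool.h>
#include <stdint.h>

typedef uint32_t BlockNumber;
#define INVALID_BLOCK ((BlockNumber) 0xFFFFFFFF)
#define FASTPATH_MIN_LEVEL 2    /* mirrors BTREE_FASTPATH_MIN_LEVEL */

/* hypothetical stand-in for the relcache target-block cache */
static BlockNumber cached_target = INVALID_BLOCK;

/*
 * Decide whether to remember blkno as a future insertion target.  The
 * root_height argument is consulted only here, after all buffer locks
 * have been released, mirroring the deferred _bt_getrootheight() call.
 */
static void
maybe_cache_rightmost_leaf(bool isrightmost, bool isleaf, bool isroot,
                           BlockNumber blkno, int root_height)
{
    BlockNumber blockcache = INVALID_BLOCK;

    if (isrightmost && isleaf && !isroot)
        blockcache = blkno;

    /* small trees don't benefit from the fastpath */
    if (blockcache != INVALID_BLOCK && root_height >= FASTPATH_MIN_LEVEL)
        cached_target = blockcache;
}
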
/*
1441
 *  _bt_split() -- split a page in the btree.
1442
 *
1443
 *    On entry, buf is the page to split, and is pinned and write-locked.
1444
 *    newitemoff etc. tell us about the new item that must be inserted
1445
 *    along with the data from the original page.
1446
 *
1447
 *    itup_key is used for suffix truncation on leaf pages (internal
1448
 *    page callers pass NULL).  When splitting a non-leaf page, 'cbuf'
1449
 *    is the left-sibling of the page we're inserting the downlink for.
1450
 *    This function will clear the INCOMPLETE_SPLIT flag on it, and
1451
 *    release the buffer.
1452
 *
1453
 *    orignewitem, nposting, and postingoff are needed when an insert of
1454
 *    orignewitem results in both a posting list split and a page split.
1455
 *    These extra posting list split details are used here in the same
1456
 *    way as they are used in the more common case where a posting list
1457
 *    split does not coincide with a page split.  We need to deal with
1458
 *    posting list splits directly in order to ensure that everything
1459
 *    that follows from the insert of orignewitem is handled as a single
1460
 *    atomic operation (though caller's insert of a new pivot/downlink
1461
 *    into parent page will still be a separate operation).  See
1462
 *    nbtree/README for details on the design of posting list splits.
1463
 *
1464
 *    Returns the new right sibling of buf, pinned and write-locked.
1465
 *    The pin and lock on buf are maintained.
1466
 */
1467
static Buffer
1468
_bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf,
1469
      Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
1470
      IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff)
1471
0
{
1472
0
  Buffer    rbuf;
1473
0
  Page    origpage;
1474
0
  Page    leftpage,
1475
0
        rightpage;
1476
0
  PGAlignedBlock leftpage_buf,
1477
0
        rightpage_buf;
1478
0
  BlockNumber origpagenumber,
1479
0
        rightpagenumber;
1480
0
  BTPageOpaque ropaque,
1481
0
        lopaque,
1482
0
        oopaque;
1483
0
  Buffer    sbuf = InvalidBuffer;
1484
0
  Page    spage = NULL;
1485
0
  BTPageOpaque sopaque = NULL;
1486
0
  Size    itemsz;
1487
0
  ItemId    itemid;
1488
0
  IndexTuple  firstright,
1489
0
        lefthighkey;
1490
0
  OffsetNumber firstrightoff;
1491
0
  OffsetNumber afterleftoff,
1492
0
        afterrightoff,
1493
0
        minusinfoff;
1494
0
  OffsetNumber origpagepostingoff;
1495
0
  OffsetNumber maxoff;
1496
0
  OffsetNumber i;
1497
0
  bool    newitemonleft,
1498
0
        isleaf,
1499
0
        isrightmost;
1500
1501
  /*
1502
   * origpage is the original page to be split.  leftpage is a temporary
1503
   * buffer that receives the left-sibling data, which will be copied back
1504
   * into origpage on success.  rightpage is the new page that will receive
1505
   * the right-sibling data.
1506
   *
1507
   * leftpage is allocated after choosing a split point.  rightpage's new
1508
   * buffer isn't acquired until after leftpage is initialized and has new
1509
   * high key, the last point where splitting the page may fail (barring
1510
   * corruption).  Failing before acquiring new buffer won't have lasting
1511
   * consequences, since origpage won't have been modified and leftpage is
1512
   * only workspace.
1513
   */
1514
0
  origpage = BufferGetPage(buf);
1515
0
  oopaque = BTPageGetOpaque(origpage);
1516
0
  isleaf = P_ISLEAF(oopaque);
1517
0
  isrightmost = P_RIGHTMOST(oopaque);
1518
0
  maxoff = PageGetMaxOffsetNumber(origpage);
1519
0
  origpagenumber = BufferGetBlockNumber(buf);
1520
1521
  /*
1522
   * Choose a point to split origpage at.
1523
   *
1524
   * A split point can be thought of as a point _between_ two existing data
1525
   * items on origpage (the lastleft and firstright tuples), provided you
1526
   * pretend that the new item that didn't fit is already on origpage.
1527
   *
1528
   * Since origpage does not actually contain newitem, the representation of
1529
   * split points needs to work with two boundary cases: splits where
1530
   * newitem is lastleft, and splits where newitem is firstright.
1531
   * newitemonleft resolves the ambiguity that would otherwise exist when
1532
   * newitemoff == firstrightoff.  In all other cases it's clear which side
1533
   * of the split every tuple goes on from context.  newitemonleft is
1534
   * usually (but not always) redundant information.
1535
   *
1536
   * firstrightoff is supposed to be an origpage offset number, but it's
1537
   * possible that its value will be maxoff+1, which is "past the end" of
1538
   * origpage.  This happens in the rare case where newitem goes after all
1539
   * existing items (i.e. newitemoff is maxoff+1) and we end up splitting
1540
   * origpage at the point that leaves newitem alone on new right page.  Any
1541
   * "!newitemonleft && newitemoff == firstrightoff" split point makes
1542
   * newitem the firstright tuple, though, so this case isn't a special
1543
   * case.
1544
   */
1545
0
  firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
1546
0
                   newitem, &newitemonleft);
1547
1548
  /* Use temporary buffer for leftpage */
1549
0
  leftpage = leftpage_buf.data;
1550
0
  _bt_pageinit(leftpage, BufferGetPageSize(buf));
1551
0
  lopaque = BTPageGetOpaque(leftpage);
1552
1553
  /*
1554
   * leftpage won't be the root when we're done.  Also, clear the SPLIT_END
1555
   * and HAS_GARBAGE flags.
1556
   */
1557
0
  lopaque->btpo_flags = oopaque->btpo_flags;
1558
0
  lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1559
  /* set flag in leftpage indicating that rightpage has no downlink yet */
1560
0
  lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
1561
0
  lopaque->btpo_prev = oopaque->btpo_prev;
1562
  /* handle btpo_next after rightpage buffer acquired */
1563
0
  lopaque->btpo_level = oopaque->btpo_level;
1564
  /* handle btpo_cycleid after rightpage buffer acquired */
1565
1566
  /*
1567
   * Copy the original page's LSN into leftpage, which will become the
1568
   * updated version of the page.  We need this because XLogInsert will
1569
   * examine the LSN and possibly dump it in a page image.
1570
   */
1571
0
  PageSetLSN(leftpage, PageGetLSN(origpage));
1572
1573
  /*
1574
   * Determine page offset number of existing overlapped-with-orignewitem
1575
   * posting list when it is necessary to perform a posting list split in
1576
   * passing.  Note that newitem was already changed by caller (newitem no
1577
   * longer has the orignewitem TID).
1578
   *
1579
   * This page offset number (origpagepostingoff) will be used to pretend
1580
   * that the posting split has already taken place, even though the
1581
   * required modifications to origpage won't occur until we reach the
1582
   * critical section.  The lastleft and firstright tuples of our page split
1583
   * point should, in effect, come from an imaginary version of origpage
1584
   * that has the nposting tuple instead of the original posting list tuple.
1585
   *
1586
   * Note: _bt_findsplitloc() should have compensated for coinciding posting
1587
   * list splits in just the same way, at least in theory.  It doesn't
1588
   * bother with that, though.  In practice it won't affect its choice of
1589
   * split point.
1590
   */
1591
0
  origpagepostingoff = InvalidOffsetNumber;
1592
0
  if (postingoff != 0)
1593
0
  {
1594
0
    Assert(isleaf);
1595
0
    Assert(ItemPointerCompare(&orignewitem->t_tid,
1596
0
                  &newitem->t_tid) < 0);
1597
0
    Assert(BTreeTupleIsPosting(nposting));
1598
0
    origpagepostingoff = OffsetNumberPrev(newitemoff);
1599
0
  }
1600
1601
  /*
1602
   * The high key for the new left page is a possibly-truncated copy of
1603
   * firstright on the leaf level (it's "firstright itself" on internal
1604
   * pages; see !isleaf comments below).  This may seem to be contrary to
1605
   * Lehman & Yao's approach of using a copy of lastleft as the new high key
1606
   * when splitting on the leaf level.  It isn't, though.
1607
   *
1608
   * Suffix truncation will leave the left page's high key fully equal to
1609
   * lastleft when lastleft and firstright are equal prior to heap TID (that
1610
   * is, the tiebreaker TID value comes from lastleft).  It isn't actually
1611
   * necessary for a new leaf high key to be a copy of lastleft for the L&Y
1612
   * "subtree" invariant to hold.  It's sufficient to make sure that the new
1613
   * leaf high key is strictly less than firstright, and greater than or
1614
   * equal to (not necessarily equal to) lastleft.  In other words, when
1615
   * suffix truncation isn't possible during a leaf page split, we take
1616
   * L&Y's exact approach to generating a new high key for the left page.
1617
   * (Actually, that is slightly inaccurate.  We don't just use a copy of
1618
   * lastleft.  A tuple with all the keys from firstright but the max heap
1619
   * TID from lastleft is used, to avoid introducing a special case.)
1620
   */
1621
0
  if (!newitemonleft && newitemoff == firstrightoff)
1622
0
  {
1623
    /* incoming tuple becomes firstright */
1624
0
    itemsz = newitemsz;
1625
0
    firstright = newitem;
1626
0
  }
1627
0
  else
1628
0
  {
1629
    /* existing item at firstrightoff becomes firstright */
1630
0
    itemid = PageGetItemId(origpage, firstrightoff);
1631
0
    itemsz = ItemIdGetLength(itemid);
1632
0
    firstright = (IndexTuple) PageGetItem(origpage, itemid);
1633
0
    if (firstrightoff == origpagepostingoff)
1634
0
      firstright = nposting;
1635
0
  }
1636
1637
0
  if (isleaf)
1638
0
  {
1639
0
    IndexTuple  lastleft;
1640
1641
    /* Attempt suffix truncation for leaf page splits */
1642
0
    if (newitemonleft && newitemoff == firstrightoff)
1643
0
    {
1644
      /* incoming tuple becomes lastleft */
1645
0
      lastleft = newitem;
1646
0
    }
1647
0
    else
1648
0
    {
1649
0
      OffsetNumber lastleftoff;
1650
1651
      /* existing item before firstrightoff becomes lastleft */
1652
0
      lastleftoff = OffsetNumberPrev(firstrightoff);
1653
0
      Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque));
1654
0
      itemid = PageGetItemId(origpage, lastleftoff);
1655
0
      lastleft = (IndexTuple) PageGetItem(origpage, itemid);
1656
0
      if (lastleftoff == origpagepostingoff)
1657
0
        lastleft = nposting;
1658
0
    }
1659
1660
0
    lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key);
1661
0
    itemsz = IndexTupleSize(lefthighkey);
1662
0
  }
1663
0
  else
1664
0
  {
1665
    /*
1666
     * Don't perform suffix truncation on a copy of firstright to make
1667
     * left page high key for internal page splits.  Must use firstright
1668
     * as new high key directly.
1669
     *
1670
     * Each distinct separator key value originates as a leaf level high
1671
     * key; all other separator keys/pivot tuples are copied from one
1672
     * level down.  A separator key in a grandparent page must be
1673
     * identical to high key in rightmost parent page of the subtree to
1674
     * its left, which must itself be identical to high key in rightmost
1675
     * child page of that same subtree (this even applies to separator
1676
     * from grandparent's high key).  There must always be an unbroken
1677
     * "seam" of identical separator keys that guide index scans at every
1678
     * level, starting from the grandparent.  That's why suffix truncation
1679
     * is unsafe here.
1680
     *
1681
     * Internal page splits will truncate firstright into a "negative
1682
     * infinity" data item when it gets inserted on the new right page
1683
     * below, though.  This happens during the call to _bt_pgaddtup() for
1684
     * the new first data item for right page.  Do not confuse this
1685
     * mechanism with suffix truncation.  It is just a convenient way of
1686
     * implementing page splits that split the internal page "inside"
1687
     * firstright.  The lefthighkey separator key cannot appear a second
1688
     * time in the right page (only firstright's downlink goes in right
1689
     * page).
1690
     */
1691
0
    lefthighkey = firstright;
1692
0
  }
1693
1694
  /*
1695
   * Add new high key to leftpage
1696
   */
1697
0
  afterleftoff = P_HIKEY;
1698
1699
0
  Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0);
1700
0
  Assert(BTreeTupleGetNAtts(lefthighkey, rel) <=
1701
0
       IndexRelationGetNumberOfKeyAttributes(rel));
1702
0
  Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey)));
1703
0
  if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false,
1704
0
          false) == InvalidOffsetNumber)
1705
0
    elog(ERROR, "failed to add high key to the left sibling"
1706
0
       " while splitting block %u of index \"%s\"",
1707
0
       origpagenumber, RelationGetRelationName(rel));
1708
0
  afterleftoff = OffsetNumberNext(afterleftoff);
1709
1710
  /*
1711
   * Acquire a new right page to split into, now that left page has a new
1712
   * high key.
1713
   *
1714
   * To not confuse future VACUUM operations, we zero the right page and
1715
   * work on an in-memory copy of it before writing WAL, then copy its
1716
   * contents back to the actual page once we start the critical section
1717
   * work.  This simplifies the split work, so that there is no need to zero
1718
   * the right page before throwing an error.
1719
   */
1720
0
  rbuf = _bt_allocbuf(rel, heaprel);
1721
0
  rightpage = rightpage_buf.data;
1722
1723
  /*
1724
   * Copy the contents of the right page into its temporary location, and
1725
   * zero the original space.
1726
   */
1727
0
  memcpy(rightpage, BufferGetPage(rbuf), BLCKSZ);
1728
0
  memset(BufferGetPage(rbuf), 0, BLCKSZ);
1729
0
  rightpagenumber = BufferGetBlockNumber(rbuf);
1730
  /* rightpage was initialized by _bt_allocbuf */
1731
0
  ropaque = BTPageGetOpaque(rightpage);
1732
1733
  /*
1734
   * Finish off remaining leftpage special area fields.  They cannot be set
1735
   * before both origpage (leftpage) and rightpage buffers are acquired and
1736
   * locked.
1737
   *
1738
   * btpo_cycleid is only used with leaf pages, though we set it here in all
1739
   * cases just to be consistent.
1740
   */
1741
0
  lopaque->btpo_next = rightpagenumber;
1742
0
  lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
1743
1744
  /*
1745
   * rightpage won't be the root when we're done.  Also, clear the SPLIT_END
1746
   * and HAS_GARBAGE flags.
1747
   */
1748
0
  ropaque->btpo_flags = oopaque->btpo_flags;
1749
0
  ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
1750
0
  ropaque->btpo_prev = origpagenumber;
1751
0
  ropaque->btpo_next = oopaque->btpo_next;
1752
0
  ropaque->btpo_level = oopaque->btpo_level;
1753
0
  ropaque->btpo_cycleid = lopaque->btpo_cycleid;
1754
1755
  /*
1756
   * Add new high key to rightpage where necessary.
1757
   *
1758
   * If the page we're splitting is not the rightmost page at its level in
1759
   * the tree, then the first entry on the page is the high key from
1760
   * origpage.
1761
   */
1762
0
  afterrightoff = P_HIKEY;
1763
1764
0
  if (!isrightmost)
1765
0
  {
1766
0
    IndexTuple  righthighkey;
1767
1768
0
    itemid = PageGetItemId(origpage, P_HIKEY);
1769
0
    itemsz = ItemIdGetLength(itemid);
1770
0
    righthighkey = (IndexTuple) PageGetItem(origpage, itemid);
1771
0
    Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0);
1772
0
    Assert(BTreeTupleGetNAtts(righthighkey, rel) <=
1773
0
         IndexRelationGetNumberOfKeyAttributes(rel));
1774
0
    if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff,
1775
0
            false, false) == InvalidOffsetNumber)
1776
0
    {
1777
0
      elog(ERROR, "failed to add high key to the right sibling"
1778
0
         " while splitting block %u of index \"%s\"",
1779
0
         origpagenumber, RelationGetRelationName(rel));
1780
0
    }
1781
0
    afterrightoff = OffsetNumberNext(afterrightoff);
1782
0
  }
1783
1784
  /*
1785
   * Internal page splits truncate first data item on right page -- it
1786
   * becomes "minus infinity" item for the page.  Set this up here.
1787
   */
1788
0
  minusinfoff = InvalidOffsetNumber;
1789
0
  if (!isleaf)
1790
0
    minusinfoff = afterrightoff;
1791
1792
  /*
1793
   * Now transfer all the data items (non-pivot tuples in isleaf case, or
1794
   * additional pivot tuples in !isleaf case) to the appropriate page.
1795
   *
1796
   * Note: we *must* insert at least the right page's items in item-number
1797
   * order, for the benefit of _bt_restore_page().
1798
   */
1799
0
  for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i))
1800
0
  {
1801
0
    IndexTuple  dataitem;
1802
1803
0
    itemid = PageGetItemId(origpage, i);
1804
0
    itemsz = ItemIdGetLength(itemid);
1805
0
    dataitem = (IndexTuple) PageGetItem(origpage, itemid);
1806
1807
    /* replace original item with nposting due to posting split? */
1808
0
    if (i == origpagepostingoff)
1809
0
    {
1810
0
      Assert(BTreeTupleIsPosting(dataitem));
1811
0
      Assert(itemsz == MAXALIGN(IndexTupleSize(nposting)));
1812
0
      dataitem = nposting;
1813
0
    }
1814
1815
    /* does new item belong before this one? */
1816
0
    else if (i == newitemoff)
1817
0
    {
1818
0
      if (newitemonleft)
1819
0
      {
1820
0
        Assert(newitemoff <= firstrightoff);
1821
0
        if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff,
1822
0
                  false))
1823
0
        {
1824
0
          elog(ERROR, "failed to add new item to the left sibling"
1825
0
             " while splitting block %u of index \"%s\"",
1826
0
             origpagenumber, RelationGetRelationName(rel));
1827
0
        }
1828
0
        afterleftoff = OffsetNumberNext(afterleftoff);
1829
0
      }
1830
0
      else
1831
0
      {
1832
0
        Assert(newitemoff >= firstrightoff);
1833
0
        if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1834
0
                  afterrightoff == minusinfoff))
1835
0
        {
1836
0
          elog(ERROR, "failed to add new item to the right sibling"
1837
0
             " while splitting block %u of index \"%s\"",
1838
0
             origpagenumber, RelationGetRelationName(rel));
1839
0
        }
1840
0
        afterrightoff = OffsetNumberNext(afterrightoff);
1841
0
      }
1842
0
    }
1843
1844
    /* decide which page to put it on */
1845
0
    if (i < firstrightoff)
1846
0
    {
1847
0
      if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false))
1848
0
      {
1849
0
        elog(ERROR, "failed to add old item to the left sibling"
1850
0
           " while splitting block %u of index \"%s\"",
1851
0
           origpagenumber, RelationGetRelationName(rel));
1852
0
      }
1853
0
      afterleftoff = OffsetNumberNext(afterleftoff);
1854
0
    }
1855
0
    else
1856
0
    {
1857
0
      if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff,
1858
0
                afterrightoff == minusinfoff))
1859
0
      {
1860
0
        elog(ERROR, "failed to add old item to the right sibling"
1861
0
           " while splitting block %u of index \"%s\"",
1862
0
           origpagenumber, RelationGetRelationName(rel));
1863
0
      }
1864
0
      afterrightoff = OffsetNumberNext(afterrightoff);
1865
0
    }
1866
0
  }
1867
1868
  /* Handle case where newitem goes at the end of rightpage */
1869
0
  if (i <= newitemoff)
1870
0
  {
1871
    /*
1872
     * Can't have newitemonleft here; that would imply we were told to put
1873
     * *everything* on the left page, which cannot fit (if it could, we'd
1874
     * not be splitting the page).
1875
     */
1876
0
    Assert(!newitemonleft && newitemoff == maxoff + 1);
1877
0
    if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff,
1878
0
              afterrightoff == minusinfoff))
1879
0
    {
1880
0
      elog(ERROR, "failed to add new item to the right sibling"
1881
0
         " while splitting block %u of index \"%s\"",
1882
0
         origpagenumber, RelationGetRelationName(rel));
1883
0
    }
1884
0
    afterrightoff = OffsetNumberNext(afterrightoff);
1885
0
  }
1886
1887
  /*
1888
   * We have to grab the original right sibling (if any) and update its prev
1889
   * link.  We are guaranteed that this is deadlock-free, since we couple
1890
   * the locks in the standard order: left to right.
1891
   */
1892
0
  if (!isrightmost)
1893
0
  {
1894
0
    sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE);
1895
0
    spage = BufferGetPage(sbuf);
1896
0
    sopaque = BTPageGetOpaque(spage);
1897
0
    if (sopaque->btpo_prev != origpagenumber)
1898
0
    {
1899
0
      ereport(ERROR,
1900
0
          (errcode(ERRCODE_INDEX_CORRUPTED),
1901
0
           errmsg_internal("right sibling's left-link doesn't match: "
1902
0
                   "block %u links to %u instead of expected %u in index \"%s\"",
1903
0
                   oopaque->btpo_next, sopaque->btpo_prev, origpagenumber,
1904
0
                   RelationGetRelationName(rel))));
1905
0
    }
1906
1907
    /*
1908
     * Check to see if we can set the SPLIT_END flag in the right-hand
1909
     * split page; this can save some I/O for vacuum since it need not
1910
     * proceed to the right sibling.  We can set the flag if the right
1911
     * sibling has a different cycleid: that means it could not be part of
1912
     * a group of pages that were all split off from the same ancestor
1913
     * page.  If you're confused, imagine that page A splits to A B and
1914
     * then again, yielding A C B, while vacuum is in progress.  Tuples
1915
     * originally in A could now be in either B or C, hence vacuum must
1916
     * examine both pages.  But if D, our right sibling, has a different
1917
     * cycleid then it could not contain any tuples that were in A when
1918
     * the vacuum started.
1919
     */
1920
0
    if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
1921
0
      ropaque->btpo_flags |= BTP_SPLIT_END;
1922
0
  }
1923
1924
  /*
1925
   * Right sibling is locked, new siblings are prepared, but original page
1926
   * is not updated yet.
1927
   *
1928
   * NO EREPORT(ERROR) till right sibling is updated.  We can get away with
1929
   * not starting the critical section till here because we haven't been
1930
   * scribbling on the original page yet; see comments above.
1931
   */
1932
0
  START_CRIT_SECTION();
1933
1934
  /*
1935
   * By here, the original data page has been split into two new halves, and
1936
   * these are correct.  The algorithm requires that the left page never
1937
   * move during a split, so we copy the new left page back on top of the
1938
   * original.  We need to do this before writing the WAL record, so that
1939
   * XLogInsert can WAL log an image of the page if necessary.
1940
   */
1941
0
  memcpy(origpage, leftpage, BLCKSZ);
1942
  /* leftpage, lopaque must not be used below here */
1943
1944
  /*
1945
   * Move the contents of the right page from its temporary location to the
1946
   * destination buffer, before writing the WAL record.  Unlike the left
1947
   * page, the right page and its opaque area are still needed to complete
1948
   * the update of the page, so reinitialize them.
1949
   */
1950
0
  rightpage = BufferGetPage(rbuf);
1951
0
  memcpy(rightpage, rightpage_buf.data, BLCKSZ);
1952
0
  ropaque = BTPageGetOpaque(rightpage);
1953
1954
0
  MarkBufferDirty(buf);
1955
0
  MarkBufferDirty(rbuf);
1956
1957
0
  if (!isrightmost)
1958
0
  {
1959
0
    sopaque->btpo_prev = rightpagenumber;
1960
0
    MarkBufferDirty(sbuf);
1961
0
  }
1962
1963
  /*
1964
   * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
1965
   * a split
1966
   */
1967
0
  if (!isleaf)
1968
0
  {
1969
0
    Page    cpage = BufferGetPage(cbuf);
1970
0
    BTPageOpaque cpageop = BTPageGetOpaque(cpage);
1971
1972
0
    cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
1973
0
    MarkBufferDirty(cbuf);
1974
0
  }
1975
1976
  /* XLOG stuff */
1977
0
  if (RelationNeedsWAL(rel))
1978
0
  {
1979
0
    xl_btree_split xlrec;
1980
0
    uint8   xlinfo;
1981
0
    XLogRecPtr  recptr;
1982
1983
0
    xlrec.level = ropaque->btpo_level;
1984
    /* See comments below on newitem, orignewitem, and posting lists */
1985
0
    xlrec.firstrightoff = firstrightoff;
1986
0
    xlrec.newitemoff = newitemoff;
1987
0
    xlrec.postingoff = 0;
1988
0
    if (postingoff != 0 && origpagepostingoff < firstrightoff)
1989
0
      xlrec.postingoff = postingoff;
1990
1991
0
    XLogBeginInsert();
1992
0
    XLogRegisterData(&xlrec, SizeOfBtreeSplit);
1993
1994
0
    XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
1995
0
    XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT);
1996
    /* Log original right sibling, since we've changed its prev-pointer */
1997
0
    if (!isrightmost)
1998
0
      XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD);
1999
0
    if (!isleaf)
2000
0
      XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD);
2001
2002
    /*
2003
     * Log the new item, if it was inserted on the left page. (If it was
2004
     * put on the right page, we don't need to explicitly WAL log it
2005
     * because it's included with all the other items on the right page.)
2006
     * Show the new item as belonging to the left page buffer, so that it
2007
     * is not stored if XLogInsert decides it needs a full-page image of
2008
     * the left page.  We always store newitemoff in the record, though.
2009
     *
2010
     * The details are sometimes slightly different for page splits that
2011
     * coincide with a posting list split.  If both the replacement
2012
     * posting list and newitem go on the right page, then we don't need
2013
     * to log anything extra, just like the simple !newitemonleft
2014
     * no-posting-split case (postingoff is set to zero in the WAL record,
2015
     * so recovery doesn't need to process a posting list split at all).
2016
     * Otherwise, we set postingoff and log orignewitem instead of
2017
     * newitem, despite having actually inserted newitem.  REDO routine
2018
     * must reconstruct nposting and newitem using _bt_swap_posting().
2019
     *
2020
     * Note: It's possible that our page split point is the point that
2021
     * makes the posting list lastleft and newitem firstright.  This is
2022
     * the only case where we log orignewitem/newitem despite newitem
2023
     * going on the right page.  If XLogInsert decides that it can omit
2024
     * orignewitem due to logging a full-page image of the left page,
2025
     * everything still works out, since recovery only needs to log
2026
     * orignewitem for items on the left page (just like the regular
2027
     * newitem-logged case).
2028
     */
2029
0
    if (newitemonleft && xlrec.postingoff == 0)
2030
0
      XLogRegisterBufData(0, newitem, newitemsz);
2031
0
    else if (xlrec.postingoff != 0)
2032
0
    {
2033
0
      Assert(isleaf);
2034
0
      Assert(newitemonleft || firstrightoff == newitemoff);
2035
0
      Assert(newitemsz == IndexTupleSize(orignewitem));
2036
0
      XLogRegisterBufData(0, orignewitem, newitemsz);
2037
0
    }
2038
2039
    /* Log the left page's new high key */
2040
0
    if (!isleaf)
2041
0
    {
2042
      /* lefthighkey isn't local copy, get current pointer */
2043
0
      itemid = PageGetItemId(origpage, P_HIKEY);
2044
0
      lefthighkey = (IndexTuple) PageGetItem(origpage, itemid);
2045
0
    }
2046
0
    XLogRegisterBufData(0, lefthighkey,
2047
0
              MAXALIGN(IndexTupleSize(lefthighkey)));
2048
2049
    /*
2050
     * Log the contents of the right page in the format understood by
2051
     * _bt_restore_page().  The whole right page will be recreated.
2052
     *
2053
     * Direct access to the page is not ideal, but it is faster; we should
2054
     * implement a new function in the page API.  Note we only store the tuples
2055
     * themselves, knowing that they were inserted in item-number order
2056
     * and so the line pointers can be reconstructed.  See comments for
2057
     * _bt_restore_page().
2058
     */
2059
0
    XLogRegisterBufData(1,
2060
0
              (char *) rightpage + ((PageHeader) rightpage)->pd_upper,
2061
0
              ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper);
2062
2063
0
    xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R;
2064
0
    recptr = XLogInsert(RM_BTREE_ID, xlinfo);
2065
2066
0
    PageSetLSN(origpage, recptr);
2067
0
    PageSetLSN(rightpage, recptr);
2068
0
    if (!isrightmost)
2069
0
      PageSetLSN(spage, recptr);
2070
0
    if (!isleaf)
2071
0
      PageSetLSN(BufferGetPage(cbuf), recptr);
2072
0
  }
2073
2074
0
  END_CRIT_SECTION();
2075
2076
  /* release the old right sibling */
2077
0
  if (!isrightmost)
2078
0
    _bt_relbuf(rel, sbuf);
2079
2080
  /* release the child */
2081
0
  if (!isleaf)
2082
0
    _bt_relbuf(rel, cbuf);
2083
2084
  /* be tidy */
2085
0
  if (isleaf)
2086
0
    pfree(lefthighkey);
2087
2088
  /* split's done */
2089
0
  return rbuf;
2090
0
}
2091
2092
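
The split-point bookkeeping in _bt_split() above has one genuinely subtle spot: when newitemoff equals firstrightoff, only the newitemonleft flag says which side the incoming tuple belongs to. A minimal standalone sketch of that routing rule follows; item_goes_left is a hypothetical helper over plain offsets, not the transfer loop itself.

#include <stdbool.h>

typedef unsigned short OffsetNumber;

/*
 * Which side of the split does an item go to?  Existing items strictly
 * before firstrightoff go left, all others go right; the incoming item
 * falls back to newitemonleft when its offset coincides with
 * firstrightoff (the ambiguous boundary case described above).
 */
static bool
item_goes_left(OffsetNumber offnum, OffsetNumber newitemoff,
               OffsetNumber firstrightoff, bool newitemonleft,
               bool is_new_item)
{
    if (is_new_item)
    {
        if (newitemoff != firstrightoff)
            return newitemoff < firstrightoff;
        return newitemonleft;
    }
    return offnum < firstrightoff;
}
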
/*
2093
 * _bt_insert_parent() -- Insert downlink into parent, completing split.
2094
 *
2095
 * On entry, buf and rbuf are the left and right split pages, which we
2096
 * still hold write locks on.  Both locks will be released here.  We
2097
 * release the rbuf lock once we have a write lock on the page that we
2098
 * intend to insert a downlink to rbuf on (i.e. buf's current parent page).
2099
 * The lock on buf is released at the same point as the lock on the parent
2100
 * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same
2101
 * atomic operation that completes the split by inserting a new downlink.
2102
 *
2103
 * stack - stack showing how we got here.  Will be NULL when splitting true
2104
 *      root, or during concurrent root split, where we can be inefficient
2105
 * isroot - we split the true root
2106
 * isonly - we split a page alone on its level (might have been fast root)
2107
 */
2108
static void
2109
_bt_insert_parent(Relation rel,
2110
          Relation heaprel,
2111
          Buffer buf,
2112
          Buffer rbuf,
2113
          BTStack stack,
2114
          bool isroot,
2115
          bool isonly)
2116
0
{
2117
0
  Assert(heaprel != NULL);
2118
2119
  /*
2120
   * Here we have to do something Lehman and Yao don't talk about: deal with
2121
   * a root split and construction of a new root.  If our stack is empty
2122
   * then we have just split a node on what had been the root level when we
2123
   * descended the tree.  If it was still the root then we perform a
2124
   * new-root construction.  If it *wasn't* the root anymore, search to find
2125
   * the next higher level that someone constructed meanwhile, and find the
2126
   * right place to insert as for the normal case.
2127
   *
2128
   * If we have to search for the parent level, we do so by re-descending
2129
   * from the root.  This is not super-efficient, but it's rare enough not
2130
   * to matter.
2131
   */
2132
0
  if (isroot)
2133
0
  {
2134
0
    Buffer    rootbuf;
2135
2136
0
    Assert(stack == NULL);
2137
0
    Assert(isonly);
2138
    /* create a new root node one level up and update the metapage */
2139
0
    rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf);
2140
    /* release the split buffers */
2141
0
    _bt_relbuf(rel, rootbuf);
2142
0
    _bt_relbuf(rel, rbuf);
2143
0
    _bt_relbuf(rel, buf);
2144
0
  }
2145
0
  else
2146
0
  {
2147
0
    BlockNumber bknum = BufferGetBlockNumber(buf);
2148
0
    BlockNumber rbknum = BufferGetBlockNumber(rbuf);
2149
0
    Page    page = BufferGetPage(buf);
2150
0
    IndexTuple  new_item;
2151
0
    BTStackData fakestack;
2152
0
    IndexTuple  ritem;
2153
0
    Buffer    pbuf;
2154
2155
0
    if (stack == NULL)
2156
0
    {
2157
0
      BTPageOpaque opaque;
2158
2159
0
      elog(DEBUG2, "concurrent ROOT page split");
2160
0
      opaque = BTPageGetOpaque(page);
2161
2162
      /*
2163
       * We should never reach here when a leaf page split takes place
2164
       * despite the insert of newitem being able to apply the fastpath
2165
       * optimization.  Make sure of that with an assertion.
2166
       *
2167
       * This is more of a performance issue than a correctness issue.
2168
       * The fastpath won't have a descent stack.  Using a phony stack
2169
       * here works, but never rely on that.  The fastpath should be
2170
       * rejected within _bt_search_insert() when the rightmost leaf
2171
       * page will split, since it's faster to go through _bt_search()
2172
       * and get a stack in the usual way.
2173
       */
2174
0
      Assert(!(P_ISLEAF(opaque) &&
2175
0
           BlockNumberIsValid(RelationGetTargetBlock(rel))));
2176
2177
      /* Find the leftmost page at the next level up */
2178
0
      pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false);
2179
      /* Set up a phony stack entry pointing there */
2180
0
      stack = &fakestack;
2181
0
      stack->bts_blkno = BufferGetBlockNumber(pbuf);
2182
0
      stack->bts_offset = InvalidOffsetNumber;
2183
0
      stack->bts_parent = NULL;
2184
0
      _bt_relbuf(rel, pbuf);
2185
0
    }
2186
2187
    /* get high key from left, a strict lower bound for new right page */
2188
0
    ritem = (IndexTuple) PageGetItem(page,
2189
0
                     PageGetItemId(page, P_HIKEY));
2190
2191
    /* form an index tuple that points at the new right page */
2192
0
    new_item = CopyIndexTuple(ritem);
2193
0
    BTreeTupleSetDownLink(new_item, rbknum);
2194
2195
    /*
2196
     * Re-find and write lock the parent of buf.
2197
     *
2198
     * It's possible that the location of buf's downlink has changed since
2199
     * our initial _bt_search() descent.  _bt_getstackbuf() will detect
2200
     * and recover from this, updating the stack, which ensures that the
2201
     * new downlink will be inserted at the correct offset. Even buf's
2202
     * parent may have changed.
2203
     */
2204
0
    pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum);
2205
2206
    /*
2207
     * Unlock the right child.  The left child will be unlocked in
2208
     * _bt_insertonpg().
2209
     *
2210
     * Unlocking the right child must be delayed until here to ensure that
2211
     * no concurrent VACUUM operation can become confused.  Page deletion
2212
     * cannot be allowed to fail to re-find a downlink for the rbuf page.
2213
     * (Actually, this is just a vestige of how things used to work.  The
2214
     * page deletion code is expected to check for the INCOMPLETE_SPLIT
2215
     * flag on the left child.  It won't attempt deletion of the right
2216
     * child until the split is complete.  Despite all this, we opt to
2217
     * conservatively delay unlocking the right child until here.)
2218
     */
2219
0
    _bt_relbuf(rel, rbuf);
2220
2221
0
    if (pbuf == InvalidBuffer)
2222
0
      ereport(ERROR,
2223
0
          (errcode(ERRCODE_INDEX_CORRUPTED),
2224
0
           errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u",
2225
0
                   RelationGetRelationName(rel), bknum, rbknum)));
2226
2227
    /* Recursively insert into the parent */
2228
0
    _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent,
2229
0
             new_item, MAXALIGN(IndexTupleSize(new_item)),
2230
0
             stack->bts_offset + 1, 0, isonly);
2231
2232
    /* be tidy */
2233
0
    pfree(new_item);
2234
0
  }
2235
0
}
2236
2237
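
As _bt_insert_parent() notes above, the downlink for the new right page is simply the left page's high key with its child pointer redirected. The toy sketch below illustrates that shape under hypothetical names (ToyDownlink, make_downlink); it is not the IndexTuple representation used by nbtree.

#include <stdlib.h>
#include <string.h>

/* hypothetical pivot representation: separator key plus child block */
typedef struct ToyDownlink
{
    unsigned int child_blkno;
    char         key[32];
} ToyDownlink;

/*
 * Form the downlink inserted into the parent: copy the left page's high
 * key (a strict lower bound for every key on the new right page) and
 * point it at the right sibling's block.
 */
static ToyDownlink *
make_downlink(const char *left_highkey, unsigned int right_blkno)
{
    ToyDownlink *d = malloc(sizeof(ToyDownlink));

    if (d == NULL)
        return NULL;
    strncpy(d->key, left_highkey, sizeof(d->key) - 1);
    d->key[sizeof(d->key) - 1] = '\0';
    d->child_blkno = right_blkno;
    return d;
}
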
/*
2238
 * _bt_finish_split() -- Finish an incomplete split
2239
 *
2240
 * A crash or other failure can leave a split incomplete.  The insertion
2241
 * routines won't allow an insertion on a page that is incompletely split.
2242
 * Before inserting on such a page, call _bt_finish_split().
2243
 *
2244
 * On entry, 'lbuf' must be locked in write-mode.  On exit, it is unlocked
2245
 * and unpinned.
2246
 *
2247
 * Caller must provide a valid heaprel, since finishing a page split requires
2248
 * allocating a new page if and when the parent page splits in turn.
2249
 */
2250
void
2251
_bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack)
2252
{
2253
  Page    lpage = BufferGetPage(lbuf);
2254
  BTPageOpaque lpageop = BTPageGetOpaque(lpage);
2255
  Buffer    rbuf;
2256
  Page    rpage;
2257
  BTPageOpaque rpageop;
2258
  bool    wasroot;
2259
  bool    wasonly;
2260
2261
  Assert(P_INCOMPLETE_SPLIT(lpageop));
2262
  Assert(heaprel != NULL);
2263
2264
  /* Lock right sibling, the one missing the downlink */
2265
  rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
2266
  rpage = BufferGetPage(rbuf);
2267
  rpageop = BTPageGetOpaque(rpage);
2268
2269
  /* Could this be a root split? */
2270
  if (!stack)
2271
  {
2272
    Buffer    metabuf;
2273
    Page    metapg;
2274
    BTMetaPageData *metad;
2275
2276
    /* acquire lock on the metapage */
2277
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2278
    metapg = BufferGetPage(metabuf);
2279
    metad = BTPageGetMeta(metapg);
2280
2281
    wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf));
2282
2283
    _bt_relbuf(rel, metabuf);
2284
  }
2285
  else
2286
    wasroot = false;
2287
2288
  /* Was this the only page on the level before split? */
2289
  wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
2290
2291
  elog(DEBUG1, "finishing incomplete split of %u/%u",
2292
     BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
2293
2294
  _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly);
2295
}
2296
2297
/*
2298
 *  _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot
2299
 *             tuple whose downlink points to child page.
2300
 *
2301
 *    Caller passes child's block number, which is used to identify
2302
 *    associated pivot tuple in parent page using a linear search that
2303
 *    matches on pivot's downlink/block number.  The expected location of
2304
 *    the pivot tuple is taken from the stack one level above the child
2305
 *    page.  This is used as a starting point.  Insertions into the
2306
 *    parent level could cause the pivot tuple to move right; deletions
2307
 *    could cause it to move left, but not left of the page we previously
2308
 *    found it on.
2309
 *
2310
 *    Caller can use its stack to relocate the pivot tuple/downlink for
2311
 *    any same-level page to the right of the page found by its initial
2312
 *    descent.  This is necessary because of the possibility that caller
2313
 *    moved right to recover from a concurrent page split.  It's also
2314
 *    convenient for certain callers to be able to step right when there
2315
 *    wasn't a concurrent page split, while still using their original
2316
 *    stack.  For example, the checkingunique _bt_doinsert() case may
2317
 *    have to step right when there are many physical duplicates, and its
2318
 *    scantid forces an insertion to the right of the "first page the
2319
 *    value could be on".  (This is also relied on by all of our callers
2320
 *    when dealing with !heapkeyspace indexes.)
2321
 *
2322
 *    Returns write-locked parent page buffer, or InvalidBuffer if pivot
2323
 *    tuple not found (should not happen).  Adjusts bts_blkno &
2324
 *    bts_offset if changed.  Page split caller should insert its new
2325
 *    pivot tuple for its new right sibling page on parent page, at the
2326
 *    offset number bts_offset + 1.
2327
 */
2328
Buffer
2329
_bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)
2330
0
{
2331
0
  BlockNumber blkno;
2332
0
  OffsetNumber start;
2333
2334
0
  blkno = stack->bts_blkno;
2335
0
  start = stack->bts_offset;
2336
2337
0
  for (;;)
2338
0
  {
2339
0
    Buffer    buf;
2340
0
    Page    page;
2341
0
    BTPageOpaque opaque;
2342
2343
0
    buf = _bt_getbuf(rel, blkno, BT_WRITE);
2344
0
    page = BufferGetPage(buf);
2345
0
    opaque = BTPageGetOpaque(page);
2346
2347
0
    Assert(heaprel != NULL);
2348
0
    if (P_INCOMPLETE_SPLIT(opaque))
2349
0
    {
2350
0
      _bt_finish_split(rel, heaprel, buf, stack->bts_parent);
2351
0
      continue;
2352
0
    }
2353
2354
0
    if (!P_IGNORE(opaque))
2355
0
    {
2356
0
      OffsetNumber offnum,
2357
0
            minoff,
2358
0
            maxoff;
2359
0
      ItemId    itemid;
2360
0
      IndexTuple  item;
2361
2362
0
      minoff = P_FIRSTDATAKEY(opaque);
2363
0
      maxoff = PageGetMaxOffsetNumber(page);
2364
2365
      /*
2366
       * start = InvalidOffsetNumber means "search the whole page". We
2367
       * need this test anyway due to the possibility that the page has a high
2368
       * key now when it didn't before.
2369
       */
2370
0
      if (start < minoff)
2371
0
        start = minoff;
2372
2373
      /*
2374
       * Need this check too, to guard against the possibility that the page
2375
       * split since we visited it originally.
2376
       */
2377
0
      if (start > maxoff)
2378
0
        start = OffsetNumberNext(maxoff);
2379
2380
      /*
2381
       * These loops will check every item on the page --- but in an
2382
       * order that's attuned to the probability of where it actually
2383
       * is.  Scan to the right first, then to the left.
2384
       */
2385
0
      for (offnum = start;
2386
0
         offnum <= maxoff;
2387
0
         offnum = OffsetNumberNext(offnum))
2388
0
      {
2389
0
        itemid = PageGetItemId(page, offnum);
2390
0
        item = (IndexTuple) PageGetItem(page, itemid);
2391
2392
0
        if (BTreeTupleGetDownLink(item) == child)
2393
0
        {
2394
          /* Return accurate pointer to where link is now */
2395
0
          stack->bts_blkno = blkno;
2396
0
          stack->bts_offset = offnum;
2397
0
          return buf;
2398
0
        }
2399
0
      }
2400
2401
0
      for (offnum = OffsetNumberPrev(start);
2402
0
         offnum >= minoff;
2403
0
         offnum = OffsetNumberPrev(offnum))
2404
0
      {
2405
0
        itemid = PageGetItemId(page, offnum);
2406
0
        item = (IndexTuple) PageGetItem(page, itemid);
2407
2408
0
        if (BTreeTupleGetDownLink(item) == child)
2409
0
        {
2410
          /* Return accurate pointer to where link is now */
2411
0
          stack->bts_blkno = blkno;
2412
0
          stack->bts_offset = offnum;
2413
0
          return buf;
2414
0
        }
2415
0
      }
2416
0
    }
2417
2418
    /*
2419
     * The item we're looking for moved right at least one page.
2420
     *
2421
     * Lehman and Yao couple/chain locks when moving right here, which we
2422
     * can avoid.  See nbtree/README.
2423
     */
2424
0
    if (P_RIGHTMOST(opaque))
2425
0
    {
2426
0
      _bt_relbuf(rel, buf);
2427
0
      return InvalidBuffer;
2428
0
    }
2429
0
    blkno = opaque->btpo_next;
2430
0
    start = InvalidOffsetNumber;
2431
0
    _bt_relbuf(rel, buf);
2432
0
  }
2433
0
}
2434
2435
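
The right-then-left scan order used by _bt_getstackbuf() above is easy to model in isolation. The sketch below searches a plain array of downlink block numbers starting from a remembered offset (find_downlink is a hypothetical helper, not the page-level code): scan right first, since insertions push pivots to the right, then scan left, and report failure so the caller can step right to the next page.

#include <stddef.h>

/*
 * Return the index of child in downlinks[], searching to the right from
 * start and then to the left, or -1 if it is not on this "page" at all.
 */
static int
find_downlink(const unsigned int *downlinks, size_t n, size_t start,
              unsigned int child)
{
    size_t      i;

    if (start > n)
        start = n;              /* clamp, like the "start > maxoff" check */

    for (i = start; i < n; i++)
        if (downlinks[i] == child)
            return (int) i;

    for (i = start; i-- > 0;)
        if (downlinks[i] == child)
            return (int) i;

    return -1;
}
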
/*
2436
 *  _bt_newlevel() -- Create a new level above root page.
2437
 *
2438
 *    We've just split the old root page and need to create a new one.
2439
 *    In order to do this, we add a new root page to the file, then lock
2440
 *    the metadata page and update it.  This is guaranteed to be deadlock-
2441
 *    free, because all readers release their locks on the metadata page
2442
 *    before trying to lock the root, and all writers lock the root before
2443
 *    trying to lock the metadata page.  We have a write lock on the old
2444
 *    root page, so we have not introduced any cycles into the waits-for
2445
 *    graph.
2446
 *
2447
 *    On entry, lbuf (the old root) and rbuf (its new peer) are write-
2448
 *    locked. On exit, a new root page exists with entries for the
2449
 *    two new children, metapage is updated and unlocked/unpinned.
2450
 *    The new root buffer is returned to caller which has to unlock/unpin
2451
 *    lbuf, rbuf & rootbuf.
2452
 */
2453
static Buffer
2454
_bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf)
2455
0
{
2456
0
  Buffer    rootbuf;
2457
0
  Page    lpage,
2458
0
        rootpage;
2459
0
  BlockNumber lbkno,
2460
0
        rbkno;
2461
0
  BlockNumber rootblknum;
2462
0
  BTPageOpaque rootopaque;
2463
0
  BTPageOpaque lopaque;
2464
0
  ItemId    itemid;
2465
0
  IndexTuple  item;
2466
0
  IndexTuple  left_item;
2467
0
  Size    left_item_sz;
2468
0
  IndexTuple  right_item;
2469
0
  Size    right_item_sz;
2470
0
  Buffer    metabuf;
2471
0
  Page    metapg;
2472
0
  BTMetaPageData *metad;
2473
2474
0
  lbkno = BufferGetBlockNumber(lbuf);
2475
0
  rbkno = BufferGetBlockNumber(rbuf);
2476
0
  lpage = BufferGetPage(lbuf);
2477
0
  lopaque = BTPageGetOpaque(lpage);
2478
2479
  /* get a new root page */
2480
0
  rootbuf = _bt_allocbuf(rel, heaprel);
2481
0
  rootpage = BufferGetPage(rootbuf);
2482
0
  rootblknum = BufferGetBlockNumber(rootbuf);
2483
2484
  /* acquire lock on the metapage */
2485
0
  metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
2486
0
  metapg = BufferGetPage(metabuf);
2487
0
  metad = BTPageGetMeta(metapg);
2488
2489
  /*
2490
   * Create downlink item for left page (old root).  The key value used is
2491
   * "minus infinity", a sentinel value that's reliably less than any real
2492
   * key value that could appear in the left page.
2493
   */
2494
0
  left_item_sz = sizeof(IndexTupleData);
2495
0
  left_item = (IndexTuple) palloc(left_item_sz);
2496
0
  left_item->t_info = left_item_sz;
2497
0
  BTreeTupleSetDownLink(left_item, lbkno);
2498
0
  BTreeTupleSetNAtts(left_item, 0, false);
2499
2500
  /*
2501
   * Create downlink item for right page.  The key for it is obtained from
2502
   * the "high key" position in the left page.
2503
   */
2504
0
  itemid = PageGetItemId(lpage, P_HIKEY);
2505
0
  right_item_sz = ItemIdGetLength(itemid);
2506
0
  item = (IndexTuple) PageGetItem(lpage, itemid);
2507
0
  right_item = CopyIndexTuple(item);
2508
0
  BTreeTupleSetDownLink(right_item, rbkno);
2509
2510
  /* NO EREPORT(ERROR) from here till newroot op is logged */
2511
0
  START_CRIT_SECTION();
2512
2513
  /* upgrade metapage if needed */
2514
0
  if (metad->btm_version < BTREE_NOVAC_VERSION)
2515
0
    _bt_upgrademetapage(metapg);
2516
2517
  /* set btree special data */
2518
0
  rootopaque = BTPageGetOpaque(rootpage);
2519
0
  rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
2520
0
  rootopaque->btpo_flags = BTP_ROOT;
2521
0
  rootopaque->btpo_level =
2522
0
    (BTPageGetOpaque(lpage))->btpo_level + 1;
2523
0
  rootopaque->btpo_cycleid = 0;
2524
2525
  /* update metapage data */
2526
0
  metad->btm_root = rootblknum;
2527
0
  metad->btm_level = rootopaque->btpo_level;
2528
0
  metad->btm_fastroot = rootblknum;
2529
0
  metad->btm_fastlevel = rootopaque->btpo_level;
2530
2531
  /*
2532
   * Insert the left page pointer into the new root page.  The root page is
2533
   * the rightmost page on its level so there is no "high key" in it; the
2534
   * two items will go into positions P_HIKEY and P_FIRSTKEY.
2535
   *
2536
   * Note: we *must* insert the two items in item-number order, for the
2537
   * benefit of _bt_restore_page().
2538
   */
2539
0
  Assert(BTreeTupleGetNAtts(left_item, rel) == 0);
2540
0
  if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY,
2541
0
          false, false) == InvalidOffsetNumber)
2542
0
    elog(PANIC, "failed to add leftkey to new root page"
2543
0
       " while splitting block %u of index \"%s\"",
2544
0
       BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2545
2546
  /*
2547
   * insert the right page pointer into the new root page.
2548
   */
2549
0
  Assert(BTreeTupleGetNAtts(right_item, rel) > 0);
2550
0
  Assert(BTreeTupleGetNAtts(right_item, rel) <=
2551
0
       IndexRelationGetNumberOfKeyAttributes(rel));
2552
0
  if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY,
2553
0
          false, false) == InvalidOffsetNumber)
2554
0
    elog(PANIC, "failed to add rightkey to new root page"
2555
0
       " while splitting block %u of index \"%s\"",
2556
0
       BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
2557
2558
  /* Clear the incomplete-split flag in the left child */
2559
0
  Assert(P_INCOMPLETE_SPLIT(lopaque));
2560
0
  lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
2561
0
  MarkBufferDirty(lbuf);
2562
2563
0
  MarkBufferDirty(rootbuf);
2564
0
  MarkBufferDirty(metabuf);
2565
2566
  /* XLOG stuff */
2567
0
  if (RelationNeedsWAL(rel))
2568
0
  {
2569
0
    xl_btree_newroot xlrec;
2570
0
    XLogRecPtr  recptr;
2571
0
    xl_btree_metadata md;
2572
2573
0
    xlrec.rootblk = rootblknum;
2574
0
    xlrec.level = metad->btm_level;
2575
2576
0
    XLogBeginInsert();
2577
0
    XLogRegisterData(&xlrec, SizeOfBtreeNewroot);
2578
2579
0
    XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);
2580
0
    XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);
2581
0
    XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
2582
2583
0
    Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
2584
0
    md.version = metad->btm_version;
2585
0
    md.root = rootblknum;
2586
0
    md.level = metad->btm_level;
2587
0
    md.fastroot = rootblknum;
2588
0
    md.fastlevel = metad->btm_level;
2589
0
    md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
2590
0
    md.allequalimage = metad->btm_allequalimage;
2591
2592
0
    XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));
2593
2594
    /*
2595
     * Direct access to the page is not ideal, but it is faster; we should
2596
     * implement a new function in the page API.
2597
     */
2598
0
    XLogRegisterBufData(0,
2599
0
              (char *) rootpage + ((PageHeader) rootpage)->pd_upper,
2600
0
              ((PageHeader) rootpage)->pd_special -
2601
0
              ((PageHeader) rootpage)->pd_upper);
2602
2603
0
    recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);
2604
2605
0
    PageSetLSN(lpage, recptr);
2606
0
    PageSetLSN(rootpage, recptr);
2607
0
    PageSetLSN(metapg, recptr);
2608
0
  }
2609
2610
0
  END_CRIT_SECTION();
2611
2612
  /* done with metapage */
2613
0
  _bt_relbuf(rel, metabuf);
2614
2615
0
  pfree(left_item);
2616
0
  pfree(right_item);
2617
2618
0
  return rootbuf;
2619
0
}
2620
2621
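
The "minus infinity" downlink created for the left child in _bt_newlevel() above is a convention rather than a stored key: a pivot with zero key attributes is treated as smaller than every real key. A toy comparator under hypothetical types (ToyPivot, pivot_le_scankey), not the nbtree tuple format, makes the convention concrete.

#include <stdbool.h>

typedef struct ToyPivot
{
    int         natts;          /* 0 means "minus infinity" */
    int         key;            /* only meaningful when natts > 0 */
} ToyPivot;

/* Is this pivot's key <= the scan key?  Minus infinity always is. */
static bool
pivot_le_scankey(const ToyPivot *pivot, int scankey)
{
    if (pivot->natts == 0)
        return true;
    return pivot->key <= scankey;
}
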
/*
2622
 *  _bt_pgaddtup() -- add a data item to a particular page during split.
2623
 *
2624
 *    The difference between this routine and a bare PageAddItem call is
2625
 *    that this code can deal with the first data item on an internal btree
2626
 *    page in passing.  This data item (which is called "firstright" within
2627
 *    _bt_split()) has a key that must be treated as minus infinity after
2628
 *    the split.  Therefore, we truncate away all attributes when caller
2629
 *    specifies it's the first data item on page (downlink is not changed,
2630
 *    though).  This extra step is only needed for the right page of an
2631
 *    internal page split.  There is no need to do this for the first data
2632
 *    item on the existing/left page, since that will already have been
2633
 *    truncated during an earlier page split.
2634
 *
2635
 *    See _bt_split() for a high level explanation of why we truncate here.
2636
 *    Note that this routine has nothing to do with suffix truncation,
2637
 *    despite using some of the same infrastructure.
2638
 */
2639
static inline bool
2640
_bt_pgaddtup(Page page,
2641
       Size itemsize,
2642
       IndexTuple itup,
2643
       OffsetNumber itup_off,
2644
       bool newfirstdataitem)
2645
0
{
2646
0
  IndexTupleData trunctuple;
2647
2648
0
  if (newfirstdataitem)
2649
0
  {
2650
0
    trunctuple = *itup;
2651
0
    trunctuple.t_info = sizeof(IndexTupleData);
2652
0
    BTreeTupleSetNAtts(&trunctuple, 0, false);
2653
0
    itup = &trunctuple;
2654
0
    itemsize = sizeof(IndexTupleData);
2655
0
  }
2656
2657
0
  if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false,
2658
0
               false) == InvalidOffsetNumber))
2659
0
    return false;
2660
2661
0
  return true;
2662
0
}
2663
2664
/*
2665
 * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split.
2666
 *
2667
 * There are three operations performed here: simple index deletion, bottom-up
2668
 * index deletion, and deduplication.  If all three operations fail to free
2669
 * enough space for the incoming item then caller will go on to split the
2670
 * page.  We always consider simple deletion first.  If that doesn't work out
2671
 * we consider alternatives.  Callers that only want us to consider simple
2672
 * deletion (without any fallback) ask for that using the 'simpleonly'
2673
 * argument.
2674
 *
2675
 * We usually pick only one alternative "complex" operation when simple
2676
 * deletion alone won't prevent a page split.  The 'checkingunique',
2677
 * 'uniquedup', and 'indexUnchanged' arguments are used for that.
2678
 *
2679
 * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page
2680
 * level flag was found set.  The flag was useful back when there wasn't
2681
 * necessarily one single page for a duplicate tuple to go on (before heap TID
2682
 * became a part of the key space in version 4 indexes).  But we don't
2683
 * actually look at the flag anymore (it's not a gating condition for our
2684
 * caller).  That would cause us to miss tuples that are safe to delete,
2685
 * without getting any benefit in return.  We know that the alternative is to
2686
 * split the page; scanning the line pointer array in passing won't have
2687
 * noticeable overhead.  (We still maintain the BTP_HAS_GARBAGE flag despite
2688
 * all this because !heapkeyspace indexes must still do a "getting tired"
2689
 * linear search, and so are likely to get some benefit from using it as a
2690
 * gating condition.)
2691
 */
2692
static void
2693
_bt_delete_or_dedup_one_page(Relation rel, Relation heapRel,
2694
               BTInsertState insertstate,
2695
               bool simpleonly, bool checkingunique,
2696
               bool uniquedup, bool indexUnchanged)
2697
0
{
2698
0
  OffsetNumber deletable[MaxIndexTuplesPerPage];
2699
0
  int     ndeletable = 0;
2700
0
  OffsetNumber offnum,
2701
0
        minoff,
2702
0
        maxoff;
2703
0
  Buffer    buffer = insertstate->buf;
2704
0
  BTScanInsert itup_key = insertstate->itup_key;
2705
0
  Page    page = BufferGetPage(buffer);
2706
0
  BTPageOpaque opaque = BTPageGetOpaque(page);
2707
2708
0
  Assert(P_ISLEAF(opaque));
2709
0
  Assert(simpleonly || itup_key->heapkeyspace);
2710
0
  Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged));
2711
2712
  /*
2713
   * Scan over all items to see which ones need to be deleted according to
2714
   * LP_DEAD flags.  We'll usually manage to delete a few extra items that
2715
   * are not marked LP_DEAD in passing.  Often the extra items that actually
2716
   * end up getting deleted are items that would have had their LP_DEAD bit
2717
   * set before long anyway (if we opted not to include them as extras).
2718
   */
2719
0
  minoff = P_FIRSTDATAKEY(opaque);
2720
0
  maxoff = PageGetMaxOffsetNumber(page);
2721
0
  for (offnum = minoff;
2722
0
     offnum <= maxoff;
2723
0
     offnum = OffsetNumberNext(offnum))
2724
0
  {
2725
0
    ItemId    itemId = PageGetItemId(page, offnum);
2726
2727
0
    if (ItemIdIsDead(itemId))
2728
0
      deletable[ndeletable++] = offnum;
2729
0
  }
2730
2731
0
  if (ndeletable > 0)
2732
0
  {
2733
0
    _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable,
2734
0
               insertstate->itup, minoff, maxoff);
2735
0
    insertstate->bounds_valid = false;
2736
2737
    /* Return when a page split has already been avoided */
2738
0
    if (PageGetFreeSpace(page) >= insertstate->itemsz)
2739
0
      return;
2740
2741
    /* Might as well assume duplicates (if checkingunique) */
2742
0
    uniquedup = true;
2743
0
  }
2744
2745
  /*
2746
   * We're done with simple deletion.  Return early for callers that only
2747
   * want simple deletion to be considered.  This includes
2748
   * callers that explicitly ask for this and checkingunique callers that
2749
   * probably don't have any version churn duplicates on the page.
2750
   *
2751
   * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we
2752
   * return at this point (or when we go on to try either or both of our
2753
   * other strategies and they also fail).  We do not bother expending a
2754
   * separate write to clear it, however.  Caller will definitely clear it
2755
   * when it goes on to split the page (note also that the deduplication
2756
   * process will clear the flag in passing, just to keep things tidy).
2757
   */
2758
0
  if (simpleonly || (checkingunique && !uniquedup))
2759
0
  {
2760
0
    Assert(!indexUnchanged);
2761
0
    return;
2762
0
  }
2763
2764
  /* Assume bounds about to be invalidated (this is almost certain now) */
2765
0
  insertstate->bounds_valid = false;
2766
2767
  /*
2768
   * Perform bottom-up index deletion pass when executor hint indicated that
2769
   * incoming item is logically unchanged, or for a unique index that is
2770
   * known to have physical duplicates for some other reason.  (There is a
2771
   * large overlap between these two cases for a unique index.  It's worth
2772
   * having both triggering conditions in order to apply the optimization in
2773
   * the event of successive related INSERT and DELETE statements.)
2774
   *
2775
   * We'll go on to do a deduplication pass when a bottom-up pass fails to
2776
   * delete an acceptable amount of free space (a significant fraction of
2777
   * the page, or space for the new item, whichever is greater).
2778
   *
2779
   * Note: Bottom-up index deletion uses the same equality/equivalence
2780
   * routines as deduplication internally.  However, it does not merge
2781
   * together index tuples, so the same correctness considerations do not
2782
   * apply.  We deliberately omit an index-is-allequalimage test here.
2783
   */
2784
0
  if ((indexUnchanged || uniquedup) &&
2785
0
    _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz))
2786
0
    return;
2787
2788
  /* Perform deduplication pass (when enabled and index-is-allequalimage) */
2789
0
  if (BTGetDeduplicateItems(rel) && itup_key->allequalimage)
2790
0
    _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz,
2791
0
             (indexUnchanged || uniquedup));
2792
0
}
2793
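Taken together with its header comment, the control flow of _bt_delete_or_dedup_one_page() reduces to a fixed ordering of fallbacks. The sketch below summarizes that ordering with hypothetical stub helpers (try_simple_deletion and friends are not nbtree routines), and it omits details such as forcing uniquedup after a partial simple-deletion pass and the deduplication enable/allequalimage checks.

#include <stdbool.h>

/* Hypothetical stubs; each would return true once enough space is freed */
static bool try_simple_deletion(void)   { return false; }
static bool try_bottomup_deletion(void) { return false; }
static void try_dedup(void)             { }

static void
avoid_leaf_split(bool simpleonly, bool checkingunique,
                 bool uniquedup, bool indexUnchanged)
{
    /* Simple deletion of LP_DEAD-marked items is always considered first */
    if (try_simple_deletion())
        return;

    /*
     * Callers that asked for simple deletion only, and unique-checking
     * callers with no likely version-churn duplicates, stop here
     */
    if (simpleonly || (checkingunique && !uniquedup))
        return;

    /*
     * Bottom-up deletion when the executor hinted a logically unchanged
     * index entry, or when a unique index has known physical duplicates
     */
    if ((indexUnchanged || uniquedup) && try_bottomup_deletion())
        return;

    /* Last resort before the caller splits the page: merge duplicates */
    try_dedup();
}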
2794
/*
2795
 * _bt_simpledel_pass - Simple index tuple deletion pass.
2796
 *
2797
 * We delete all LP_DEAD-set index tuples on a leaf page.  The offset numbers
2798
 * of all such tuples are determined by caller (caller passes these to us as
2799
 * its 'deletable' argument).
2800
 *
2801
 * We might also delete extra index tuples that turn out to be safe to delete
2802
 * in passing (though they must be cheap to check in passing to begin with).
2803
 * There is no certainty that any extra tuples will be deleted, though.  The
2804
 * high level goal of the approach we take is to get the most out of each call
2805
 * here (without noticeably increasing the per-call overhead compared to what
2806
 * we need to do just to be able to delete the page's LP_DEAD-marked index
2807
 * tuples).
2808
 *
2809
 * The number of extra index tuples that turn out to be deletable might
2810
 * greatly exceed the number of LP_DEAD-marked index tuples due to various
2811
 * locality related effects.  For example, it's possible that the total number
2812
 * of table blocks (pointed to by all TIDs on the leaf page) is naturally
2813
 * quite low, in which case we might end up checking if it's possible to
2814
 * delete _most_ index tuples on the page (without the tableam needing to
2815
 * access additional table blocks).  The tableam will sometimes stumble upon
2816
 * _many_ extra deletable index tuples in indexes where this pattern is
2817
 * common.
2818
 *
2819
 * See nbtree/README for further details on simple index tuple deletion.
2820
 */
2821
static void
2822
_bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel,
2823
           OffsetNumber *deletable, int ndeletable, IndexTuple newitem,
2824
           OffsetNumber minoff, OffsetNumber maxoff)
2825
0
{
2826
0
  Page    page = BufferGetPage(buffer);
2827
0
  BlockNumber *deadblocks;
2828
0
  int     ndeadblocks;
2829
0
  TM_IndexDeleteOp delstate;
2830
0
  OffsetNumber offnum;
2831
2832
  /* Get array of table blocks pointed to by LP_DEAD-set tuples */
2833
0
  deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem,
2834
0
                &ndeadblocks);
2835
2836
  /* Initialize tableam state that describes index deletion operation */
2837
0
  delstate.irel = rel;
2838
0
  delstate.iblknum = BufferGetBlockNumber(buffer);
2839
0
  delstate.bottomup = false;
2840
0
  delstate.bottomupfreespace = 0;
2841
0
  delstate.ndeltids = 0;
2842
0
  delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete));
2843
0
  delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus));
2844
2845
0
  for (offnum = minoff;
2846
0
     offnum <= maxoff;
2847
0
     offnum = OffsetNumberNext(offnum))
2848
0
  {
2849
0
    ItemId    itemid = PageGetItemId(page, offnum);
2850
0
    IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
2851
0
    TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids];
2852
0
    TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids];
2853
0
    BlockNumber tidblock;
2854
0
    void     *match;
2855
2856
0
    if (!BTreeTupleIsPosting(itup))
2857
0
    {
2858
0
      tidblock = ItemPointerGetBlockNumber(&itup->t_tid);
2859
0
      match = bsearch(&tidblock, deadblocks, ndeadblocks,
2860
0
              sizeof(BlockNumber), _bt_blk_cmp);
2861
2862
0
      if (!match)
2863
0
      {
2864
0
        Assert(!ItemIdIsDead(itemid));
2865
0
        continue;
2866
0
      }
2867
2868
      /*
2869
       * TID's table block is among those pointed to by the TIDs from
2870
       * LP_DEAD-bit set tuples on page -- add TID to deltids
2871
       */
2872
0
      odeltid->tid = itup->t_tid;
2873
0
      odeltid->id = delstate.ndeltids;
2874
0
      ostatus->idxoffnum = offnum;
2875
0
      ostatus->knowndeletable = ItemIdIsDead(itemid);
2876
0
      ostatus->promising = false; /* unused */
2877
0
      ostatus->freespace = 0; /* unused */
2878
2879
0
      delstate.ndeltids++;
2880
0
    }
2881
0
    else
2882
0
    {
2883
0
      int     nitem = BTreeTupleGetNPosting(itup);
2884
2885
0
      for (int p = 0; p < nitem; p++)
2886
0
      {
2887
0
        ItemPointer tid = BTreeTupleGetPostingN(itup, p);
2888
2889
0
        tidblock = ItemPointerGetBlockNumber(tid);
2890
0
        match = bsearch(&tidblock, deadblocks, ndeadblocks,
2891
0
                sizeof(BlockNumber), _bt_blk_cmp);
2892
2893
0
        if (!match)
2894
0
        {
2895
0
          Assert(!ItemIdIsDead(itemid));
2896
0
          continue;
2897
0
        }
2898
2899
        /*
2900
         * TID's table block is among those pointed to by the TIDs
2901
         * from LP_DEAD-bit set tuples on page -- add TID to deltids
2902
         */
2903
0
        odeltid->tid = *tid;
2904
0
        odeltid->id = delstate.ndeltids;
2905
0
        ostatus->idxoffnum = offnum;
2906
0
        ostatus->knowndeletable = ItemIdIsDead(itemid);
2907
0
        ostatus->promising = false; /* unused */
2908
0
        ostatus->freespace = 0; /* unused */
2909
2910
0
        odeltid++;
2911
0
        ostatus++;
2912
0
        delstate.ndeltids++;
2913
0
      }
2914
0
    }
2915
0
  }
2916
2917
0
  pfree(deadblocks);
2918
2919
0
  Assert(delstate.ndeltids >= ndeletable);
2920
2921
  /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */
2922
0
  _bt_delitems_delete_check(rel, buffer, heapRel, &delstate);
2923
2924
0
  pfree(delstate.deltids);
2925
0
  pfree(delstate.status);
2926
0
}
2927
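The per-item loop above is, at heart, a membership test against the sorted dead-block array: any TID whose table block will be visited anyway becomes an extra deletion candidate, and only the LP_DEAD-marked ones are flagged as known-deletable. A standalone sketch of that rule, using hypothetical types and plain C library calls:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct SketchCandidate
{
    uint32_t    block;              /* table block of the candidate TID */
    bool        knowndeletable;     /* true only for LP_DEAD index tuples */
} SketchCandidate;

static int
u32_cmp(const void *a, const void *b)
{
    uint32_t    x = *(const uint32_t *) a;
    uint32_t    y = *(const uint32_t *) b;

    return (x > y) - (x < y);
}

/*
 * deadblocks[] must be sorted.  Emits a candidate for every item whose table
 * block appears in deadblocks[]; returns the number of candidates written.
 */
static int
gather_candidates(const uint32_t *itemblocks, const bool *itemdead, int nitems,
                  const uint32_t *deadblocks, int ndeadblocks,
                  SketchCandidate *out)
{
    int         ncand = 0;

    for (int i = 0; i < nitems; i++)
    {
        if (!bsearch(&itemblocks[i], deadblocks, (size_t) ndeadblocks,
                     sizeof(uint32_t), u32_cmp))
            continue;               /* block never visited; skip this item */

        out[ncand].block = itemblocks[i];
        out[ncand].knowndeletable = itemdead[i];
        ncand++;
    }
    return ncand;
}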
2928
/*
2929
 * _bt_deadblocks() -- Get LP_DEAD related table blocks.
2930
 *
2931
 * Builds sorted and unique-ified array of table block numbers from index
2932
 * tuple TIDs whose line pointers are marked LP_DEAD.  Also adds the table
2933
 * block from incoming newitem just in case it isn't among the LP_DEAD-related
2934
 * table blocks.
2935
 *
2936
 * Always counting the newitem's table block as an LP_DEAD related block makes
2937
 * sense because the cost is consistently low; it is practically certain that
2938
 * the table block will not incur a buffer miss in tableam.  On the other hand
2939
 * the benefit is often quite high.  There is a decent chance that there will
2940
 * be some deletable items from this block, since in general most garbage
2941
 * tuples became garbage in the recent past (in many cases this won't be the
2942
 * first logical row that core code added to/modified in the table block
2943
 * recently).
2944
 *
2945
 * Returns final array, and sets *nblocks to its final size for caller.
2946
 */
2947
static BlockNumber *
2948
_bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable,
2949
         IndexTuple newitem, int *nblocks)
2950
0
{
2951
0
  int     spacentids,
2952
0
        ntids;
2953
0
  BlockNumber *tidblocks;
2954
2955
  /*
2956
   * Accumulate each TID's block in an array whose initial size has space for
2957
   * one table block per LP_DEAD-set tuple (plus space for the newitem table
2958
   * block).  Array will only need to grow when there are LP_DEAD-marked
2959
   * posting list tuples (which is not that common).
2960
   */
2961
0
  spacentids = ndeletable + 1;
2962
0
  ntids = 0;
2963
0
  tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids);
2964
2965
  /*
2966
   * First add the table block for the incoming newitem.  This is the one
2967
   * case where simple deletion can visit a table block that doesn't have
2968
   * any known deletable items.
2969
   */
2970
0
  Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem));
2971
0
  tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid);
2972
2973
0
  for (int i = 0; i < ndeletable; i++)
2974
0
  {
2975
0
    ItemId    itemid = PageGetItemId(page, deletable[i]);
2976
0
    IndexTuple  itup = (IndexTuple) PageGetItem(page, itemid);
2977
2978
0
    Assert(ItemIdIsDead(itemid));
2979
2980
0
    if (!BTreeTupleIsPosting(itup))
2981
0
    {
2982
0
      if (ntids + 1 > spacentids)
2983
0
      {
2984
0
        spacentids *= 2;
2985
0
        tidblocks = (BlockNumber *)
2986
0
          repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
2987
0
      }
2988
2989
0
      tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid);
2990
0
    }
2991
0
    else
2992
0
    {
2993
0
      int     nposting = BTreeTupleGetNPosting(itup);
2994
2995
0
      if (ntids + nposting > spacentids)
2996
0
      {
2997
0
        spacentids = Max(spacentids * 2, ntids + nposting);
2998
0
        tidblocks = (BlockNumber *)
2999
0
          repalloc(tidblocks, sizeof(BlockNumber) * spacentids);
3000
0
      }
3001
3002
0
      for (int j = 0; j < nposting; j++)
3003
0
      {
3004
0
        ItemPointer tid = BTreeTupleGetPostingN(itup, j);
3005
3006
0
        tidblocks[ntids++] = ItemPointerGetBlockNumber(tid);
3007
0
      }
3008
0
    }
3009
0
  }
3010
3011
0
  qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3012
0
  *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp);
3013
3014
0
  return tidblocks;
3015
0
}
3016
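qunique() (from lib/qunique.h) is what turns the sorted block array into a set. A stand-in sketch of the same in-place adjacent-duplicate squeeze for a plain uint32_t array follows; it is not the actual qunique() implementation.

#include <stddef.h>
#include <stdint.h>

/* Requires a sorted array; removes adjacent duplicates, returns new length */
static size_t
unique_u32(uint32_t *arr, size_t n)
{
    size_t      w = 0;              /* index of the last element kept */

    if (n == 0)
        return 0;
    for (size_t r = 1; r < n; r++)
    {
        if (arr[r] != arr[w])
            arr[++w] = arr[r];
    }
    return w + 1;                   /* e.g. {1,1,3,3,7} becomes {1,3,7} */
}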
3017
/*
3018
 * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass
3019
 */
3020
static inline int
3021
_bt_blk_cmp(const void *arg1, const void *arg2)
3022
0
{
3023
0
  BlockNumber b1 = *((BlockNumber *) arg1);
3024
0
  BlockNumber b2 = *((BlockNumber *) arg2);
3025
3026
0
  return pg_cmp_u32(b1, b2);
3027
0
}
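The comparator relies on pg_cmp_u32() instead of returning the difference of the two block numbers. The short standalone program below shows why plain subtraction is unsafe for unsigned values; the cmp_u32() helper is a common equivalent, not PostgreSQL's implementation.

#include <stdint.h>
#include <stdio.h>

static int
cmp_u32(uint32_t a, uint32_t b)
{
    return (a > b) - (a < b);       /* always exactly -1, 0, or +1 */
}

int
main(void)
{
    uint32_t    big = 4000000000u;  /* larger than INT32_MAX */
    uint32_t    small = 1;

    /* Unsigned subtraction wraps; converting to int typically flips the sign */
    printf("naive subtraction: %d\n", (int) (big - small));
    printf("safe comparison:   %d\n", cmp_u32(big, small));
    return 0;
}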