/src/postgres/src/backend/access/nbtree/nbtinsert.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * nbtinsert.c |
4 | | * Item insertion in Lehman and Yao btrees for Postgres. |
5 | | * |
6 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/backend/access/nbtree/nbtinsert.c |
12 | | * |
13 | | *------------------------------------------------------------------------- |
14 | | */ |
15 | | |
16 | | #include "postgres.h" |
17 | | |
18 | | #include "access/nbtree.h" |
19 | | #include "access/nbtxlog.h" |
20 | | #include "access/tableam.h" |
21 | | #include "access/transam.h" |
22 | | #include "access/xloginsert.h" |
23 | | #include "common/int.h" |
24 | | #include "common/pg_prng.h" |
25 | | #include "lib/qunique.h" |
26 | | #include "miscadmin.h" |
27 | | #include "storage/lmgr.h" |
28 | | #include "storage/predicate.h" |
29 | | |
30 | | /* Minimum tree height for application of fastpath optimization */ |
31 | 0 | #define BTREE_FASTPATH_MIN_LEVEL 2 |
32 | | |
33 | | |
34 | | static BTStack _bt_search_insert(Relation rel, Relation heaprel, |
35 | | BTInsertState insertstate); |
36 | | static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, |
37 | | Relation heapRel, |
38 | | IndexUniqueCheck checkUnique, bool *is_unique, |
39 | | uint32 *speculativeToken); |
40 | | static OffsetNumber _bt_findinsertloc(Relation rel, |
41 | | BTInsertState insertstate, |
42 | | bool checkingunique, |
43 | | bool indexUnchanged, |
44 | | BTStack stack, |
45 | | Relation heapRel); |
46 | | static void _bt_stepright(Relation rel, Relation heaprel, |
47 | | BTInsertState insertstate, BTStack stack); |
48 | | static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key, |
49 | | Buffer buf, |
50 | | Buffer cbuf, |
51 | | BTStack stack, |
52 | | IndexTuple itup, |
53 | | Size itemsz, |
54 | | OffsetNumber newitemoff, |
55 | | int postingoff, |
56 | | bool split_only_page); |
57 | | static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, |
58 | | Buffer buf, Buffer cbuf, OffsetNumber newitemoff, |
59 | | Size newitemsz, IndexTuple newitem, IndexTuple orignewitem, |
60 | | IndexTuple nposting, uint16 postingoff); |
61 | | static void _bt_insert_parent(Relation rel, Relation heaprel, Buffer buf, |
62 | | Buffer rbuf, BTStack stack, bool isroot, bool isonly); |
63 | | static Buffer _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf); |
64 | | static inline bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup, |
65 | | OffsetNumber itup_off, bool newfirstdataitem); |
66 | | static void _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, |
67 | | BTInsertState insertstate, |
68 | | bool simpleonly, bool checkingunique, |
69 | | bool uniquedup, bool indexUnchanged); |
70 | | static void _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel, |
71 | | OffsetNumber *deletable, int ndeletable, |
72 | | IndexTuple newitem, OffsetNumber minoff, |
73 | | OffsetNumber maxoff); |
74 | | static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable, |
75 | | int ndeletable, IndexTuple newitem, |
76 | | int *nblocks); |
77 | | static inline int _bt_blk_cmp(const void *arg1, const void *arg2); |
78 | | |
79 | | /* |
80 | | * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. |
81 | | * |
82 | | * This routine is called by the public interface routine, btinsert. |
83 | | * By here, itup is filled in, including the TID. |
84 | | * |
85 | | * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this |
86 | | * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or |
87 | | * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate. |
88 | | * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and |
89 | | * don't actually insert. |
90 | | * |
91 | | * indexUnchanged executor hint indicates if itup is from an |
92 | | * UPDATE that didn't logically change the indexed value, but |
93 | | * must nevertheless have a new entry to point to a successor |
94 | | * version. |
95 | | * |
96 | | * The result value is only significant for UNIQUE_CHECK_PARTIAL: |
97 | | * it must be true if the entry is known unique, else false. |
98 | | * (In the current implementation we'll also return true after a |
99 | | * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but |
100 | | * that's just a coding artifact.) |
101 | | */ |
102 | | bool |
103 | | _bt_doinsert(Relation rel, IndexTuple itup, |
104 | | IndexUniqueCheck checkUnique, bool indexUnchanged, |
105 | | Relation heapRel) |
106 | 0 | { |
107 | 0 | bool is_unique = false; |
108 | 0 | BTInsertStateData insertstate; |
109 | 0 | BTScanInsert itup_key; |
110 | 0 | BTStack stack; |
111 | 0 | bool checkingunique = (checkUnique != UNIQUE_CHECK_NO); |
112 | | |
113 | | /* we need an insertion scan key to do our search, so build one */ |
114 | 0 | itup_key = _bt_mkscankey(rel, itup); |
115 | |
116 | 0 | if (checkingunique) |
117 | 0 | { |
118 | 0 | if (!itup_key->anynullkeys) |
119 | 0 | { |
120 | | /* No (heapkeyspace) scantid until uniqueness established */ |
121 | 0 | itup_key->scantid = NULL; |
122 | 0 | } |
123 | 0 | else |
124 | 0 | { |
125 | | /* |
126 | | * Scan key for new tuple contains NULL key values. Bypass |
127 | | * checkingunique steps. They are unnecessary because core code |
128 | | * considers NULL unequal to every value, including NULL. |
129 | | * |
130 | | * This optimization avoids O(N^2) behavior within the |
131 | | * _bt_findinsertloc() heapkeyspace path when a unique index has a |
132 | | * large number of "duplicates" with NULL key values. |
133 | | */ |
134 | 0 | checkingunique = false; |
135 | | /* Tuple is unique in the sense that core code cares about */ |
136 | 0 | Assert(checkUnique != UNIQUE_CHECK_EXISTING); |
137 | 0 | is_unique = true; |
138 | 0 | } |
139 | 0 | } |
140 | | |
141 | | /* |
142 | | * Fill in the BTInsertState working area, to track the current page and |
143 | | * position within the page to insert on. |
144 | | * |
145 | | * Note that itemsz is passed down to lower level code that deals with |
146 | | * inserting the item. It must be MAXALIGN()'d. This ensures that space |
147 | | * accounting code consistently considers the alignment overhead that we |
148 | | * expect PageAddItem() will add later. (Actually, index_form_tuple() is |
149 | | * already conservative about alignment, but we don't rely on that from |
150 | | * this distance. Besides, preserving the "true" tuple size in index |
151 | | * tuple headers for the benefit of nbtsplitloc.c might happen someday. |
152 | | * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.) |
153 | | */ |
154 | 0 | insertstate.itup = itup; |
155 | 0 | insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); |
156 | 0 | insertstate.itup_key = itup_key; |
157 | 0 | insertstate.bounds_valid = false; |
158 | 0 | insertstate.buf = InvalidBuffer; |
159 | 0 | insertstate.postingoff = 0; |
160 | |
161 | 0 | search: |
162 | | |
163 | | /* |
164 | | * Find and lock the leaf page that the tuple should be added to by |
165 | | * searching from the root page. insertstate.buf will hold a buffer that |
166 | | * is locked in exclusive mode afterwards. |
167 | | */ |
168 | 0 | stack = _bt_search_insert(rel, heapRel, &insertstate); |
169 | | |
170 | | /* |
171 | | * checkingunique inserts are not allowed to go ahead when two tuples with |
172 | | * equal key attribute values would be visible to new MVCC snapshots once |
173 | | * the xact commits. Check for conflicts in the locked page/buffer (if |
174 | | * needed) here. |
175 | | * |
176 | | * It might be necessary to check a page to the right in _bt_check_unique, |
177 | | * though that should be very rare. In practice the first page the value |
178 | | * could be on (with scantid omitted) is almost always also the only page |
179 | | * that a matching tuple might be found on. This is due to the behavior |
180 | | * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can |
181 | | * only be allowed to cross a page boundary when there is no candidate |
182 | | * leaf page split point that avoids it. Also, _bt_check_unique can use |
183 | | * the leaf page high key to determine that there will be no duplicates on |
184 | | * the right sibling without actually visiting it (it uses the high key in |
185 | | * cases where the new item happens to belong at the far right of the leaf |
186 | | * page). |
187 | | * |
188 | | * NOTE: obviously, _bt_check_unique can only detect keys that are already |
189 | | * in the index; so it cannot defend against concurrent insertions of the |
190 | | * same key. We protect against that by means of holding a write lock on |
191 | | * the first page the value could be on, with omitted/-inf value for the |
192 | | * implicit heap TID tiebreaker attribute. Any other would-be inserter of |
193 | | * the same key must acquire a write lock on the same page, so only one |
194 | | * would-be inserter can be making the check at one time. Furthermore, |
195 | | * once we are past the check we hold write locks continuously until we |
196 | | * have performed our insertion, so no later inserter can fail to see our |
197 | | * insertion. (This requires some care in _bt_findinsertloc.) |
198 | | * |
199 | | * If we must wait for another xact, we release the lock while waiting, |
200 | | * and then must perform a new search. |
201 | | * |
202 | | * For a partial uniqueness check, we don't wait for the other xact. Just |
203 | | * let the tuple in and return false for possibly non-unique, or true for |
204 | | * definitely unique. |
205 | | */ |
206 | 0 | if (checkingunique) |
207 | 0 | { |
208 | 0 | TransactionId xwait; |
209 | 0 | uint32 speculativeToken; |
210 | |
211 | 0 | xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique, |
212 | 0 | &is_unique, &speculativeToken); |
213 | |
214 | 0 | if (unlikely(TransactionIdIsValid(xwait))) |
215 | 0 | { |
216 | | /* Have to wait for the other guy ... */ |
217 | 0 | _bt_relbuf(rel, insertstate.buf); |
218 | 0 | insertstate.buf = InvalidBuffer; |
219 | | |
220 | | /* |
221 | | * If it's a speculative insertion, wait for it to finish (ie. to |
222 | | * go ahead with the insertion, or kill the tuple). Otherwise |
223 | | * wait for the transaction to finish as usual. |
224 | | */ |
225 | 0 | if (speculativeToken) |
226 | 0 | SpeculativeInsertionWait(xwait, speculativeToken); |
227 | 0 | else |
228 | 0 | XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex); |
229 | | |
230 | | /* start over... */ |
231 | 0 | if (stack) |
232 | 0 | _bt_freestack(stack); |
233 | 0 | goto search; |
234 | 0 | } |
235 | | |
236 | | /* Uniqueness is established -- restore heap tid as scantid */ |
237 | 0 | if (itup_key->heapkeyspace) |
238 | 0 | itup_key->scantid = &itup->t_tid; |
239 | 0 | } |
240 | | |
241 | 0 | if (checkUnique != UNIQUE_CHECK_EXISTING) |
242 | 0 | { |
243 | 0 | OffsetNumber newitemoff; |
244 | | |
245 | | /* |
246 | | * The only conflict predicate locking cares about for indexes is when |
247 | | * an index tuple insert conflicts with an existing lock. We don't |
248 | | * know the actual page we're going to insert on for sure just yet in |
249 | | * checkingunique and !heapkeyspace cases, but it's okay to use the |
250 | | * first page the value could be on (with scantid omitted) instead. |
251 | | */ |
252 | 0 | CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf)); |
253 | | |
254 | | /* |
255 | | * Do the insertion. Note that insertstate contains cached binary |
256 | | * search bounds established within _bt_check_unique when insertion is |
257 | | * checkingunique. |
258 | | */ |
259 | 0 | newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, |
260 | 0 | indexUnchanged, stack, heapRel); |
261 | 0 | _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer, |
262 | 0 | stack, itup, insertstate.itemsz, newitemoff, |
263 | 0 | insertstate.postingoff, false); |
264 | 0 | } |
265 | 0 | else |
266 | 0 | { |
267 | | /* just release the buffer */ |
268 | 0 | _bt_relbuf(rel, insertstate.buf); |
269 | 0 | } |
270 | | |
271 | | /* be tidy */ |
272 | 0 | if (stack) |
273 | 0 | _bt_freestack(stack); |
274 | 0 | pfree(itup_key); |
275 | |
276 | 0 | return is_unique; |
277 | 0 | } |
278 | | |
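For context, the "public interface routine, btinsert" named in the header comment of _bt_doinsert() lives in nbtree.c, not in this file. A condensed sketch of that caller, paraphrased from the upstream source rather than copied from it (details may differ between versions): it forms the index tuple, stamps the heap TID, and hands off to _bt_doinsert().

bool
btinsert(Relation rel, Datum *values, bool *isnull,
         ItemPointer ht_ctid, Relation heapRel,
         IndexUniqueCheck checkUnique, bool indexUnchanged,
         IndexInfo *indexInfo)
{
    bool        result;
    IndexTuple  itup;

    /* build an index tuple from the column values and point it at the heap row */
    itup = index_form_tuple(RelationGetDescr(rel), values, isnull);
    itup->t_tid = *ht_ctid;

    result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel);

    pfree(itup);
    return result;
}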
279 | | /* |
280 | | * _bt_search_insert() -- _bt_search() wrapper for inserts |
281 | | * |
282 | | * Search the tree for a particular scankey, or more precisely for the first |
283 | | * leaf page it could be on. Try to make use of the fastpath optimization's |
284 | | * rightmost leaf page cache before actually searching the tree from the root |
285 | | * page, though. |
286 | | * |
287 | | * Return value is a stack of parent-page pointers (though see notes about |
288 | | * fastpath optimization and page splits below). insertstate->buf is set to |
289 | | * the address of the leaf-page buffer, which is write-locked and pinned in |
290 | | * all cases (if necessary by creating a new empty root page for caller). |
291 | | * |
292 | | * The fastpath optimization avoids most of the work of searching the tree |
293 | | * repeatedly when a single backend inserts successive new tuples on the |
294 | | * rightmost leaf page of an index. A backend cache of the rightmost leaf |
295 | | * page is maintained within _bt_insertonpg(), and used here. The cache is |
296 | | * invalidated here when an insert of a non-pivot tuple must take place on a |
297 | | * non-rightmost leaf page. |
298 | | * |
299 | | * The optimization helps with indexes on an auto-incremented field. It also |
300 | | * helps with indexes on datetime columns, as well as indexes with lots of |
301 | | * NULL values. (NULLs usually get inserted in the rightmost page for single |
302 | | * column indexes, since they usually get treated as coming after everything |
303 | | * else in the key space. Individual NULL tuples will generally be placed on |
304 | | * the rightmost leaf page due to the influence of the heap TID column.) |
305 | | * |
306 | | * Note that we avoid applying the optimization when there is insufficient |
307 | | * space on the rightmost page to fit caller's new item. This is necessary |
308 | | * because we'll need to return a real descent stack when a page split is |
309 | | * expected (actually, caller can cope with a leaf page split that uses a NULL |
310 | | * stack, but that's very slow and so must be avoided). Note also that the |
311 | | * fastpath optimization acquires the lock on the page conditionally as a way |
312 | | * of reducing extra contention when there are concurrent insertions into the |
313 | | * rightmost page (we give up if we'd have to wait for the lock). We assume |
314 | | * that it isn't useful to apply the optimization when there is contention, |
315 | | * since each per-backend cache won't stay valid for long. |
316 | | */ |
317 | | static BTStack |
318 | | _bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate) |
319 | 0 | { |
320 | 0 | Assert(insertstate->buf == InvalidBuffer); |
321 | 0 | Assert(!insertstate->bounds_valid); |
322 | 0 | Assert(insertstate->postingoff == 0); |
323 | |
324 | 0 | if (RelationGetTargetBlock(rel) != InvalidBlockNumber) |
325 | 0 | { |
326 | | /* Simulate a _bt_getbuf() call with conditional locking */ |
327 | 0 | insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel)); |
328 | 0 | if (_bt_conditionallockbuf(rel, insertstate->buf)) |
329 | 0 | { |
330 | 0 | Page page; |
331 | 0 | BTPageOpaque opaque; |
332 | |
333 | 0 | _bt_checkpage(rel, insertstate->buf); |
334 | 0 | page = BufferGetPage(insertstate->buf); |
335 | 0 | opaque = BTPageGetOpaque(page); |
336 | | |
337 | | /* |
338 | | * Check if the page is still the rightmost leaf page and has |
339 | | * enough free space to accommodate the new tuple. Also check |
340 | | * that the insertion scan key is strictly greater than the first |
341 | | * non-pivot tuple on the page. (Note that we expect itup_key's |
342 | | * scantid to be unset when our caller is a checkingunique |
343 | | * inserter.) |
344 | | */ |
345 | 0 | if (P_RIGHTMOST(opaque) && |
346 | 0 | P_ISLEAF(opaque) && |
347 | 0 | !P_IGNORE(opaque) && |
348 | 0 | PageGetFreeSpace(page) > insertstate->itemsz && |
349 | 0 | PageGetMaxOffsetNumber(page) >= P_HIKEY && |
350 | 0 | _bt_compare(rel, insertstate->itup_key, page, P_HIKEY) > 0) |
351 | 0 | { |
352 | | /* |
353 | | * Caller can use the fastpath optimization because cached |
354 | | * block is still rightmost leaf page, which can fit caller's |
355 | | * new tuple without splitting. Keep block in local cache for |
356 | | * next insert, and have caller use NULL stack. |
357 | | * |
358 | | * Note that _bt_insert_parent() has an assertion that catches |
359 | | * leaf page splits that somehow follow from a fastpath insert |
360 | | * (it should only be passed a NULL stack when it must deal |
361 | | * with a concurrent root page split, and never because a NULL |
362 | | * stack was returned here). |
363 | | */ |
364 | 0 | return NULL; |
365 | 0 | } |
366 | | |
367 | | /* Page unsuitable for caller, drop lock and pin */ |
368 | 0 | _bt_relbuf(rel, insertstate->buf); |
369 | 0 | } |
370 | 0 | else |
371 | 0 | { |
372 | | /* Lock unavailable, drop pin */ |
373 | 0 | ReleaseBuffer(insertstate->buf); |
374 | 0 | } |
375 | | |
376 | | /* Forget block, since cache doesn't appear to be useful */ |
377 | 0 | RelationSetTargetBlock(rel, InvalidBlockNumber); |
378 | 0 | } |
379 | | |
380 | | /* Cannot use optimization -- descend tree, return proper descent stack */ |
381 | 0 | return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf, |
382 | 0 | BT_WRITE); |
383 | 0 | } |
384 | | |
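The rightmost-leaf cache consulted by _bt_search_insert() is primed near the end of _bt_insertonpg(), which falls outside this excerpt. A rough sketch of that hand-off, paraphrased and not verbatim: the block number is remembered only for an insert on a rightmost leaf page, and only published once the tree is tall enough for BTREE_FASTPATH_MIN_LEVEL to make the shortcut worthwhile.

/* Sketch of how _bt_insertonpg() primes the fastpath cache (paraphrased) */
blockcache = InvalidBlockNumber;
if (isrightmost && isleaf && !isroot)
    blockcache = BufferGetBlockNumber(buf);

/* ... WAL logging and buffer lock release happen here ... */

if (BlockNumberIsValid(blockcache) &&
    _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL)
    RelationSetTargetBlock(rel, blockcache);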
385 | | /* |
386 | | * _bt_check_unique() -- Check for violation of unique index constraint |
387 | | * |
388 | | * Returns InvalidTransactionId if there is no conflict, else an xact ID |
389 | | * we must wait for to see if it commits a conflicting tuple. If an actual |
390 | | * conflict is detected, no return --- just ereport(). If an xact ID is |
391 | | * returned, and the conflicting tuple still has a speculative insertion in |
392 | | * progress, *speculativeToken is set to non-zero, and the caller can wait for |
393 | | * the verdict on the insertion using SpeculativeInsertionWait(). |
394 | | * |
395 | | * However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return |
396 | | * InvalidTransactionId because we don't want to wait. In this case we |
397 | | * set *is_unique to false if there is a potential conflict, and the |
398 | | * core code must redo the uniqueness check later. |
399 | | * |
400 | | * As a side-effect, sets state in insertstate that can later be used by |
401 | | * _bt_findinsertloc() to reuse most of the binary search work we do |
402 | | * here. |
403 | | * |
404 | | * This code treats NULLs as equal, unlike the default semantics for unique |
405 | | * indexes. So do not call here when there are NULL values in scan key and |
406 | | * the index uses the default NULLS DISTINCT mode. |
407 | | */ |
408 | | static TransactionId |
409 | | _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, |
410 | | IndexUniqueCheck checkUnique, bool *is_unique, |
411 | | uint32 *speculativeToken) |
412 | 0 | { |
413 | 0 | IndexTuple itup = insertstate->itup; |
414 | 0 | IndexTuple curitup = NULL; |
415 | 0 | ItemId curitemid = NULL; |
416 | 0 | BTScanInsert itup_key = insertstate->itup_key; |
417 | 0 | SnapshotData SnapshotDirty; |
418 | 0 | OffsetNumber offset; |
419 | 0 | OffsetNumber maxoff; |
420 | 0 | Page page; |
421 | 0 | BTPageOpaque opaque; |
422 | 0 | Buffer nbuf = InvalidBuffer; |
423 | 0 | bool found = false; |
424 | 0 | bool inposting = false; |
425 | 0 | bool prevalldead = true; |
426 | 0 | int curposti = 0; |
427 | | |
428 | | /* Assume unique until we find a duplicate */ |
429 | 0 | *is_unique = true; |
430 | |
431 | 0 | InitDirtySnapshot(SnapshotDirty); |
432 | |
433 | 0 | page = BufferGetPage(insertstate->buf); |
434 | 0 | opaque = BTPageGetOpaque(page); |
435 | 0 | maxoff = PageGetMaxOffsetNumber(page); |
436 | | |
437 | | /* |
438 | | * Find the first tuple with the same key. |
439 | | * |
440 | | * This also saves the binary search bounds in insertstate. We use them |
441 | | * in the fastpath below, but also in the _bt_findinsertloc() call later. |
442 | | */ |
443 | 0 | Assert(!insertstate->bounds_valid); |
444 | 0 | offset = _bt_binsrch_insert(rel, insertstate); |
445 | | |
446 | | /* |
447 | | * Scan over all equal tuples, looking for live conflicts. |
448 | | */ |
449 | 0 | Assert(!insertstate->bounds_valid || insertstate->low == offset); |
450 | 0 | Assert(!itup_key->anynullkeys); |
451 | 0 | Assert(itup_key->scantid == NULL); |
452 | 0 | for (;;) |
453 | 0 | { |
454 | | /* |
455 | | * Each iteration of the loop processes one heap TID, not one index |
456 | | * tuple. Current offset number for page isn't usually advanced on |
457 | | * iterations that process heap TIDs from posting list tuples. |
458 | | * |
459 | | * "inposting" state is set when _inside_ a posting list --- not when |
460 | | * we're at the start (or end) of a posting list. We advance curposti |
461 | | * at the end of the iteration when inside a posting list tuple. In |
462 | | * general, every loop iteration either advances the page offset or |
463 | | * advances curposti --- an iteration that handles the rightmost/max |
464 | | * heap TID in a posting list finally advances the page offset (and |
465 | | * unsets "inposting"). |
466 | | * |
467 | | * Make sure the offset points to an actual index tuple before trying |
468 | | * to examine it... |
469 | | */ |
470 | 0 | if (offset <= maxoff) |
471 | 0 | { |
472 | | /* |
473 | | * Fastpath: In most cases, we can use cached search bounds to |
474 | | * limit our consideration to items that are definitely |
475 | | * duplicates. This fastpath doesn't apply when the original page |
476 | | * is empty, or when initial offset is past the end of the |
477 | | * original page, which may indicate that we need to examine a |
478 | | * second or subsequent page. |
479 | | * |
480 | | * Note that this optimization allows us to avoid calling |
481 | | * _bt_compare() directly when there are no duplicates, as long as |
482 | | * the offset where the key will go is not at the end of the page. |
483 | | */ |
484 | 0 | if (nbuf == InvalidBuffer && offset == insertstate->stricthigh) |
485 | 0 | { |
486 | 0 | Assert(insertstate->bounds_valid); |
487 | 0 | Assert(insertstate->low >= P_FIRSTDATAKEY(opaque)); |
488 | 0 | Assert(insertstate->low <= insertstate->stricthigh); |
489 | 0 | Assert(_bt_compare(rel, itup_key, page, offset) < 0); |
490 | 0 | break; |
491 | 0 | } |
492 | | |
493 | | /* |
494 | | * We can skip items that are already marked killed. |
495 | | * |
496 | | * In the presence of heavy update activity an index may contain |
497 | | * many killed items with the same key; running _bt_compare() on |
498 | | * each killed item gets expensive. Just advance over killed |
499 | | * items as quickly as we can. We only apply _bt_compare() when |
500 | | * we get to a non-killed item. We could reuse the bounds to |
501 | | * avoid _bt_compare() calls for known equal tuples, but it |
502 | | * doesn't seem worth it. |
503 | | */ |
504 | 0 | if (!inposting) |
505 | 0 | curitemid = PageGetItemId(page, offset); |
506 | 0 | if (inposting || !ItemIdIsDead(curitemid)) |
507 | 0 | { |
508 | 0 | ItemPointerData htid; |
509 | 0 | bool all_dead = false; |
510 | |
511 | 0 | if (!inposting) |
512 | 0 | { |
513 | | /* Plain tuple, or first TID in posting list tuple */ |
514 | 0 | if (_bt_compare(rel, itup_key, page, offset) != 0) |
515 | 0 | break; /* we're past all the equal tuples */ |
516 | | |
517 | | /* Advanced curitup */ |
518 | 0 | curitup = (IndexTuple) PageGetItem(page, curitemid); |
519 | 0 | Assert(!BTreeTupleIsPivot(curitup)); |
520 | 0 | } |
521 | | |
522 | | /* okay, we gotta fetch the heap tuple using htid ... */ |
523 | 0 | if (!BTreeTupleIsPosting(curitup)) |
524 | 0 | { |
525 | | /* ... htid is from simple non-pivot tuple */ |
526 | 0 | Assert(!inposting); |
527 | 0 | htid = curitup->t_tid; |
528 | 0 | } |
529 | 0 | else if (!inposting) |
530 | 0 | { |
531 | | /* ... htid is first TID in new posting list */ |
532 | 0 | inposting = true; |
533 | 0 | prevalldead = true; |
534 | 0 | curposti = 0; |
535 | 0 | htid = *BTreeTupleGetPostingN(curitup, 0); |
536 | 0 | } |
537 | 0 | else |
538 | 0 | { |
539 | | /* ... htid is second or subsequent TID in posting list */ |
540 | 0 | Assert(curposti > 0); |
541 | 0 | htid = *BTreeTupleGetPostingN(curitup, curposti); |
542 | 0 | } |
543 | | |
544 | | /* |
545 | | * If we are doing a recheck, we expect to find the tuple we |
546 | | * are rechecking. It's not a duplicate, but we have to keep |
547 | | * scanning. |
548 | | */ |
549 | 0 | if (checkUnique == UNIQUE_CHECK_EXISTING && |
550 | 0 | ItemPointerCompare(&htid, &itup->t_tid) == 0) |
551 | 0 | { |
552 | 0 | found = true; |
553 | 0 | } |
554 | | |
555 | | /* |
556 | | * Check if there's any table tuples for this index entry |
557 | | * satisfying SnapshotDirty. This is necessary because for AMs |
558 | | * with optimizations like heap's HOT, we have just a single |
559 | | * index entry for the entire chain. |
560 | | */ |
561 | 0 | else if (table_index_fetch_tuple_check(heapRel, &htid, |
562 | 0 | &SnapshotDirty, |
563 | 0 | &all_dead)) |
564 | 0 | { |
565 | 0 | TransactionId xwait; |
566 | |
568 | | * It is a duplicate. If we are only doing a partial |
569 | | * check, then don't bother checking if the tuple is being |
570 | | * updated in another transaction. Just return the fact |
571 | | * that it is a potential conflict and leave the full |
572 | | * check till later. Don't invalidate binary search |
573 | | * bounds. |
574 | | */ |
575 | 0 | if (checkUnique == UNIQUE_CHECK_PARTIAL) |
576 | 0 | { |
577 | 0 | if (nbuf != InvalidBuffer) |
578 | 0 | _bt_relbuf(rel, nbuf); |
579 | 0 | *is_unique = false; |
580 | 0 | return InvalidTransactionId; |
581 | 0 | } |
582 | | |
583 | | /* |
584 | | * If this tuple is being updated by other transaction |
585 | | * then we have to wait for its commit/abort. |
586 | | */ |
587 | 0 | xwait = (TransactionIdIsValid(SnapshotDirty.xmin)) ? |
588 | 0 | SnapshotDirty.xmin : SnapshotDirty.xmax; |
589 | |
590 | 0 | if (TransactionIdIsValid(xwait)) |
591 | 0 | { |
592 | 0 | if (nbuf != InvalidBuffer) |
593 | 0 | _bt_relbuf(rel, nbuf); |
594 | | /* Tell _bt_doinsert to wait... */ |
595 | 0 | *speculativeToken = SnapshotDirty.speculativeToken; |
596 | | /* Caller releases lock on buf immediately */ |
597 | 0 | insertstate->bounds_valid = false; |
598 | 0 | return xwait; |
599 | 0 | } |
600 | | |
601 | | /* |
602 | | * Otherwise we have a definite conflict. But before |
603 | | * complaining, look to see if the tuple we want to insert |
604 | | * is itself now committed dead --- if so, don't complain. |
605 | | * This is a waste of time in normal scenarios but we must |
606 | | * do it to support CREATE INDEX CONCURRENTLY. |
607 | | * |
608 | | * We must follow HOT-chains here because during |
609 | | * concurrent index build, we insert the root TID though |
610 | | * the actual tuple may be somewhere in the HOT-chain. |
611 | | * While following the chain we might not stop at the |
612 | | * exact tuple which triggered the insert, but that's OK |
613 | | * because if we find a live tuple anywhere in this chain, |
614 | | * we have a unique key conflict. The other live tuple is |
615 | | * not part of this chain because it had a different index |
616 | | * entry. |
617 | | */ |
618 | 0 | htid = itup->t_tid; |
619 | 0 | if (table_index_fetch_tuple_check(heapRel, &htid, |
620 | 0 | SnapshotSelf, NULL)) |
621 | 0 | { |
622 | | /* Normal case --- it's still live */ |
623 | 0 | } |
624 | 0 | else |
625 | 0 | { |
626 | | /* |
627 | | * It's been deleted, so no error, and no need to |
628 | | * continue searching |
629 | | */ |
630 | 0 | break; |
631 | 0 | } |
632 | | |
633 | | /* |
634 | | * Check for a conflict-in as we would if we were going to |
635 | | * write to this page. We aren't actually going to write, |
636 | | * but we want a chance to report SSI conflicts that would |
637 | | * otherwise be masked by this unique constraint |
638 | | * violation. |
639 | | */ |
640 | 0 | CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate->buf)); |
641 | | |
642 | | /* |
643 | | * This is a definite conflict. Break the tuple down into |
644 | | * datums and report the error. But first, make sure we |
645 | | * release the buffer locks we're holding --- |
646 | | * BuildIndexValueDescription could make catalog accesses, |
647 | | * which in the worst case might touch this same index and |
648 | | * cause deadlocks. |
649 | | */ |
650 | 0 | if (nbuf != InvalidBuffer) |
651 | 0 | _bt_relbuf(rel, nbuf); |
652 | 0 | _bt_relbuf(rel, insertstate->buf); |
653 | 0 | insertstate->buf = InvalidBuffer; |
654 | 0 | insertstate->bounds_valid = false; |
655 | |
656 | 0 | { |
657 | 0 | Datum values[INDEX_MAX_KEYS]; |
658 | 0 | bool isnull[INDEX_MAX_KEYS]; |
659 | 0 | char *key_desc; |
660 | |
661 | 0 | index_deform_tuple(itup, RelationGetDescr(rel), |
662 | 0 | values, isnull); |
663 | |
664 | 0 | key_desc = BuildIndexValueDescription(rel, values, |
665 | 0 | isnull); |
666 | |
667 | 0 | ereport(ERROR, |
668 | 0 | (errcode(ERRCODE_UNIQUE_VIOLATION), |
669 | 0 | errmsg("duplicate key value violates unique constraint \"%s\"", |
670 | 0 | RelationGetRelationName(rel)), |
671 | 0 | key_desc ? errdetail("Key %s already exists.", |
672 | 0 | key_desc) : 0, |
673 | 0 | errtableconstraint(heapRel, |
674 | 0 | RelationGetRelationName(rel)))); |
675 | 0 | } |
676 | 0 | } |
677 | 0 | else if (all_dead && (!inposting || |
678 | 0 | (prevalldead && |
679 | 0 | curposti == BTreeTupleGetNPosting(curitup) - 1))) |
680 | 0 | { |
681 | | /* |
682 | | * The conflicting tuple (or all HOT chains pointed to by |
683 | | * all posting list TIDs) is dead to everyone, so mark the |
684 | | * index entry killed. |
685 | | */ |
686 | 0 | ItemIdMarkDead(curitemid); |
687 | 0 | opaque->btpo_flags |= BTP_HAS_GARBAGE; |
688 | | |
689 | | /* |
690 | | * Mark buffer with a dirty hint, since state is not |
691 | | * crucial. Be sure to mark the proper buffer dirty. |
692 | | */ |
693 | 0 | if (nbuf != InvalidBuffer) |
694 | 0 | MarkBufferDirtyHint(nbuf, true); |
695 | 0 | else |
696 | 0 | MarkBufferDirtyHint(insertstate->buf, true); |
697 | 0 | } |
698 | | |
699 | | /* |
700 | | * Remember if posting list tuple has even a single HOT chain |
701 | | * whose members are not all dead |
702 | | */ |
703 | 0 | if (!all_dead && inposting) |
704 | 0 | prevalldead = false; |
705 | 0 | } |
706 | 0 | } |
707 | | |
708 | 0 | if (inposting && curposti < BTreeTupleGetNPosting(curitup) - 1) |
709 | 0 | { |
710 | | /* Advance to next TID in same posting list */ |
711 | 0 | curposti++; |
712 | 0 | continue; |
713 | 0 | } |
714 | 0 | else if (offset < maxoff) |
715 | 0 | { |
716 | | /* Advance to next tuple */ |
717 | 0 | curposti = 0; |
718 | 0 | inposting = false; |
719 | 0 | offset = OffsetNumberNext(offset); |
720 | 0 | } |
721 | 0 | else |
722 | 0 | { |
723 | 0 | int highkeycmp; |
724 | | |
725 | | /* If scankey == hikey we gotta check the next page too */ |
726 | 0 | if (P_RIGHTMOST(opaque)) |
727 | 0 | break; |
728 | 0 | highkeycmp = _bt_compare(rel, itup_key, page, P_HIKEY); |
729 | 0 | Assert(highkeycmp <= 0); |
730 | 0 | if (highkeycmp != 0) |
731 | 0 | break; |
732 | | /* Advance to next non-dead page --- there must be one */ |
733 | 0 | for (;;) |
734 | 0 | { |
735 | 0 | BlockNumber nblkno = opaque->btpo_next; |
736 | |
737 | 0 | nbuf = _bt_relandgetbuf(rel, nbuf, nblkno, BT_READ); |
738 | 0 | page = BufferGetPage(nbuf); |
739 | 0 | opaque = BTPageGetOpaque(page); |
740 | 0 | if (!P_IGNORE(opaque)) |
741 | 0 | break; |
742 | 0 | if (P_RIGHTMOST(opaque)) |
743 | 0 | elog(ERROR, "fell off the end of index \"%s\"", |
744 | 0 | RelationGetRelationName(rel)); |
745 | 0 | } |
746 | | /* Will also advance to next tuple */ |
747 | 0 | curposti = 0; |
748 | 0 | inposting = false; |
749 | 0 | maxoff = PageGetMaxOffsetNumber(page); |
750 | 0 | offset = P_FIRSTDATAKEY(opaque); |
751 | | /* Don't invalidate binary search bounds */ |
752 | 0 | } |
753 | 0 | } |
754 | | |
755 | | /* |
756 | | * If we are doing a recheck then we should have found the tuple we are |
757 | | * checking. Otherwise there's something very wrong --- probably, the |
758 | | * index is on a non-immutable expression. |
759 | | */ |
760 | 0 | if (checkUnique == UNIQUE_CHECK_EXISTING && !found) |
761 | 0 | ereport(ERROR, |
762 | 0 | (errcode(ERRCODE_INTERNAL_ERROR), |
763 | 0 | errmsg("failed to re-find tuple within index \"%s\"", |
764 | 0 | RelationGetRelationName(rel)), |
765 | 0 | errhint("This may be because of a non-immutable index expression."), |
766 | 0 | errtableconstraint(heapRel, |
767 | 0 | RelationGetRelationName(rel)))); |
768 | | |
769 | 0 | if (nbuf != InvalidBuffer) |
770 | 0 | _bt_relbuf(rel, nbuf); |
771 | |
772 | 0 | return InvalidTransactionId; |
773 | 0 | } |
774 | | |
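One detail of _bt_check_unique() that is easy to miss: the dirty snapshot is what turns "a duplicate exists" into "whose transaction do we wait for". A compact restatement of that decision, for illustration only (the names mirror the real code above; the fragment is not a replacement for it):

/* Illustration of the xwait decision inside _bt_check_unique() */
xwait = TransactionIdIsValid(SnapshotDirty.xmin) ?
    SnapshotDirty.xmin :    /* duplicate inserted by an in-progress xact */
    SnapshotDirty.xmax;     /* or deleted by an in-progress xact */

if (TransactionIdIsValid(xwait))
{
    /* caller (_bt_doinsert) waits, possibly via SpeculativeInsertionWait() */
}
else
{
    /* committed live duplicate: unique violation, unless our own tuple has
     * already died (the CREATE INDEX CONCURRENTLY recheck case) */
}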
775 | | |
776 | | /* |
777 | | * _bt_findinsertloc() -- Finds an insert location for a tuple |
778 | | * |
779 | | * On entry, insertstate buffer contains the page the new tuple belongs |
780 | | * on. It is exclusive-locked and pinned by the caller. |
781 | | * |
782 | | * If 'checkingunique' is true, the buffer on entry is the first page |
783 | | * that contains duplicates of the new key. If there are duplicates on |
784 | | * multiple pages, the correct insertion position might be some page to |
785 | | * the right, rather than the first page. In that case, this function |
786 | | * moves right to the correct target page. |
787 | | * |
788 | | * (In a !heapkeyspace index, there can be multiple pages with the same |
789 | | * high key, where the new tuple could legitimately be placed on. In |
790 | | * that case, the caller passes the first page containing duplicates, |
791 | | * just like when checkingunique=true. If that page doesn't have enough |
792 | | * room for the new tuple, this function moves right, trying to find a |
793 | | * legal page that does.) |
794 | | * |
795 | | * If 'indexUnchanged' is true, this is for an UPDATE that didn't |
796 | | * logically change the indexed value, but must nevertheless have a new |
797 | | * entry to point to a successor version. This hint from the executor |
798 | | * will influence our behavior when the page might have to be split and |
799 | | * we must consider our options. Bottom-up index deletion can avoid |
800 | | * pathological version-driven page splits, but we only want to go to the |
801 | | * trouble of trying it when we already have moderate confidence that |
802 | | * it's appropriate. The hint should not significantly affect our |
803 | | * behavior over time unless practically all inserts on to the leaf page |
804 | | * get the hint. |
805 | | * |
806 | | * On exit, insertstate buffer contains the chosen insertion page, and |
807 | | * the offset within that page is returned. If _bt_findinsertloc needed |
808 | | * to move right, the lock and pin on the original page are released, and |
809 | | * the new buffer is exclusively locked and pinned instead. |
810 | | * |
811 | | * If insertstate contains cached binary search bounds, we will take |
812 | | * advantage of them. This avoids repeating comparisons that we made in |
813 | | * _bt_check_unique() already. |
814 | | */ |
815 | | static OffsetNumber |
816 | | _bt_findinsertloc(Relation rel, |
817 | | BTInsertState insertstate, |
818 | | bool checkingunique, |
819 | | bool indexUnchanged, |
820 | | BTStack stack, |
821 | | Relation heapRel) |
822 | 0 | { |
823 | 0 | BTScanInsert itup_key = insertstate->itup_key; |
824 | 0 | Page page = BufferGetPage(insertstate->buf); |
825 | 0 | BTPageOpaque opaque; |
826 | 0 | OffsetNumber newitemoff; |
827 | |
828 | 0 | opaque = BTPageGetOpaque(page); |
829 | | |
830 | | /* Check 1/3 of a page restriction */ |
831 | 0 | if (unlikely(insertstate->itemsz > BTMaxItemSize)) |
832 | 0 | _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page, |
833 | 0 | insertstate->itup); |
834 | |
835 | 0 | Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque)); |
836 | 0 | Assert(!insertstate->bounds_valid || checkingunique); |
837 | 0 | Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL); |
838 | 0 | Assert(itup_key->heapkeyspace || itup_key->scantid == NULL); |
839 | 0 | Assert(!itup_key->allequalimage || itup_key->heapkeyspace); |
840 | |
841 | 0 | if (itup_key->heapkeyspace) |
842 | 0 | { |
843 | | /* Keep track of whether checkingunique duplicate seen */ |
844 | 0 | bool uniquedup = indexUnchanged; |
845 | | |
846 | | /* |
847 | | * If we're inserting into a unique index, we may have to walk right |
848 | | * through leaf pages to find the one leaf page that we must insert on |
849 | | * to. |
850 | | * |
851 | | * This is needed for checkingunique callers because a scantid was not |
852 | | * used when we called _bt_search(). scantid can only be set after |
853 | | * _bt_check_unique() has checked for duplicates. The buffer |
854 | | * initially stored in insertstate->buf has the page where the first |
855 | | * duplicate key might be found, which isn't always the page that new |
856 | | * tuple belongs on. The heap TID attribute for new tuple (scantid) |
857 | | * could force us to insert on a sibling page, though that should be |
858 | | * very rare in practice. |
859 | | */ |
860 | 0 | if (checkingunique) |
861 | 0 | { |
862 | 0 | if (insertstate->low < insertstate->stricthigh) |
863 | 0 | { |
864 | | /* Encountered a duplicate in _bt_check_unique() */ |
865 | 0 | Assert(insertstate->bounds_valid); |
866 | 0 | uniquedup = true; |
867 | 0 | } |
868 | |
869 | 0 | for (;;) |
870 | 0 | { |
871 | | /* |
872 | | * Does the new tuple belong on this page? |
873 | | * |
874 | | * The earlier _bt_check_unique() call may well have |
875 | | * established a strict upper bound on the offset for the new |
876 | | * item. If it's not the last item of the page (i.e. if there |
877 | | * is at least one tuple on the page that goes after the tuple |
878 | | * we're inserting) then we know that the tuple belongs on |
879 | | * this page. We can skip the high key check. |
880 | | */ |
881 | 0 | if (insertstate->bounds_valid && |
882 | 0 | insertstate->low <= insertstate->stricthigh && |
883 | 0 | insertstate->stricthigh <= PageGetMaxOffsetNumber(page)) |
884 | 0 | break; |
885 | | |
886 | | /* Test '<=', not '!=', since scantid is set now */ |
887 | 0 | if (P_RIGHTMOST(opaque) || |
888 | 0 | _bt_compare(rel, itup_key, page, P_HIKEY) <= 0) |
889 | 0 | break; |
890 | | |
891 | 0 | _bt_stepright(rel, heapRel, insertstate, stack); |
892 | | /* Update local state after stepping right */ |
893 | 0 | page = BufferGetPage(insertstate->buf); |
894 | 0 | opaque = BTPageGetOpaque(page); |
895 | | /* Assume duplicates (if checkingunique) */ |
896 | 0 | uniquedup = true; |
897 | 0 | } |
898 | 0 | } |
899 | | |
900 | | /* |
901 | | * If the target page cannot fit newitem, try to avoid splitting the |
902 | | * page on insert by performing deletion or deduplication now |
903 | | */ |
904 | 0 | if (PageGetFreeSpace(page) < insertstate->itemsz) |
905 | 0 | _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false, |
906 | 0 | checkingunique, uniquedup, |
907 | 0 | indexUnchanged); |
908 | 0 | } |
909 | 0 | else |
910 | 0 | { |
911 | | /*---------- |
912 | | * This is a !heapkeyspace (version 2 or 3) index. The current page |
913 | | * is the first page that we could insert the new tuple to, but there |
914 | | * may be other pages to the right that we could opt to use instead. |
915 | | * |
916 | | * If the new key is equal to one or more existing keys, we can |
917 | | * legitimately place it anywhere in the series of equal keys. In |
918 | | * fact, if the new key is equal to the page's "high key" we can place |
919 | | * it on the next page. If it is equal to the high key, and there's |
920 | | * not room to insert the new tuple on the current page without |
921 | | * splitting, then we move right hoping to find more free space and |
922 | | * avoid a split. |
923 | | * |
924 | | * Keep scanning right until we |
925 | | * (a) find a page with enough free space, |
926 | | * (b) reach the last page where the tuple can legally go, or |
927 | | * (c) get tired of searching. |
928 | | * (c) is not flippant; it is important because if there are many |
929 | | * pages' worth of equal keys, it's better to split one of the early |
930 | | * pages than to scan all the way to the end of the run of equal keys |
931 | | * on every insert. We implement "get tired" as a random choice, |
932 | | * since stopping after scanning a fixed number of pages wouldn't work |
933 | | * well (we'd never reach the right-hand side of previously split |
934 | | * pages). The probability of moving right is set at 0.99, which may |
935 | | * seem too high to change the behavior much, but it does an excellent |
936 | | * job of preventing O(N^2) behavior with many equal keys. |
937 | | *---------- |
938 | | */ |
939 | 0 | while (PageGetFreeSpace(page) < insertstate->itemsz) |
940 | 0 | { |
941 | | /* |
942 | | * Before considering moving right, see if we can obtain enough |
943 | | * space by erasing LP_DEAD items |
944 | | */ |
945 | 0 | if (P_HAS_GARBAGE(opaque)) |
946 | 0 | { |
947 | | /* Perform simple deletion */ |
948 | 0 | _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true, |
949 | 0 | false, false, false); |
950 | |
951 | 0 | if (PageGetFreeSpace(page) >= insertstate->itemsz) |
952 | 0 | break; /* OK, now we have enough space */ |
953 | 0 | } |
954 | | |
955 | | /* |
956 | | * Nope, so check conditions (b) and (c) enumerated above |
957 | | * |
958 | | * The earlier _bt_check_unique() call may well have established a |
959 | | * strict upper bound on the offset for the new item. If it's not |
960 | | * the last item of the page (i.e. if there is at least one tuple |
961 | | * on the page that's greater than the tuple we're inserting to) |
962 | | * then we know that the tuple belongs on this page. We can skip |
963 | | * the high key check. |
964 | | */ |
965 | 0 | if (insertstate->bounds_valid && |
966 | 0 | insertstate->low <= insertstate->stricthigh && |
967 | 0 | insertstate->stricthigh <= PageGetMaxOffsetNumber(page)) |
968 | 0 | break; |
969 | | |
970 | 0 | if (P_RIGHTMOST(opaque) || |
971 | 0 | _bt_compare(rel, itup_key, page, P_HIKEY) != 0 || |
972 | 0 | pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100)) |
973 | 0 | break; |
974 | | |
975 | 0 | _bt_stepright(rel, heapRel, insertstate, stack); |
976 | | /* Update local state after stepping right */ |
977 | 0 | page = BufferGetPage(insertstate->buf); |
978 | 0 | opaque = BTPageGetOpaque(page); |
979 | 0 | } |
980 | 0 | } |
981 | | |
982 | | /* |
983 | | * We should now be on the correct page. Find the offset within the page |
984 | | * for the new tuple. (Possibly reusing earlier search bounds.) |
985 | | */ |
986 | 0 | Assert(P_RIGHTMOST(opaque) || |
987 | 0 | _bt_compare(rel, itup_key, page, P_HIKEY) <= 0); |
988 | |
989 | 0 | newitemoff = _bt_binsrch_insert(rel, insertstate); |
990 | |
991 | 0 | if (insertstate->postingoff == -1) |
992 | 0 | { |
993 | | /* |
994 | | * There is an overlapping posting list tuple with its LP_DEAD bit |
995 | | * set. We don't want to unnecessarily unset its LP_DEAD bit while |
996 | | * performing a posting list split, so perform simple index tuple |
997 | | * deletion early. |
998 | | */ |
999 | 0 | _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true, |
1000 | 0 | false, false, false); |
1001 | | |
1002 | | /* |
1003 | | * Do new binary search. New insert location cannot overlap with any |
1004 | | * posting list now. |
1005 | | */ |
1006 | 0 | Assert(!insertstate->bounds_valid); |
1007 | 0 | insertstate->postingoff = 0; |
1008 | 0 | newitemoff = _bt_binsrch_insert(rel, insertstate); |
1009 | 0 | Assert(insertstate->postingoff == 0); |
1010 | 0 | } |
1011 | |
1012 | 0 | return newitemoff; |
1013 | 0 | } |
1014 | | |
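A quick worked example of the "get tired" test used in the !heapkeyspace path of _bt_findinsertloc(): PG_UINT32_MAX / 100 is 42949672, so a uniform 32-bit draw lands at or below that threshold with probability of roughly 0.01. The loop therefore keeps moving right with probability of roughly 0.99 and abandons the walk after about 100 pages on average, which is what prevents a long run of equal keys from turning every insert into a full scan of that run. The snippet below simply isolates that test (give_up is a hypothetical name, not one used in this file):

/* Hypothetical standalone form of the "get tired" test */
bool        give_up;

give_up = pg_prng_uint32(&pg_global_prng_state) <= PG_UINT32_MAX / 100;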
1015 | | /* |
1016 | | * Step right to next non-dead page, during insertion. |
1017 | | * |
1018 | | * This is a bit more complicated than moving right in a search. We must |
1019 | | * write-lock the target page before releasing write lock on current page; |
1020 | | * else someone else's _bt_check_unique scan could fail to see our insertion. |
1021 | | * Write locks on intermediate dead pages won't do because we don't know when |
1022 | | * they will get de-linked from the tree. |
1023 | | * |
1024 | | * This is more aggressive than it needs to be for non-unique !heapkeyspace |
1025 | | * indexes. |
1026 | | */ |
1027 | | static void |
1028 | | _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate, |
1029 | | BTStack stack) |
1030 | 0 | { |
1031 | 0 | Page page; |
1032 | 0 | BTPageOpaque opaque; |
1033 | 0 | Buffer rbuf; |
1034 | 0 | BlockNumber rblkno; |
1035 | |
1036 | 0 | Assert(heaprel != NULL); |
1037 | 0 | page = BufferGetPage(insertstate->buf); |
1038 | 0 | opaque = BTPageGetOpaque(page); |
1039 | |
1040 | 0 | rbuf = InvalidBuffer; |
1041 | 0 | rblkno = opaque->btpo_next; |
1042 | 0 | for (;;) |
1043 | 0 | { |
1044 | 0 | rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE); |
1045 | 0 | page = BufferGetPage(rbuf); |
1046 | 0 | opaque = BTPageGetOpaque(page); |
1047 | | |
1048 | | /* |
1049 | | * If this page was incompletely split, finish the split now. We do |
1050 | | * this while holding a lock on the left sibling, which is not good |
1051 | | * because finishing the split could be a fairly lengthy operation. |
1052 | | * But this should happen very seldom. |
1053 | | */ |
1054 | 0 | if (P_INCOMPLETE_SPLIT(opaque)) |
1055 | 0 | { |
1056 | 0 | _bt_finish_split(rel, heaprel, rbuf, stack); |
1057 | 0 | rbuf = InvalidBuffer; |
1058 | 0 | continue; |
1059 | 0 | } |
1060 | | |
1061 | 0 | if (!P_IGNORE(opaque)) |
1062 | 0 | break; |
1063 | 0 | if (P_RIGHTMOST(opaque)) |
1064 | 0 | elog(ERROR, "fell off the end of index \"%s\"", |
1065 | 0 | RelationGetRelationName(rel)); |
1066 | | |
1067 | 0 | rblkno = opaque->btpo_next; |
1068 | 0 | } |
1069 | | /* rbuf locked; unlock buf, update state for caller */ |
1070 | 0 | _bt_relbuf(rel, insertstate->buf); |
1071 | 0 | insertstate->buf = rbuf; |
1072 | 0 | insertstate->bounds_valid = false; |
1073 | 0 | } |
1074 | | |
1075 | | /*---------- |
1076 | | * _bt_insertonpg() -- Insert a tuple on a particular page in the index. |
1077 | | * |
1078 | | * This recursive procedure does the following things: |
1079 | | * |
1080 | | * + if postingoff != 0, splits existing posting list tuple |
1081 | | * (since it overlaps with new 'itup' tuple). |
1082 | | * + if necessary, splits the target page, using 'itup_key' for |
1083 | | * suffix truncation on leaf pages (caller passes NULL for |
1084 | | * non-leaf pages). |
1085 | | * + inserts the new tuple (might be split from posting list). |
1086 | | * + if the page was split, pops the parent stack, and finds the |
1087 | | * right place to insert the new child pointer (by walking |
1088 | | * right using information stored in the parent stack). |
1089 | | * + invokes itself with the appropriate tuple for the right |
1090 | | * child page on the parent. |
1091 | | * + updates the metapage if a true root or fast root is split. |
1092 | | * |
1093 | | * On entry, we must have the correct buffer in which to do the |
1094 | | * insertion, and the buffer must be pinned and write-locked. On return, |
1095 | | * we will have dropped both the pin and the lock on the buffer. |
1096 | | * |
1097 | | * This routine only performs retail tuple insertions. 'itup' should |
1098 | | * always be either a non-highkey leaf item, or a downlink (new high |
1099 | | * key items are created indirectly, when a page is split). When |
1100 | | * inserting to a non-leaf page, 'cbuf' is the left-sibling of the page |
1101 | | * we're inserting the downlink for. This function will clear the |
1102 | | * INCOMPLETE_SPLIT flag on it, and release the buffer. |
1103 | | *---------- |
1104 | | */ |
1105 | | static void |
1106 | | _bt_insertonpg(Relation rel, |
1107 | | Relation heaprel, |
1108 | | BTScanInsert itup_key, |
1109 | | Buffer buf, |
1110 | | Buffer cbuf, |
1111 | | BTStack stack, |
1112 | | IndexTuple itup, |
1113 | | Size itemsz, |
1114 | | OffsetNumber newitemoff, |
1115 | | int postingoff, |
1116 | | bool split_only_page) |
1117 | 0 | { |
1118 | 0 | Page page; |
1119 | 0 | BTPageOpaque opaque; |
1120 | 0 | bool isleaf, |
1121 | 0 | isroot, |
1122 | 0 | isrightmost, |
1123 | 0 | isonly; |
1124 | 0 | IndexTuple oposting = NULL; |
1125 | 0 | IndexTuple origitup = NULL; |
1126 | 0 | IndexTuple nposting = NULL; |
1127 | |
1128 | 0 | page = BufferGetPage(buf); |
1129 | 0 | opaque = BTPageGetOpaque(page); |
1130 | 0 | isleaf = P_ISLEAF(opaque); |
1131 | 0 | isroot = P_ISROOT(opaque); |
1132 | 0 | isrightmost = P_RIGHTMOST(opaque); |
1133 | 0 | isonly = P_LEFTMOST(opaque) && P_RIGHTMOST(opaque); |
1134 | | |
1135 | | /* child buffer must be given iff inserting on an internal page */ |
1136 | 0 | Assert(isleaf == !BufferIsValid(cbuf)); |
1137 | | /* tuple must have appropriate number of attributes */ |
1138 | 0 | Assert(!isleaf || |
1139 | 0 | BTreeTupleGetNAtts(itup, rel) == |
1140 | 0 | IndexRelationGetNumberOfAttributes(rel)); |
1141 | 0 | Assert(isleaf || |
1142 | 0 | BTreeTupleGetNAtts(itup, rel) <= |
1143 | 0 | IndexRelationGetNumberOfKeyAttributes(rel)); |
1144 | 0 | Assert(!BTreeTupleIsPosting(itup)); |
1145 | 0 | Assert(MAXALIGN(IndexTupleSize(itup)) == itemsz); |
1146 | | /* Caller must always finish incomplete split for us */ |
1147 | 0 | Assert(!P_INCOMPLETE_SPLIT(opaque)); |
1148 | | |
1149 | | /* |
1150 | | * Every internal page should have exactly one negative infinity item at |
1151 | | * all times. Only _bt_split() and _bt_newlevel() should add items that |
1152 | | * become negative infinity items through truncation, since they're the |
1153 | | * only routines that allocate new internal pages. |
1154 | | */ |
1155 | 0 | Assert(isleaf || newitemoff > P_FIRSTDATAKEY(opaque)); |
1156 | | |
1157 | | /* |
1158 | | * Do we need to split an existing posting list item? |
1159 | | */ |
1160 | 0 | if (postingoff != 0) |
1161 | 0 | { |
1162 | 0 | ItemId itemid = PageGetItemId(page, newitemoff); |
1163 | | |
1164 | | /* |
1165 | | * The new tuple is a duplicate with a heap TID that falls inside the |
1166 | | * range of an existing posting list tuple on a leaf page. Prepare to |
1167 | | * split an existing posting list. Overwriting the posting list with |
1168 | | * its post-split version is treated as an extra step in either the |
1169 | | * insert or page split critical section. |
1170 | | */ |
1171 | 0 | Assert(isleaf && itup_key->heapkeyspace && itup_key->allequalimage); |
1172 | 0 | oposting = (IndexTuple) PageGetItem(page, itemid); |
1173 | | |
1174 | | /* |
1175 | | * postingoff value comes from earlier call to _bt_binsrch_posting(). |
1176 | | * Its binary search might think that a plain tuple must be a posting |
1177 | | * list tuple that needs to be split. This can happen with corruption |
1178 | | * involving an existing plain tuple that is a duplicate of the new |
1179 | | * item, up to and including its table TID. Check for that here in |
1180 | | * passing. |
1181 | | * |
1182 | | * Also verify that our caller has made sure that the existing posting |
1183 | | * list tuple does not have its LP_DEAD bit set. |
1184 | | */ |
1185 | 0 | if (!BTreeTupleIsPosting(oposting) || ItemIdIsDead(itemid)) |
1186 | 0 | ereport(ERROR, |
1187 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
1188 | 0 | errmsg_internal("table tid from new index tuple (%u,%u) overlaps with invalid duplicate tuple at offset %u of block %u in index \"%s\"", |
1189 | 0 | ItemPointerGetBlockNumber(&itup->t_tid), |
1190 | 0 | ItemPointerGetOffsetNumber(&itup->t_tid), |
1191 | 0 | newitemoff, BufferGetBlockNumber(buf), |
1192 | 0 | RelationGetRelationName(rel)))); |
1193 | | |
1194 | | /* use a mutable copy of itup as our itup from here on */ |
1195 | 0 | origitup = itup; |
1196 | 0 | itup = CopyIndexTuple(origitup); |
1197 | 0 | nposting = _bt_swap_posting(itup, oposting, postingoff); |
1198 | | /* itup now contains rightmost/max TID from oposting */ |
1199 | | |
1200 | | /* Alter offset so that newitem goes after posting list */ |
1201 | 0 | newitemoff = OffsetNumberNext(newitemoff); |
1202 | 0 | } |
1203 | | |
1204 | | /* |
1205 | | * Do we need to split the page to fit the item on it? |
1206 | | * |
1207 | | * Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, |
1208 | | * so this comparison is correct even though we appear to be accounting |
1209 | | * only for the item and not for its line pointer. |
1210 | | */ |
1211 | 0 | if (PageGetFreeSpace(page) < itemsz) |
1212 | 0 | { |
1213 | 0 | Buffer rbuf; |
1214 | |
1215 | 0 | Assert(!split_only_page); |
1216 | | |
1217 | | /* split the buffer into left and right halves */ |
1218 | 0 | rbuf = _bt_split(rel, heaprel, itup_key, buf, cbuf, newitemoff, itemsz, |
1219 | 0 | itup, origitup, nposting, postingoff); |
1220 | 0 | PredicateLockPageSplit(rel, |
1221 | 0 | BufferGetBlockNumber(buf), |
1222 | 0 | BufferGetBlockNumber(rbuf)); |
1223 | | |
1224 | | /*---------- |
1225 | | * By here, |
1226 | | * |
1227 | | * + our target page has been split; |
1228 | | * + the original tuple has been inserted; |
1229 | | * + we have write locks on both the old (left half) |
1230 | | * and new (right half) buffers, after the split; and |
1231 | | * + we know the key we want to insert into the parent |
1232 | | * (it's the "high key" on the left child page). |
1233 | | * |
1234 | | * We're ready to do the parent insertion. We need to hold onto the |
1235 | | * locks for the child pages until we locate the parent, but we can |
1236 | | * at least release the lock on the right child before doing the |
1237 | | * actual insertion. The lock on the left child will be released |
1238 | | * last of all by parent insertion, where it is the 'cbuf' of parent |
1239 | | * page. |
1240 | | *---------- |
1241 | | */ |
1242 | 0 | _bt_insert_parent(rel, heaprel, buf, rbuf, stack, isroot, isonly); |
1243 | 0 | } |
1244 | 0 | else |
1245 | 0 | { |
1246 | 0 | Buffer metabuf = InvalidBuffer; |
1247 | 0 | Page metapg = NULL; |
1248 | 0 | BTMetaPageData *metad = NULL; |
1249 | 0 | BlockNumber blockcache; |
1250 | | |
1251 | | /* |
1252 | | * If we are doing this insert because we split a page that was the |
1253 | | * only one on its tree level, but was not the root, it may have been |
1254 | | * the "fast root". We need to ensure that the fast root link points |
1255 | | * at or above the current page. We can safely acquire a lock on the |
1256 | | * metapage here --- see comments for _bt_newlevel(). |
1257 | | */ |
1258 | 0 | if (unlikely(split_only_page)) |
1259 | 0 | { |
1260 | 0 | Assert(!isleaf); |
1261 | 0 | Assert(BufferIsValid(cbuf)); |
1262 | |
1263 | 0 | metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); |
1264 | 0 | metapg = BufferGetPage(metabuf); |
1265 | 0 | metad = BTPageGetMeta(metapg); |
1266 | |
1267 | 0 | if (metad->btm_fastlevel >= opaque->btpo_level) |
1268 | 0 | { |
1269 | | /* no update wanted */ |
1270 | 0 | _bt_relbuf(rel, metabuf); |
1271 | 0 | metabuf = InvalidBuffer; |
1272 | 0 | } |
1273 | 0 | } |
1274 | | |
1275 | | /* Do the update. No ereport(ERROR) until changes are logged */ |
1276 | 0 | START_CRIT_SECTION(); |
1277 | |
1278 | 0 | if (postingoff != 0) |
1279 | 0 | memcpy(oposting, nposting, MAXALIGN(IndexTupleSize(nposting))); |
1280 | |
1281 | 0 | if (PageAddItem(page, (Item) itup, itemsz, newitemoff, false, |
1282 | 0 | false) == InvalidOffsetNumber) |
1283 | 0 | elog(PANIC, "failed to add new item to block %u in index \"%s\"", |
1284 | 0 | BufferGetBlockNumber(buf), RelationGetRelationName(rel)); |
1285 | | |
1286 | 0 | MarkBufferDirty(buf); |
1287 | |
1288 | 0 | if (BufferIsValid(metabuf)) |
1289 | 0 | { |
1290 | | /* upgrade meta-page if needed */ |
1291 | 0 | if (metad->btm_version < BTREE_NOVAC_VERSION) |
1292 | 0 | _bt_upgrademetapage(metapg); |
1293 | 0 | metad->btm_fastroot = BufferGetBlockNumber(buf); |
1294 | 0 | metad->btm_fastlevel = opaque->btpo_level; |
1295 | 0 | MarkBufferDirty(metabuf); |
1296 | 0 | } |
1297 | | |
1298 | | /* |
1299 | | * Clear INCOMPLETE_SPLIT flag on child if inserting the new item |
1300 | | * finishes a split |
1301 | | */ |
1302 | 0 | if (!isleaf) |
1303 | 0 | { |
1304 | 0 | Page cpage = BufferGetPage(cbuf); |
1305 | 0 | BTPageOpaque cpageop = BTPageGetOpaque(cpage); |
1306 | |
|
1307 | 0 | Assert(P_INCOMPLETE_SPLIT(cpageop)); |
1308 | 0 | cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; |
1309 | 0 | MarkBufferDirty(cbuf); |
1310 | 0 | } |
1311 | | |
1312 | | /* XLOG stuff */ |
1313 | 0 | if (RelationNeedsWAL(rel)) |
1314 | 0 | { |
1315 | 0 | xl_btree_insert xlrec; |
1316 | 0 | xl_btree_metadata xlmeta; |
1317 | 0 | uint8 xlinfo; |
1318 | 0 | XLogRecPtr recptr; |
1319 | 0 | uint16 upostingoff; |
1320 | |
|
1321 | 0 | xlrec.offnum = newitemoff; |
1322 | |
|
1323 | 0 | XLogBeginInsert(); |
1324 | 0 | XLogRegisterData(&xlrec, SizeOfBtreeInsert); |
1325 | |
|
1326 | 0 | if (isleaf && postingoff == 0) |
1327 | 0 | { |
1328 | | /* Simple leaf insert */ |
1329 | 0 | xlinfo = XLOG_BTREE_INSERT_LEAF; |
1330 | 0 | } |
1331 | 0 | else if (postingoff != 0) |
1332 | 0 | { |
1333 | | /* |
1334 | | * Leaf insert with posting list split. Must include |
1335 | | * postingoff field before newitem/orignewitem. |
1336 | | */ |
1337 | 0 | Assert(isleaf); |
1338 | 0 | xlinfo = XLOG_BTREE_INSERT_POST; |
1339 | 0 | } |
1340 | 0 | else |
1341 | 0 | { |
1342 | | /* Internal page insert, which finishes a split on cbuf */ |
1343 | 0 | xlinfo = XLOG_BTREE_INSERT_UPPER; |
1344 | 0 | XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD); |
1345 | |
|
1346 | 0 | if (BufferIsValid(metabuf)) |
1347 | 0 | { |
1348 | | /* Actually, it's an internal page insert + meta update */ |
1349 | 0 | xlinfo = XLOG_BTREE_INSERT_META; |
1350 | |
|
1351 | 0 | Assert(metad->btm_version >= BTREE_NOVAC_VERSION); |
1352 | 0 | xlmeta.version = metad->btm_version; |
1353 | 0 | xlmeta.root = metad->btm_root; |
1354 | 0 | xlmeta.level = metad->btm_level; |
1355 | 0 | xlmeta.fastroot = metad->btm_fastroot; |
1356 | 0 | xlmeta.fastlevel = metad->btm_fastlevel; |
1357 | 0 | xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; |
1358 | 0 | xlmeta.allequalimage = metad->btm_allequalimage; |
1359 | |
|
1360 | 0 | XLogRegisterBuffer(2, metabuf, |
1361 | 0 | REGBUF_WILL_INIT | REGBUF_STANDARD); |
1362 | 0 | XLogRegisterBufData(2, &xlmeta, |
1363 | 0 | sizeof(xl_btree_metadata)); |
1364 | 0 | } |
1365 | 0 | } |
1366 | |
|
1367 | 0 | XLogRegisterBuffer(0, buf, REGBUF_STANDARD); |
1368 | 0 | if (postingoff == 0) |
1369 | 0 | { |
1370 | | /* Just log itup from caller */ |
1371 | 0 | XLogRegisterBufData(0, itup, IndexTupleSize(itup)); |
1372 | 0 | } |
1373 | 0 | else |
1374 | 0 | { |
1375 | | /* |
1376 | | * Insert with posting list split (XLOG_BTREE_INSERT_POST |
1377 | | * record) case. |
1378 | | * |
1379 | | * Log postingoff. Also log origitup, not itup. REDO routine |
1380 | | * must reconstruct final itup (as well as nposting) using |
1381 | | * _bt_swap_posting(). |
1382 | | */ |
1383 | 0 | upostingoff = postingoff; |
1384 | |
|
1385 | 0 | XLogRegisterBufData(0, &upostingoff, sizeof(uint16)); |
1386 | 0 | XLogRegisterBufData(0, origitup, |
1387 | 0 | IndexTupleSize(origitup)); |
1388 | 0 | } |
1389 | |
|
1390 | 0 | recptr = XLogInsert(RM_BTREE_ID, xlinfo); |
1391 | |
|
1392 | 0 | if (BufferIsValid(metabuf)) |
1393 | 0 | PageSetLSN(metapg, recptr); |
1394 | 0 | if (!isleaf) |
1395 | 0 | PageSetLSN(BufferGetPage(cbuf), recptr); |
1396 | |
|
1397 | 0 | PageSetLSN(page, recptr); |
1398 | 0 | } |
1399 | |
|
1400 | 0 | END_CRIT_SECTION(); |
1401 | | |
1402 | | /* Release subsidiary buffers */ |
1403 | 0 | if (BufferIsValid(metabuf)) |
1404 | 0 | _bt_relbuf(rel, metabuf); |
1405 | 0 | if (!isleaf) |
1406 | 0 | _bt_relbuf(rel, cbuf); |
1407 | | |
1408 | | /* |
1409 | | * Cache the block number if this is the rightmost leaf page. Cache |
1410 | | * may be used by a future inserter within _bt_search_insert(). |
1411 | | */ |
1412 | 0 | blockcache = InvalidBlockNumber; |
1413 | 0 | if (isrightmost && isleaf && !isroot) |
1414 | 0 | blockcache = BufferGetBlockNumber(buf); |
1415 | | |
1416 | | /* Release buffer for insertion target block */ |
1417 | 0 | _bt_relbuf(rel, buf); |
1418 | | |
1419 | | /* |
1420 | | * If we decided to cache the insertion target block before releasing |
1421 | | * its buffer lock, then cache it now. Check the height of the tree |
1422 | | * first, though. We don't go for the optimization with small |
1423 | | * indexes. Defer final check to this point to ensure that we don't |
1424 | | * call _bt_getrootheight while holding a buffer lock. |
1425 | | */ |
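| | /* |
| | * A minimal sketch of the consumer side (based on _bt_search_insert(), |
| | * not shown here): a later insert may read this hint with |
| | * RelationGetTargetBlock(rel) and try that block first, falling back |
| | * to a normal root-to-leaf descent if the cached page is no longer a |
| | * rightmost leaf with enough free space for the incoming tuple. |
| | */ |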
1426 | 0 | if (BlockNumberIsValid(blockcache) && |
1427 | 0 | _bt_getrootheight(rel) >= BTREE_FASTPATH_MIN_LEVEL) |
1428 | 0 | RelationSetTargetBlock(rel, blockcache); |
1429 | 0 | } |
1430 | | |
1431 | | /* be tidy */ |
1432 | 0 | if (postingoff != 0) |
1433 | 0 | { |
1434 | | /* itup is actually a modified copy of caller's original */ |
1435 | 0 | pfree(nposting); |
1436 | 0 | pfree(itup); |
1437 | 0 | } |
1438 | 0 | } |
1439 | | |
1440 | | /* |
1441 | | * _bt_split() -- split a page in the btree. |
1442 | | * |
1443 | | * On entry, buf is the page to split, and is pinned and write-locked. |
1444 | | * newitemoff etc. tell us about the new item that must be inserted |
1445 | | * along with the data from the original page. |
1446 | | * |
1447 | | * itup_key is used for suffix truncation on leaf pages (internal |
1448 | | * page callers pass NULL). When splitting a non-leaf page, 'cbuf' |
1449 | | * is the left-sibling of the page we're inserting the downlink for. |
1450 | | * This function will clear the INCOMPLETE_SPLIT flag on it, and |
1451 | | * release the buffer. |
1452 | | * |
1453 | | * orignewitem, nposting, and postingoff are needed when an insert of |
1454 | | * orignewitem results in both a posting list split and a page split. |
1455 | | * These extra posting list split details are used here in the same |
1456 | | * way as they are used in the more common case where a posting list |
1457 | | * split does not coincide with a page split. We need to deal with |
1458 | | * posting list splits directly in order to ensure that everything |
1459 | | * that follows from the insert of orignewitem is handled as a single |
1460 | | * atomic operation (though caller's insert of a new pivot/downlink |
1461 | | * into parent page will still be a separate operation). See |
1462 | | * nbtree/README for details on the design of posting list splits. |
1463 | | * |
1464 | | * Returns the new right sibling of buf, pinned and write-locked. |
1465 | | * The pin and lock on buf are maintained. |
1466 | | */ |
1467 | | static Buffer |
1468 | | _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, |
1469 | | Buffer cbuf, OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem, |
1470 | | IndexTuple orignewitem, IndexTuple nposting, uint16 postingoff) |
1471 | 0 | { |
1472 | 0 | Buffer rbuf; |
1473 | 0 | Page origpage; |
1474 | 0 | Page leftpage, |
1475 | 0 | rightpage; |
1476 | 0 | PGAlignedBlock leftpage_buf, |
1477 | 0 | rightpage_buf; |
1478 | 0 | BlockNumber origpagenumber, |
1479 | 0 | rightpagenumber; |
1480 | 0 | BTPageOpaque ropaque, |
1481 | 0 | lopaque, |
1482 | 0 | oopaque; |
1483 | 0 | Buffer sbuf = InvalidBuffer; |
1484 | 0 | Page spage = NULL; |
1485 | 0 | BTPageOpaque sopaque = NULL; |
1486 | 0 | Size itemsz; |
1487 | 0 | ItemId itemid; |
1488 | 0 | IndexTuple firstright, |
1489 | 0 | lefthighkey; |
1490 | 0 | OffsetNumber firstrightoff; |
1491 | 0 | OffsetNumber afterleftoff, |
1492 | 0 | afterrightoff, |
1493 | 0 | minusinfoff; |
1494 | 0 | OffsetNumber origpagepostingoff; |
1495 | 0 | OffsetNumber maxoff; |
1496 | 0 | OffsetNumber i; |
1497 | 0 | bool newitemonleft, |
1498 | 0 | isleaf, |
1499 | 0 | isrightmost; |
1500 | | |
1501 | | /* |
1502 | | * origpage is the original page to be split. leftpage is a temporary |
1503 | | * buffer that receives the left-sibling data, which will be copied back |
1504 | | * into origpage on success. rightpage is the new page that will receive |
1505 | | * the right-sibling data. |
1506 | | * |
1507 | | * leftpage is allocated after choosing a split point. rightpage's new |
1508 | | * buffer isn't acquired until after leftpage is initialized and has new |
1509 | | * high key, the last point where splitting the page may fail (barring |
1510 | | * corruption). Failing before acquiring new buffer won't have lasting |
1511 | | * consequences, since origpage won't have been modified and leftpage is |
1512 | | * only workspace. |
1513 | | */ |
1514 | 0 | origpage = BufferGetPage(buf); |
1515 | 0 | oopaque = BTPageGetOpaque(origpage); |
1516 | 0 | isleaf = P_ISLEAF(oopaque); |
1517 | 0 | isrightmost = P_RIGHTMOST(oopaque); |
1518 | 0 | maxoff = PageGetMaxOffsetNumber(origpage); |
1519 | 0 | origpagenumber = BufferGetBlockNumber(buf); |
1520 | | |
1521 | | /* |
1522 | | * Choose a point to split origpage at. |
1523 | | * |
1524 | | * A split point can be thought of as a point _between_ two existing data |
1525 | | * items on origpage (the lastleft and firstright tuples), provided you |
1526 | | * pretend that the new item that didn't fit is already on origpage. |
1527 | | * |
1528 | | * Since origpage does not actually contain newitem, the representation of |
1529 | | * split points needs to work with two boundary cases: splits where |
1530 | | * newitem is lastleft, and splits where newitem is firstright. |
1531 | | * newitemonleft resolves the ambiguity that would otherwise exist when |
1532 | | * newitemoff == firstrightoff. In all other cases it's clear which side |
1533 | | * of the split every tuple goes on from context. newitemonleft is |
1534 | | * usually (but not always) redundant information. |
1535 | | * |
1536 | | * firstrightoff is supposed to be an origpage offset number, but it's |
1537 | | * possible that its value will be maxoff+1, which is "past the end" of |
1538 | | * origpage. This happens in the rare case where newitem goes after all |
1539 | | * existing items (i.e. newitemoff is maxoff+1) and we end up splitting |
1540 | | * origpage at the point that leaves newitem alone on new right page. Any |
1541 | | * "!newitemonleft && newitemoff == firstrightoff" split point makes |
1542 | | * newitem the firstright tuple, though, so this case isn't a special |
1543 | | * case. |
1544 | | */ |
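| | /* |
| | * Worked example (illustrative values only): with maxoff = 4, |
| | * newitemoff = 3, firstrightoff = 3 and newitemonleft = true, newitem |
| | * acts as lastleft: items 1-2 plus newitem go to the left page, while |
| | * existing items 3-4 go to the right page. With newitemonleft = false |
| | * instead, newitem becomes firstright and starts the right page. |
| | */ |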
1545 | 0 | firstrightoff = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz, |
1546 | 0 | newitem, &newitemonleft); |
1547 | | |
1548 | | /* Use temporary buffer for leftpage */ |
1549 | 0 | leftpage = leftpage_buf.data; |
1550 | 0 | _bt_pageinit(leftpage, BufferGetPageSize(buf)); |
1551 | 0 | lopaque = BTPageGetOpaque(leftpage); |
1552 | | |
1553 | | /* |
1554 | | * leftpage won't be the root when we're done. Also, clear the SPLIT_END |
1555 | | * and HAS_GARBAGE flags. |
1556 | | */ |
1557 | 0 | lopaque->btpo_flags = oopaque->btpo_flags; |
1558 | 0 | lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); |
1559 | | /* set flag in leftpage indicating that rightpage has no downlink yet */ |
1560 | 0 | lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT; |
1561 | 0 | lopaque->btpo_prev = oopaque->btpo_prev; |
1562 | | /* handle btpo_next after rightpage buffer acquired */ |
1563 | 0 | lopaque->btpo_level = oopaque->btpo_level; |
1564 | | /* handle btpo_cycleid after rightpage buffer acquired */ |
1565 | | |
1566 | | /* |
1567 | | * Copy the original page's LSN into leftpage, which will become the |
1568 | | * updated version of the page. We need this because XLogInsert will |
1569 | | * examine the LSN and possibly dump it in a page image. |
1570 | | */ |
1571 | 0 | PageSetLSN(leftpage, PageGetLSN(origpage)); |
1572 | | |
1573 | | /* |
1574 | | * Determine page offset number of existing overlapped-with-orignewitem |
1575 | | * posting list when it is necessary to perform a posting list split in |
1576 | | * passing. Note that newitem was already changed by caller (newitem no |
1577 | | * longer has the orignewitem TID). |
1578 | | * |
1579 | | * This page offset number (origpagepostingoff) will be used to pretend |
1580 | | * that the posting split has already taken place, even though the |
1581 | | * required modifications to origpage won't occur until we reach the |
1582 | | * critical section. The lastleft and firstright tuples of our page split |
1583 | | * point should, in effect, come from an imaginary version of origpage |
1584 | | * that has the nposting tuple instead of the original posting list tuple. |
1585 | | * |
1586 | | * Note: _bt_findsplitloc() should have compensated for coinciding posting |
1587 | | * list splits in just the same way, at least in theory. It doesn't |
1588 | | * bother with that, though. In practice it won't affect its choice of |
1589 | | * split point. |
1590 | | */ |
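| | /* |
| | * Concrete sketch (assumed offsets): if the overlapped posting list |
| | * sits at offset 10 on origpage, caller passes newitemoff = 11 with a |
| | * nonzero postingoff, so origpagepostingoff becomes 10 and the split |
| | * logic below acts as though offset 10 already held nposting. |
| | */ |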
1591 | 0 | origpagepostingoff = InvalidOffsetNumber; |
1592 | 0 | if (postingoff != 0) |
1593 | 0 | { |
1594 | 0 | Assert(isleaf); |
1595 | 0 | Assert(ItemPointerCompare(&orignewitem->t_tid, |
1596 | 0 | &newitem->t_tid) < 0); |
1597 | 0 | Assert(BTreeTupleIsPosting(nposting)); |
1598 | 0 | origpagepostingoff = OffsetNumberPrev(newitemoff); |
1599 | 0 | } |
1600 | | |
1601 | | /* |
1602 | | * The high key for the new left page is a possibly-truncated copy of |
1603 | | * firstright on the leaf level (it's "firstright itself" on internal |
1604 | | * pages; see !isleaf comments below). This may seem to be contrary to |
1605 | | * Lehman & Yao's approach of using a copy of lastleft as the new high key |
1606 | | * when splitting on the leaf level. It isn't, though. |
1607 | | * |
1608 | | * Suffix truncation will leave the left page's high key fully equal to |
1609 | | * lastleft when lastleft and firstright are equal prior to heap TID (that |
1610 | | * is, the tiebreaker TID value comes from lastleft). It isn't actually |
1611 | | * necessary for a new leaf high key to be a copy of lastleft for the L&Y |
1612 | | * "subtree" invariant to hold. It's sufficient to make sure that the new |
1613 | | * leaf high key is strictly less than firstright, and greater than or |
1614 | | * equal to (not necessarily equal to) lastleft. In other words, when |
1615 | | * suffix truncation isn't possible during a leaf page split, we take |
1616 | | * L&Y's exact approach to generating a new high key for the left page. |
1617 | | * (Actually, that is slightly inaccurate. We don't just use a copy of |
1618 | | * lastleft. A tuple with all the keys from firstright but the max heap |
1619 | | * TID from lastleft is used, to avoid introducing a special case.) |
1620 | | */ |
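| | /* |
| | * Example (illustrative, index on two int4 columns): with lastleft = |
| | * (5, 7) and firstright = (6, 2), _bt_truncate() can return the pivot |
| | * (6, -inf), truncating away the second attribute and the heap TID. |
| | * That pivot is >= lastleft and < firstright, which is all that the |
| | * invariant described above requires. |
| | */ |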
1621 | 0 | if (!newitemonleft && newitemoff == firstrightoff) |
1622 | 0 | { |
1623 | | /* incoming tuple becomes firstright */ |
1624 | 0 | itemsz = newitemsz; |
1625 | 0 | firstright = newitem; |
1626 | 0 | } |
1627 | 0 | else |
1628 | 0 | { |
1629 | | /* existing item at firstrightoff becomes firstright */ |
1630 | 0 | itemid = PageGetItemId(origpage, firstrightoff); |
1631 | 0 | itemsz = ItemIdGetLength(itemid); |
1632 | 0 | firstright = (IndexTuple) PageGetItem(origpage, itemid); |
1633 | 0 | if (firstrightoff == origpagepostingoff) |
1634 | 0 | firstright = nposting; |
1635 | 0 | } |
1636 | |
|
1637 | 0 | if (isleaf) |
1638 | 0 | { |
1639 | 0 | IndexTuple lastleft; |
1640 | | |
1641 | | /* Attempt suffix truncation for leaf page splits */ |
1642 | 0 | if (newitemonleft && newitemoff == firstrightoff) |
1643 | 0 | { |
1644 | | /* incoming tuple becomes lastleft */ |
1645 | 0 | lastleft = newitem; |
1646 | 0 | } |
1647 | 0 | else |
1648 | 0 | { |
1649 | 0 | OffsetNumber lastleftoff; |
1650 | | |
1651 | | /* existing item before firstrightoff becomes lastleft */ |
1652 | 0 | lastleftoff = OffsetNumberPrev(firstrightoff); |
1653 | 0 | Assert(lastleftoff >= P_FIRSTDATAKEY(oopaque)); |
1654 | 0 | itemid = PageGetItemId(origpage, lastleftoff); |
1655 | 0 | lastleft = (IndexTuple) PageGetItem(origpage, itemid); |
1656 | 0 | if (lastleftoff == origpagepostingoff) |
1657 | 0 | lastleft = nposting; |
1658 | 0 | } |
1659 | |
|
1660 | 0 | lefthighkey = _bt_truncate(rel, lastleft, firstright, itup_key); |
1661 | 0 | itemsz = IndexTupleSize(lefthighkey); |
1662 | 0 | } |
1663 | 0 | else |
1664 | 0 | { |
1665 | | /* |
1666 | | * Don't perform suffix truncation on a copy of firstright to make |
1667 | | * left page high key for internal page splits. Must use firstright |
1668 | | * as new high key directly. |
1669 | | * |
1670 | | * Each distinct separator key value originates as a leaf level high |
1671 | | * key; all other separator keys/pivot tuples are copied from one |
1672 | | * level down. A separator key in a grandparent page must be |
1673 | | * identical to high key in rightmost parent page of the subtree to |
1674 | | * its left, which must itself be identical to high key in rightmost |
1675 | | * child page of that same subtree (this even applies to separator |
1676 | | * from grandparent's high key). There must always be an unbroken |
1677 | | * "seam" of identical separator keys that guide index scans at every |
1678 | | * level, starting from the grandparent. That's why suffix truncation |
1679 | | * is unsafe here. |
1680 | | * |
1681 | | * Internal page splits will truncate firstright into a "negative |
1682 | | * infinity" data item when it gets inserted on the new right page |
1683 | | * below, though. This happens during the call to _bt_pgaddtup() for |
1684 | | * the new first data item for right page. Do not confuse this |
1685 | | * mechanism with suffix truncation. It is just a convenient way of |
1686 | | * implementing page splits that split the internal page "inside" |
1687 | | * firstright. The lefthighkey separator key cannot appear a second |
1688 | | * time in the right page (only firstright's downlink goes in right |
1689 | | * page). |
1690 | | */ |
1691 | 0 | lefthighkey = firstright; |
1692 | 0 | } |
1693 | | |
1694 | | /* |
1695 | | * Add new high key to leftpage |
1696 | | */ |
1697 | 0 | afterleftoff = P_HIKEY; |
1698 | |
|
1699 | 0 | Assert(BTreeTupleGetNAtts(lefthighkey, rel) > 0); |
1700 | 0 | Assert(BTreeTupleGetNAtts(lefthighkey, rel) <= |
1701 | 0 | IndexRelationGetNumberOfKeyAttributes(rel)); |
1702 | 0 | Assert(itemsz == MAXALIGN(IndexTupleSize(lefthighkey))); |
1703 | 0 | if (PageAddItem(leftpage, (Item) lefthighkey, itemsz, afterleftoff, false, |
1704 | 0 | false) == InvalidOffsetNumber) |
1705 | 0 | elog(ERROR, "failed to add high key to the left sibling" |
1706 | 0 | " while splitting block %u of index \"%s\"", |
1707 | 0 | origpagenumber, RelationGetRelationName(rel)); |
1708 | 0 | afterleftoff = OffsetNumberNext(afterleftoff); |
1709 | | |
1710 | | /* |
1711 | | * Acquire a new right page to split into, now that left page has a new |
1712 | | * high key. |
1713 | | * |
1714 | | * To avoid confusing future VACUUM operations, we zero the right page and |
1715 | | * work on an in-memory copy of it before writing WAL, then copy its |
1716 | | * contents back to the actual page once we start the critical section |
1717 | | * work. This simplifies the split work, since there is no need to zero |
1718 | | * the right page before throwing an error. |
1719 | | */ |
1720 | 0 | rbuf = _bt_allocbuf(rel, heaprel); |
1721 | 0 | rightpage = rightpage_buf.data; |
1722 | | |
1723 | | /* |
1724 | | * Copy the contents of the right page into its temporary location, and |
1725 | | * zero the original space. |
1726 | | */ |
1727 | 0 | memcpy(rightpage, BufferGetPage(rbuf), BLCKSZ); |
1728 | 0 | memset(BufferGetPage(rbuf), 0, BLCKSZ); |
1729 | 0 | rightpagenumber = BufferGetBlockNumber(rbuf); |
1730 | | /* rightpage was initialized by _bt_allocbuf */ |
1731 | 0 | ropaque = BTPageGetOpaque(rightpage); |
1732 | | |
1733 | | /* |
1734 | | * Finish off remaining leftpage special area fields. They cannot be set |
1735 | | * before both origpage (leftpage) and rightpage buffers are acquired and |
1736 | | * locked. |
1737 | | * |
1738 | | * btpo_cycleid is only used with leaf pages, though we set it here in all |
1739 | | * cases just to be consistent. |
1740 | | */ |
1741 | 0 | lopaque->btpo_next = rightpagenumber; |
1742 | 0 | lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel); |
1743 | | |
1744 | | /* |
1745 | | * rightpage won't be the root when we're done. Also, clear the SPLIT_END |
1746 | | * and HAS_GARBAGE flags. |
1747 | | */ |
1748 | 0 | ropaque->btpo_flags = oopaque->btpo_flags; |
1749 | 0 | ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE); |
1750 | 0 | ropaque->btpo_prev = origpagenumber; |
1751 | 0 | ropaque->btpo_next = oopaque->btpo_next; |
1752 | 0 | ropaque->btpo_level = oopaque->btpo_level; |
1753 | 0 | ropaque->btpo_cycleid = lopaque->btpo_cycleid; |
1754 | | |
1755 | | /* |
1756 | | * Add new high key to rightpage where necessary. |
1757 | | * |
1758 | | * If the page we're splitting is not the rightmost page at its level in |
1759 | | * the tree, then the first entry on the page is the high key from |
1760 | | * origpage. |
1761 | | */ |
1762 | 0 | afterrightoff = P_HIKEY; |
1763 | |
|
1764 | 0 | if (!isrightmost) |
1765 | 0 | { |
1766 | 0 | IndexTuple righthighkey; |
1767 | |
|
1768 | 0 | itemid = PageGetItemId(origpage, P_HIKEY); |
1769 | 0 | itemsz = ItemIdGetLength(itemid); |
1770 | 0 | righthighkey = (IndexTuple) PageGetItem(origpage, itemid); |
1771 | 0 | Assert(BTreeTupleGetNAtts(righthighkey, rel) > 0); |
1772 | 0 | Assert(BTreeTupleGetNAtts(righthighkey, rel) <= |
1773 | 0 | IndexRelationGetNumberOfKeyAttributes(rel)); |
1774 | 0 | if (PageAddItem(rightpage, (Item) righthighkey, itemsz, afterrightoff, |
1775 | 0 | false, false) == InvalidOffsetNumber) |
1776 | 0 | { |
1777 | 0 | elog(ERROR, "failed to add high key to the right sibling" |
1778 | 0 | " while splitting block %u of index \"%s\"", |
1779 | 0 | origpagenumber, RelationGetRelationName(rel)); |
1780 | 0 | } |
1781 | 0 | afterrightoff = OffsetNumberNext(afterrightoff); |
1782 | 0 | } |
1783 | | |
1784 | | /* |
1785 | | * Internal page splits truncate first data item on right page -- it |
1786 | | * becomes "minus infinity" item for the page. Set this up here. |
1787 | | */ |
1788 | 0 | minusinfoff = InvalidOffsetNumber; |
1789 | 0 | if (!isleaf) |
1790 | 0 | minusinfoff = afterrightoff; |
1791 | | |
1792 | | /* |
1793 | | * Now transfer all the data items (non-pivot tuples in isleaf case, or |
1794 | | * additional pivot tuples in !isleaf case) to the appropriate page. |
1795 | | * |
1796 | | * Note: we *must* insert at least the right page's items in item-number |
1797 | | * order, for the benefit of _bt_restore_page(). |
1798 | | */ |
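| | /* |
| | * Loop sketch: afterleftoff and afterrightoff always point at the next |
| | * free offset on their page; origpage items before firstrightoff go |
| | * left, the rest go right, and newitem is spliced in when the loop |
| | * reaches newitemoff (or after the loop, when newitemoff is |
| | * maxoff + 1). |
| | */ |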
1799 | 0 | for (i = P_FIRSTDATAKEY(oopaque); i <= maxoff; i = OffsetNumberNext(i)) |
1800 | 0 | { |
1801 | 0 | IndexTuple dataitem; |
1802 | |
|
1803 | 0 | itemid = PageGetItemId(origpage, i); |
1804 | 0 | itemsz = ItemIdGetLength(itemid); |
1805 | 0 | dataitem = (IndexTuple) PageGetItem(origpage, itemid); |
1806 | | |
1807 | | /* replace original item with nposting due to posting split? */ |
1808 | 0 | if (i == origpagepostingoff) |
1809 | 0 | { |
1810 | 0 | Assert(BTreeTupleIsPosting(dataitem)); |
1811 | 0 | Assert(itemsz == MAXALIGN(IndexTupleSize(nposting))); |
1812 | 0 | dataitem = nposting; |
1813 | 0 | } |
1814 | | |
1815 | | /* does new item belong before this one? */ |
1816 | 0 | else if (i == newitemoff) |
1817 | 0 | { |
1818 | 0 | if (newitemonleft) |
1819 | 0 | { |
1820 | 0 | Assert(newitemoff <= firstrightoff); |
1821 | 0 | if (!_bt_pgaddtup(leftpage, newitemsz, newitem, afterleftoff, |
1822 | 0 | false)) |
1823 | 0 | { |
1824 | 0 | elog(ERROR, "failed to add new item to the left sibling" |
1825 | 0 | " while splitting block %u of index \"%s\"", |
1826 | 0 | origpagenumber, RelationGetRelationName(rel)); |
1827 | 0 | } |
1828 | 0 | afterleftoff = OffsetNumberNext(afterleftoff); |
1829 | 0 | } |
1830 | 0 | else |
1831 | 0 | { |
1832 | 0 | Assert(newitemoff >= firstrightoff); |
1833 | 0 | if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff, |
1834 | 0 | afterrightoff == minusinfoff)) |
1835 | 0 | { |
1836 | 0 | elog(ERROR, "failed to add new item to the right sibling" |
1837 | 0 | " while splitting block %u of index \"%s\"", |
1838 | 0 | origpagenumber, RelationGetRelationName(rel)); |
1839 | 0 | } |
1840 | 0 | afterrightoff = OffsetNumberNext(afterrightoff); |
1841 | 0 | } |
1842 | 0 | } |
1843 | | |
1844 | | /* decide which page to put it on */ |
1845 | 0 | if (i < firstrightoff) |
1846 | 0 | { |
1847 | 0 | if (!_bt_pgaddtup(leftpage, itemsz, dataitem, afterleftoff, false)) |
1848 | 0 | { |
1849 | 0 | elog(ERROR, "failed to add old item to the left sibling" |
1850 | 0 | " while splitting block %u of index \"%s\"", |
1851 | 0 | origpagenumber, RelationGetRelationName(rel)); |
1852 | 0 | } |
1853 | 0 | afterleftoff = OffsetNumberNext(afterleftoff); |
1854 | 0 | } |
1855 | 0 | else |
1856 | 0 | { |
1857 | 0 | if (!_bt_pgaddtup(rightpage, itemsz, dataitem, afterrightoff, |
1858 | 0 | afterrightoff == minusinfoff)) |
1859 | 0 | { |
1860 | 0 | elog(ERROR, "failed to add old item to the right sibling" |
1861 | 0 | " while splitting block %u of index \"%s\"", |
1862 | 0 | origpagenumber, RelationGetRelationName(rel)); |
1863 | 0 | } |
1864 | 0 | afterrightoff = OffsetNumberNext(afterrightoff); |
1865 | 0 | } |
1866 | 0 | } |
1867 | | |
1868 | | /* Handle case where newitem goes at the end of rightpage */ |
1869 | 0 | if (i <= newitemoff) |
1870 | 0 | { |
1871 | | /* |
1872 | | * Can't have newitemonleft here; that would imply we were told to put |
1873 | | * *everything* on the left page, which cannot fit (if it could, we'd |
1874 | | * not be splitting the page). |
1875 | | */ |
1876 | 0 | Assert(!newitemonleft && newitemoff == maxoff + 1); |
1877 | 0 | if (!_bt_pgaddtup(rightpage, newitemsz, newitem, afterrightoff, |
1878 | 0 | afterrightoff == minusinfoff)) |
1879 | 0 | { |
1880 | 0 | elog(ERROR, "failed to add new item to the right sibling" |
1881 | 0 | " while splitting block %u of index \"%s\"", |
1882 | 0 | origpagenumber, RelationGetRelationName(rel)); |
1883 | 0 | } |
1884 | 0 | afterrightoff = OffsetNumberNext(afterrightoff); |
1885 | 0 | } |
1886 | | |
1887 | | /* |
1888 | | * We have to grab the original right sibling (if any) and update its prev |
1889 | | * link. We are guaranteed that this is deadlock-free, since we couple |
1890 | | * the locks in the standard order: left to right. |
1891 | | */ |
1892 | 0 | if (!isrightmost) |
1893 | 0 | { |
1894 | 0 | sbuf = _bt_getbuf(rel, oopaque->btpo_next, BT_WRITE); |
1895 | 0 | spage = BufferGetPage(sbuf); |
1896 | 0 | sopaque = BTPageGetOpaque(spage); |
1897 | 0 | if (sopaque->btpo_prev != origpagenumber) |
1898 | 0 | { |
1899 | 0 | ereport(ERROR, |
1900 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
1901 | 0 | errmsg_internal("right sibling's left-link doesn't match: " |
1902 | 0 | "block %u links to %u instead of expected %u in index \"%s\"", |
1903 | 0 | oopaque->btpo_next, sopaque->btpo_prev, origpagenumber, |
1904 | 0 | RelationGetRelationName(rel)))); |
1905 | 0 | } |
1906 | | |
1907 | | /* |
1908 | | * Check to see if we can set the SPLIT_END flag in the right-hand |
1909 | | * split page; this can save some I/O for vacuum since it need not |
1910 | | * proceed to the right sibling. We can set the flag if the right |
1911 | | * sibling has a different cycleid: that means it could not be part of |
1912 | | * a group of pages that were all split off from the same ancestor |
1913 | | * page. If you're confused, imagine that page A splits to A B and |
1914 | | * then again, yielding A C B, while vacuum is in progress. Tuples |
1915 | | * originally in A could now be in either B or C, hence vacuum must |
1916 | | * examine both pages. But if D, our right sibling, has a different |
1917 | | * cycleid then it could not contain any tuples that were in A when |
1918 | | * the vacuum started. |
1919 | | */ |
1920 | 0 | if (sopaque->btpo_cycleid != ropaque->btpo_cycleid) |
1921 | 0 | ropaque->btpo_flags |= BTP_SPLIT_END; |
1922 | 0 | } |
1923 | | |
1924 | | /* |
1925 | | * Right sibling is locked, new siblings are prepared, but original page |
1926 | | * is not updated yet. |
1927 | | * |
1928 | | * NO EREPORT(ERROR) till right sibling is updated. We can get away with |
1929 | | * not starting the critical section till here because we haven't been |
1930 | | * scribbling on the original page yet; see comments above. |
1931 | | */ |
1932 | 0 | START_CRIT_SECTION(); |
1933 | | |
1934 | | /* |
1935 | | * By here, the original data page has been split into two new halves, and |
1936 | | * these are correct. The algorithm requires that the left page never |
1937 | | * move during a split, so we copy the new left page back on top of the |
1938 | | * original. We need to do this before writing the WAL record, so that |
1939 | | * XLogInsert can WAL log an image of the page if necessary. |
1940 | | */ |
1941 | 0 | memcpy(origpage, leftpage, BLCKSZ); |
1942 | | /* leftpage, lopaque must not be used below here */ |
1943 | | |
1944 | | /* |
1945 | | * Move the contents of the right page from its temporary location to the |
1946 | | * destination buffer, before writing the WAL record. Unlike the left |
1947 | | * page, the right page and its opaque area are still needed to complete |
1948 | | * the update of the page, so reinitialize them. |
1949 | | */ |
1950 | 0 | rightpage = BufferGetPage(rbuf); |
1951 | 0 | memcpy(rightpage, rightpage_buf.data, BLCKSZ); |
1952 | 0 | ropaque = BTPageGetOpaque(rightpage); |
1953 | |
|
1954 | 0 | MarkBufferDirty(buf); |
1955 | 0 | MarkBufferDirty(rbuf); |
1956 | |
|
1957 | 0 | if (!isrightmost) |
1958 | 0 | { |
1959 | 0 | sopaque->btpo_prev = rightpagenumber; |
1960 | 0 | MarkBufferDirty(sbuf); |
1961 | 0 | } |
1962 | | |
1963 | | /* |
1964 | | * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes |
1965 | | * a split |
1966 | | */ |
1967 | 0 | if (!isleaf) |
1968 | 0 | { |
1969 | 0 | Page cpage = BufferGetPage(cbuf); |
1970 | 0 | BTPageOpaque cpageop = BTPageGetOpaque(cpage); |
1971 | |
|
1972 | 0 | cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; |
1973 | 0 | MarkBufferDirty(cbuf); |
1974 | 0 | } |
1975 | | |
1976 | | /* XLOG stuff */ |
1977 | 0 | if (RelationNeedsWAL(rel)) |
1978 | 0 | { |
1979 | 0 | xl_btree_split xlrec; |
1980 | 0 | uint8 xlinfo; |
1981 | 0 | XLogRecPtr recptr; |
1982 | |
|
1983 | 0 | xlrec.level = ropaque->btpo_level; |
1984 | | /* See comments below on newitem, orignewitem, and posting lists */ |
1985 | 0 | xlrec.firstrightoff = firstrightoff; |
1986 | 0 | xlrec.newitemoff = newitemoff; |
1987 | 0 | xlrec.postingoff = 0; |
1988 | 0 | if (postingoff != 0 && origpagepostingoff < firstrightoff) |
1989 | 0 | xlrec.postingoff = postingoff; |
1990 | |
|
1991 | 0 | XLogBeginInsert(); |
1992 | 0 | XLogRegisterData(&xlrec, SizeOfBtreeSplit); |
1993 | |
|
1994 | 0 | XLogRegisterBuffer(0, buf, REGBUF_STANDARD); |
1995 | 0 | XLogRegisterBuffer(1, rbuf, REGBUF_WILL_INIT); |
1996 | | /* Log original right sibling, since we've changed its prev-pointer */ |
1997 | 0 | if (!isrightmost) |
1998 | 0 | XLogRegisterBuffer(2, sbuf, REGBUF_STANDARD); |
1999 | 0 | if (!isleaf) |
2000 | 0 | XLogRegisterBuffer(3, cbuf, REGBUF_STANDARD); |
2001 | | |
2002 | | /* |
2003 | | * Log the new item, if it was inserted on the left page. (If it was |
2004 | | * put on the right page, we don't need to explicitly WAL log it |
2005 | | * because it's included with all the other items on the right page.) |
2006 | | * Show the new item as belonging to the left page buffer, so that it |
2007 | | * is not stored if XLogInsert decides it needs a full-page image of |
2008 | | * the left page. We always store newitemoff in the record, though. |
2009 | | * |
2010 | | * The details are sometimes slightly different for page splits that |
2011 | | * coincide with a posting list split. If both the replacement |
2012 | | * posting list and newitem go on the right page, then we don't need |
2013 | | * to log anything extra, just like the simple !newitemonleft |
2014 | | * no-posting-split case (postingoff is set to zero in the WAL record, |
2015 | | * so recovery doesn't need to process a posting list split at all). |
2016 | | * Otherwise, we set postingoff and log orignewitem instead of |
2017 | | * newitem, despite having actually inserted newitem. REDO routine |
2018 | | * must reconstruct nposting and newitem using _bt_swap_posting(). |
2019 | | * |
2020 | | * Note: It's possible that our page split point is the point that |
2021 | | * makes the posting list lastleft and newitem firstright. This is |
2022 | | * the only case where we log orignewitem/newitem despite newitem |
2023 | | * going on the right page. If XLogInsert decides that it can omit |
2024 | | * orignewitem due to logging a full-page image of the left page, |
2025 | | * everything still works out, since recovery only needs orignewitem |
2026 | | * for items on the left page (just like the regular |
2027 | | * newitem-logged case). |
2028 | | */ |
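| | /* |
| | * Recap of the cases handled below (summary only): |
| | * newitemonleft && xlrec.postingoff == 0 -> register newitem; |
| | * xlrec.postingoff != 0 -> register orignewitem (REDO |
| | * re-derives newitem and nposting); |
| | * otherwise -> register nothing extra, since |
| | * newitem is already part of the logged right page. |
| | */ |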
2029 | 0 | if (newitemonleft && xlrec.postingoff == 0) |
2030 | 0 | XLogRegisterBufData(0, newitem, newitemsz); |
2031 | 0 | else if (xlrec.postingoff != 0) |
2032 | 0 | { |
2033 | 0 | Assert(isleaf); |
2034 | 0 | Assert(newitemonleft || firstrightoff == newitemoff); |
2035 | 0 | Assert(newitemsz == IndexTupleSize(orignewitem)); |
2036 | 0 | XLogRegisterBufData(0, orignewitem, newitemsz); |
2037 | 0 | } |
2038 | | |
2039 | | /* Log the left page's new high key */ |
2040 | 0 | if (!isleaf) |
2041 | 0 | { |
2042 | | /* lefthighkey isn't local copy, get current pointer */ |
2043 | 0 | itemid = PageGetItemId(origpage, P_HIKEY); |
2044 | 0 | lefthighkey = (IndexTuple) PageGetItem(origpage, itemid); |
2045 | 0 | } |
2046 | 0 | XLogRegisterBufData(0, lefthighkey, |
2047 | 0 | MAXALIGN(IndexTupleSize(lefthighkey))); |
2048 | | |
2049 | | /* |
2050 | | * Log the contents of the right page in the format understood by |
2051 | | * _bt_restore_page(). The whole right page will be recreated. |
2052 | | * |
2053 | | * Direct access to the page is not ideal, but it is faster; ideally the |
2054 | | * page API would provide a function for this. Note we only store the tuples |
2055 | | * themselves, knowing that they were inserted in item-number order |
2056 | | * and so the line pointers can be reconstructed. See comments for |
2057 | | * _bt_restore_page(). |
2058 | | */ |
2059 | 0 | XLogRegisterBufData(1, |
2060 | 0 | (char *) rightpage + ((PageHeader) rightpage)->pd_upper, |
2061 | 0 | ((PageHeader) rightpage)->pd_special - ((PageHeader) rightpage)->pd_upper); |
2062 | |
|
2063 | 0 | xlinfo = newitemonleft ? XLOG_BTREE_SPLIT_L : XLOG_BTREE_SPLIT_R; |
2064 | 0 | recptr = XLogInsert(RM_BTREE_ID, xlinfo); |
2065 | |
|
2066 | 0 | PageSetLSN(origpage, recptr); |
2067 | 0 | PageSetLSN(rightpage, recptr); |
2068 | 0 | if (!isrightmost) |
2069 | 0 | PageSetLSN(spage, recptr); |
2070 | 0 | if (!isleaf) |
2071 | 0 | PageSetLSN(BufferGetPage(cbuf), recptr); |
2072 | 0 | } |
2073 | |
|
2074 | 0 | END_CRIT_SECTION(); |
2075 | | |
2076 | | /* release the old right sibling */ |
2077 | 0 | if (!isrightmost) |
2078 | 0 | _bt_relbuf(rel, sbuf); |
2079 | | |
2080 | | /* release the child */ |
2081 | 0 | if (!isleaf) |
2082 | 0 | _bt_relbuf(rel, cbuf); |
2083 | | |
2084 | | /* be tidy */ |
2085 | 0 | if (isleaf) |
2086 | 0 | pfree(lefthighkey); |
2087 | | |
2088 | | /* split's done */ |
2089 | 0 | return rbuf; |
2090 | 0 | } |
2091 | | |
2092 | | /* |
2093 | | * _bt_insert_parent() -- Insert downlink into parent, completing split. |
2094 | | * |
2095 | | * On entry, buf and rbuf are the left and right split pages, which we |
2096 | | * still hold write locks on. Both locks will be released here. We |
2097 | | * release the rbuf lock once we have a write lock on the page that we |
2098 | | * intend to insert a downlink to rbuf on (i.e. buf's current parent page). |
2099 | | * The lock on buf is released at the same point as the lock on the parent |
2100 | | * page, since buf's INCOMPLETE_SPLIT flag must be cleared by the same |
2101 | | * atomic operation that completes the split by inserting a new downlink. |
2102 | | * |
2103 | | * stack - stack showing how we got here. Will be NULL when splitting true |
2104 | | * root, or during concurrent root split, where we can be inefficient |
2105 | | * isroot - we split the true root |
2106 | | * isonly - we split a page alone on its level (might have been fast root) |
2107 | | */ |
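| | /* |
| | * Call-flow sketch: _bt_split() and _bt_finish_split() arrive here with |
| | * both halves still write-locked. We either build a whole new level |
| | * via _bt_newlevel(), or insert the downlink one level up via |
| | * _bt_insertonpg(), which may split the parent and recurse back into |
| | * this function. |
| | */ |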
2108 | | static void |
2109 | | _bt_insert_parent(Relation rel, |
2110 | | Relation heaprel, |
2111 | | Buffer buf, |
2112 | | Buffer rbuf, |
2113 | | BTStack stack, |
2114 | | bool isroot, |
2115 | | bool isonly) |
2116 | 0 | { |
2117 | 0 | Assert(heaprel != NULL); |
2118 | | |
2119 | | /* |
2120 | | * Here we have to do something Lehman and Yao don't talk about: deal with |
2121 | | * a root split and construction of a new root. If our stack is empty |
2122 | | * then we have just split a node on what had been the root level when we |
2123 | | * descended the tree. If it was still the root then we perform a |
2124 | | * new-root construction. If it *wasn't* the root anymore, search to find |
2125 | | * the next higher level that someone constructed meanwhile, and find the |
2126 | | * right place to insert as for the normal case. |
2127 | | * |
2128 | | * If we have to search for the parent level, we do so by re-descending |
2129 | | * from the root. This is not super-efficient, but it's rare enough not |
2130 | | * to matter. |
2131 | | */ |
2132 | 0 | if (isroot) |
2133 | 0 | { |
2134 | 0 | Buffer rootbuf; |
2135 | |
|
2136 | 0 | Assert(stack == NULL); |
2137 | 0 | Assert(isonly); |
2138 | | /* create a new root node one level up and update the metapage */ |
2139 | 0 | rootbuf = _bt_newlevel(rel, heaprel, buf, rbuf); |
2140 | | /* release the split buffers */ |
2141 | 0 | _bt_relbuf(rel, rootbuf); |
2142 | 0 | _bt_relbuf(rel, rbuf); |
2143 | 0 | _bt_relbuf(rel, buf); |
2144 | 0 | } |
2145 | 0 | else |
2146 | 0 | { |
2147 | 0 | BlockNumber bknum = BufferGetBlockNumber(buf); |
2148 | 0 | BlockNumber rbknum = BufferGetBlockNumber(rbuf); |
2149 | 0 | Page page = BufferGetPage(buf); |
2150 | 0 | IndexTuple new_item; |
2151 | 0 | BTStackData fakestack; |
2152 | 0 | IndexTuple ritem; |
2153 | 0 | Buffer pbuf; |
2154 | |
|
2155 | 0 | if (stack == NULL) |
2156 | 0 | { |
2157 | 0 | BTPageOpaque opaque; |
2158 | |
|
2159 | 0 | elog(DEBUG2, "concurrent ROOT page split"); |
2160 | 0 | opaque = BTPageGetOpaque(page); |
2161 | | |
2162 | | /* |
2163 | | * We should never reach here when a leaf page split takes place |
2164 | | * despite the insert of newitem being able to apply the fastpath |
2165 | | * optimization. Make sure of that with an assertion. |
2166 | | * |
2167 | | * This is more of a performance issue than a correctness issue. |
2168 | | * The fastpath won't have a descent stack. Using a phony stack |
2169 | | * here works, but never rely on that. The fastpath should be |
2170 | | * rejected within _bt_search_insert() when the rightmost leaf |
2171 | | * page will split, since it's faster to go through _bt_search() |
2172 | | * and get a stack in the usual way. |
2173 | | */ |
2174 | 0 | Assert(!(P_ISLEAF(opaque) && |
2175 | 0 | BlockNumberIsValid(RelationGetTargetBlock(rel)))); |
2176 | | |
2177 | | /* Find the leftmost page at the next level up */ |
2178 | 0 | pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false); |
2179 | | /* Set up a phony stack entry pointing there */ |
2180 | 0 | stack = &fakestack; |
2181 | 0 | stack->bts_blkno = BufferGetBlockNumber(pbuf); |
2182 | 0 | stack->bts_offset = InvalidOffsetNumber; |
2183 | 0 | stack->bts_parent = NULL; |
2184 | 0 | _bt_relbuf(rel, pbuf); |
2185 | 0 | } |
2186 | | |
2187 | | /* get high key from left, a strict lower bound for new right page */ |
2188 | 0 | ritem = (IndexTuple) PageGetItem(page, |
2189 | 0 | PageGetItemId(page, P_HIKEY)); |
2190 | | |
2191 | | /* form an index tuple that points at the new right page */ |
2192 | 0 | new_item = CopyIndexTuple(ritem); |
2193 | 0 | BTreeTupleSetDownLink(new_item, rbknum); |
2194 | | |
2195 | | /* |
2196 | | * Re-find and write lock the parent of buf. |
2197 | | * |
2198 | | * It's possible that the location of buf's downlink has changed since |
2199 | | * our initial _bt_search() descent. _bt_getstackbuf() will detect |
2200 | | * and recover from this, updating the stack, which ensures that the |
2201 | | * new downlink will be inserted at the correct offset. Even buf's |
2202 | | * parent may have changed. |
2203 | | */ |
2204 | 0 | pbuf = _bt_getstackbuf(rel, heaprel, stack, bknum); |
2205 | | |
2206 | | /* |
2207 | | * Unlock the right child. The left child will be unlocked in |
2208 | | * _bt_insertonpg(). |
2209 | | * |
2210 | | * Unlocking the right child must be delayed until here to ensure that |
2211 | | * no concurrent VACUUM operation can become confused. Page deletion |
2212 | | * cannot be allowed to fail to re-find a downlink for the rbuf page. |
2213 | | * (Actually, this is just a vestige of how things used to work. The |
2214 | | * page deletion code is expected to check for the INCOMPLETE_SPLIT |
2215 | | * flag on the left child. It won't attempt deletion of the right |
2216 | | * child until the split is complete. Despite all this, we opt to |
2217 | | * conservatively delay unlocking the right child until here.) |
2218 | | */ |
2219 | 0 | _bt_relbuf(rel, rbuf); |
2220 | |
|
2221 | 0 | if (pbuf == InvalidBuffer) |
2222 | 0 | ereport(ERROR, |
2223 | 0 | (errcode(ERRCODE_INDEX_CORRUPTED), |
2224 | 0 | errmsg_internal("failed to re-find parent key in index \"%s\" for split pages %u/%u", |
2225 | 0 | RelationGetRelationName(rel), bknum, rbknum))); |
2226 | | |
2227 | | /* Recursively insert into the parent */ |
2228 | 0 | _bt_insertonpg(rel, heaprel, NULL, pbuf, buf, stack->bts_parent, |
2229 | 0 | new_item, MAXALIGN(IndexTupleSize(new_item)), |
2230 | 0 | stack->bts_offset + 1, 0, isonly); |
2231 | | |
2232 | | /* be tidy */ |
2233 | 0 | pfree(new_item); |
2234 | 0 | } |
2235 | 0 | } |
2236 | | |
2237 | | /* |
2238 | | * _bt_finish_split() -- Finish an incomplete split |
2239 | | * |
2240 | | * A crash or other failure can leave a split incomplete. The insertion |
2241 | | * routines won't allow an insertion on a page that is incompletely split. |
2242 | | * Before inserting on such a page, call _bt_finish_split(). |
2243 | | * |
2244 | | * On entry, 'lbuf' must be locked in write-mode. On exit, it is unlocked |
2245 | | * and unpinned. |
2246 | | * |
2247 | | * Caller must provide a valid heaprel, since finishing a page split requires |
2248 | | * allocating a new page if and when the parent page splits in turn. |
2249 | | */ |
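| | /* |
| | * (For instance, _bt_getstackbuf() below calls this when it lands on a |
| | * parent-level page that still has P_INCOMPLETE_SPLIT set, before |
| | * retrying its search for the downlink.) |
| | */ |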
2250 | | void |
2251 | | _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack) |
2252 | | { |
2253 | | Page lpage = BufferGetPage(lbuf); |
2254 | | BTPageOpaque lpageop = BTPageGetOpaque(lpage); |
2255 | | Buffer rbuf; |
2256 | | Page rpage; |
2257 | | BTPageOpaque rpageop; |
2258 | | bool wasroot; |
2259 | | bool wasonly; |
2260 | | |
2261 | | Assert(P_INCOMPLETE_SPLIT(lpageop)); |
2262 | | Assert(heaprel != NULL); |
2263 | | |
2264 | | /* Lock right sibling, the one missing the downlink */ |
2265 | | rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE); |
2266 | | rpage = BufferGetPage(rbuf); |
2267 | | rpageop = BTPageGetOpaque(rpage); |
2268 | | |
2269 | | /* Could this be a root split? */ |
2270 | | if (!stack) |
2271 | | { |
2272 | | Buffer metabuf; |
2273 | | Page metapg; |
2274 | | BTMetaPageData *metad; |
2275 | | |
2276 | | /* acquire lock on the metapage */ |
2277 | | metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); |
2278 | | metapg = BufferGetPage(metabuf); |
2279 | | metad = BTPageGetMeta(metapg); |
2280 | | |
2281 | | wasroot = (metad->btm_root == BufferGetBlockNumber(lbuf)); |
2282 | | |
2283 | | _bt_relbuf(rel, metabuf); |
2284 | | } |
2285 | | else |
2286 | | wasroot = false; |
2287 | | |
2288 | | /* Was this the only page on the level before split? */ |
2289 | | wasonly = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop)); |
2290 | | |
2291 | | elog(DEBUG1, "finishing incomplete split of %u/%u", |
2292 | | BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf)); |
2293 | | |
2294 | | _bt_insert_parent(rel, heaprel, lbuf, rbuf, stack, wasroot, wasonly); |
2295 | | } |
2296 | | |
2297 | | /* |
2298 | | * _bt_getstackbuf() -- Walk back up the tree one step, and find the pivot |
2299 | | * tuple whose downlink points to child page. |
2300 | | * |
2301 | | * Caller passes child's block number, which is used to identify |
2302 | | * associated pivot tuple in parent page using a linear search that |
2303 | | * matches on pivot's downlink/block number. The expected location of |
2304 | | * the pivot tuple is taken from the stack one level above the child |
2305 | | * page. This is used as a starting point. Insertions into the |
2306 | | * parent level could cause the pivot tuple to move right; deletions |
2307 | | * could cause it to move left, but not left of the page we previously |
2308 | | * found it on. |
2309 | | * |
2310 | | * Caller can use its stack to relocate the pivot tuple/downlink for |
2311 | | * any same-level page to the right of the page found by its initial |
2312 | | * descent. This is necessary because of the possibility that caller |
2313 | | * moved right to recover from a concurrent page split. It's also |
2314 | | * convenient for certain callers to be able to step right when there |
2315 | | * wasn't a concurrent page split, while still using their original |
2316 | | * stack. For example, the checkingunique _bt_doinsert() case may |
2317 | | * have to step right when there are many physical duplicates, and its |
2318 | | * scantid forces an insertion to the right of the "first page the |
2319 | | * value could be on". (This is also relied on by all of our callers |
2320 | | * when dealing with !heapkeyspace indexes.) |
2321 | | * |
2322 | | * Returns write-locked parent page buffer, or InvalidBuffer if pivot |
2323 | | * tuple not found (should not happen). Adjusts bts_blkno & |
2324 | | * bts_offset if changed. Page split caller should insert its new |
2325 | | * pivot tuple for its new right sibling page on parent page, at the |
2326 | | * offset number bts_offset + 1. |
2327 | | */ |
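| | /* |
| | * Typical usage (a sketch of what _bt_insert_parent() does above): |
| | * |
| | * pbuf = _bt_getstackbuf(rel, heaprel, stack, leftchildblkno); |
| | * if (pbuf == InvalidBuffer) |
| | * ereport(ERROR, ...); |
| | * insert the downlink for the new right sibling at offset |
| | * stack->bts_offset + 1 on the returned page |
| | */ |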
2328 | | Buffer |
2329 | | _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child) |
2330 | 0 | { |
2331 | 0 | BlockNumber blkno; |
2332 | 0 | OffsetNumber start; |
2333 | |
|
2334 | 0 | blkno = stack->bts_blkno; |
2335 | 0 | start = stack->bts_offset; |
2336 | |
|
2337 | 0 | for (;;) |
2338 | 0 | { |
2339 | 0 | Buffer buf; |
2340 | 0 | Page page; |
2341 | 0 | BTPageOpaque opaque; |
2342 | |
|
2343 | 0 | buf = _bt_getbuf(rel, blkno, BT_WRITE); |
2344 | 0 | page = BufferGetPage(buf); |
2345 | 0 | opaque = BTPageGetOpaque(page); |
2346 | |
|
2347 | 0 | Assert(heaprel != NULL); |
2348 | 0 | if (P_INCOMPLETE_SPLIT(opaque)) |
2349 | 0 | { |
2350 | 0 | _bt_finish_split(rel, heaprel, buf, stack->bts_parent); |
2351 | 0 | continue; |
2352 | 0 | } |
2353 | | |
2354 | 0 | if (!P_IGNORE(opaque)) |
2355 | 0 | { |
2356 | 0 | OffsetNumber offnum, |
2357 | 0 | minoff, |
2358 | 0 | maxoff; |
2359 | 0 | ItemId itemid; |
2360 | 0 | IndexTuple item; |
2361 | |
|
2362 | 0 | minoff = P_FIRSTDATAKEY(opaque); |
2363 | 0 | maxoff = PageGetMaxOffsetNumber(page); |
2364 | | |
2365 | | /* |
2366 | | * start = InvalidOffsetNumber means "search the whole page". We |
2367 | | * need this test anyway due to the possibility that the page has a |
2368 | | * high key now when it didn't before. |
2369 | | */ |
2370 | 0 | if (start < minoff) |
2371 | 0 | start = minoff; |
2372 | | |
2373 | | /* |
2374 | | * We need this check too, to guard against the possibility that the |
2375 | | * page split since we visited it originally. |
2376 | | */ |
2377 | 0 | if (start > maxoff) |
2378 | 0 | start = OffsetNumberNext(maxoff); |
2379 | | |
2380 | | /* |
2381 | | * These loops will check every item on the page --- but in an |
2382 | | * order that's attuned to the probability of where it actually |
2383 | | * is. Scan to the right first, then to the left. |
2384 | | */ |
2385 | 0 | for (offnum = start; |
2386 | 0 | offnum <= maxoff; |
2387 | 0 | offnum = OffsetNumberNext(offnum)) |
2388 | 0 | { |
2389 | 0 | itemid = PageGetItemId(page, offnum); |
2390 | 0 | item = (IndexTuple) PageGetItem(page, itemid); |
2391 | |
|
2392 | 0 | if (BTreeTupleGetDownLink(item) == child) |
2393 | 0 | { |
2394 | | /* Return accurate pointer to where link is now */ |
2395 | 0 | stack->bts_blkno = blkno; |
2396 | 0 | stack->bts_offset = offnum; |
2397 | 0 | return buf; |
2398 | 0 | } |
2399 | 0 | } |
2400 | | |
2401 | 0 | for (offnum = OffsetNumberPrev(start); |
2402 | 0 | offnum >= minoff; |
2403 | 0 | offnum = OffsetNumberPrev(offnum)) |
2404 | 0 | { |
2405 | 0 | itemid = PageGetItemId(page, offnum); |
2406 | 0 | item = (IndexTuple) PageGetItem(page, itemid); |
2407 | |
|
2408 | 0 | if (BTreeTupleGetDownLink(item) == child) |
2409 | 0 | { |
2410 | | /* Return accurate pointer to where link is now */ |
2411 | 0 | stack->bts_blkno = blkno; |
2412 | 0 | stack->bts_offset = offnum; |
2413 | 0 | return buf; |
2414 | 0 | } |
2415 | 0 | } |
2416 | 0 | } |
2417 | | |
2418 | | /* |
2419 | | * The item we're looking for moved right at least one page. |
2420 | | * |
2421 | | * Lehman and Yao couple/chain locks when moving right here, which we |
2422 | | * can avoid. See nbtree/README. |
2423 | | */ |
2424 | 0 | if (P_RIGHTMOST(opaque)) |
2425 | 0 | { |
2426 | 0 | _bt_relbuf(rel, buf); |
2427 | 0 | return InvalidBuffer; |
2428 | 0 | } |
2429 | 0 | blkno = opaque->btpo_next; |
2430 | 0 | start = InvalidOffsetNumber; |
2431 | 0 | _bt_relbuf(rel, buf); |
2432 | 0 | } |
2433 | 0 | } |
2434 | | |
2435 | | /* |
2436 | | * _bt_newlevel() -- Create a new level above root page. |
2437 | | * |
2438 | | * We've just split the old root page and need to create a new one. |
2439 | | * In order to do this, we add a new root page to the file, then lock |
2440 | | * the metadata page and update it. This is guaranteed to be deadlock- |
2441 | | * free, because all readers release their locks on the metadata page |
2442 | | * before trying to lock the root, and all writers lock the root before |
2443 | | * trying to lock the metadata page. We have a write lock on the old |
2444 | | * root page, so we have not introduced any cycles into the waits-for |
2445 | | * graph. |
2446 | | * |
2447 | | * On entry, lbuf (the old root) and rbuf (its new peer) are write- |
2448 | | * locked. On exit, a new root page exists with entries for the |
2449 | | * two new children, metapage is updated and unlocked/unpinned. |
2450 | | * The new root buffer is returned to the caller, which must unlock/unpin |
2451 | | * lbuf, rbuf & rootbuf. |
2452 | | */ |
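| | /* |
| | * Resulting structure, roughly: the new root holds two pivot tuples, a |
| | * "minus infinity" downlink to the old root (now the left page) and a |
| | * downlink carrying the left page's high key that points to the new |
| | * right page. Searches for keys below that high key descend left; |
| | * everything else descends right. |
| | */ |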
2453 | | static Buffer |
2454 | | _bt_newlevel(Relation rel, Relation heaprel, Buffer lbuf, Buffer rbuf) |
2455 | 0 | { |
2456 | 0 | Buffer rootbuf; |
2457 | 0 | Page lpage, |
2458 | 0 | rootpage; |
2459 | 0 | BlockNumber lbkno, |
2460 | 0 | rbkno; |
2461 | 0 | BlockNumber rootblknum; |
2462 | 0 | BTPageOpaque rootopaque; |
2463 | 0 | BTPageOpaque lopaque; |
2464 | 0 | ItemId itemid; |
2465 | 0 | IndexTuple item; |
2466 | 0 | IndexTuple left_item; |
2467 | 0 | Size left_item_sz; |
2468 | 0 | IndexTuple right_item; |
2469 | 0 | Size right_item_sz; |
2470 | 0 | Buffer metabuf; |
2471 | 0 | Page metapg; |
2472 | 0 | BTMetaPageData *metad; |
2473 | |
|
2474 | 0 | lbkno = BufferGetBlockNumber(lbuf); |
2475 | 0 | rbkno = BufferGetBlockNumber(rbuf); |
2476 | 0 | lpage = BufferGetPage(lbuf); |
2477 | 0 | lopaque = BTPageGetOpaque(lpage); |
2478 | | |
2479 | | /* get a new root page */ |
2480 | 0 | rootbuf = _bt_allocbuf(rel, heaprel); |
2481 | 0 | rootpage = BufferGetPage(rootbuf); |
2482 | 0 | rootblknum = BufferGetBlockNumber(rootbuf); |
2483 | | |
2484 | | /* acquire lock on the metapage */ |
2485 | 0 | metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE); |
2486 | 0 | metapg = BufferGetPage(metabuf); |
2487 | 0 | metad = BTPageGetMeta(metapg); |
2488 | | |
2489 | | /* |
2490 | | * Create downlink item for left page (old root). The key value used is |
2491 | | * "minus infinity", a sentinel value that's reliably less than any real |
2492 | | * key value that could appear in the left page. |
2493 | | */ |
2494 | 0 | left_item_sz = sizeof(IndexTupleData); |
2495 | 0 | left_item = (IndexTuple) palloc(left_item_sz); |
2496 | 0 | left_item->t_info = left_item_sz; |
2497 | 0 | BTreeTupleSetDownLink(left_item, lbkno); |
2498 | 0 | BTreeTupleSetNAtts(left_item, 0, false); |
2499 | | |
2500 | | /* |
2501 | | * Create downlink item for right page. The key for it is obtained from |
2502 | | * the "high key" position in the left page. |
2503 | | */ |
2504 | 0 | itemid = PageGetItemId(lpage, P_HIKEY); |
2505 | 0 | right_item_sz = ItemIdGetLength(itemid); |
2506 | 0 | item = (IndexTuple) PageGetItem(lpage, itemid); |
2507 | 0 | right_item = CopyIndexTuple(item); |
2508 | 0 | BTreeTupleSetDownLink(right_item, rbkno); |
2509 | | |
2510 | | /* NO EREPORT(ERROR) from here till newroot op is logged */ |
2511 | 0 | START_CRIT_SECTION(); |
2512 | | |
2513 | | /* upgrade metapage if needed */ |
2514 | 0 | if (metad->btm_version < BTREE_NOVAC_VERSION) |
2515 | 0 | _bt_upgrademetapage(metapg); |
2516 | | |
2517 | | /* set btree special data */ |
2518 | 0 | rootopaque = BTPageGetOpaque(rootpage); |
2519 | 0 | rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE; |
2520 | 0 | rootopaque->btpo_flags = BTP_ROOT; |
2521 | 0 | rootopaque->btpo_level = |
2522 | 0 | (BTPageGetOpaque(lpage))->btpo_level + 1; |
2523 | 0 | rootopaque->btpo_cycleid = 0; |
2524 | | |
2525 | | /* update metapage data */ |
2526 | 0 | metad->btm_root = rootblknum; |
2527 | 0 | metad->btm_level = rootopaque->btpo_level; |
2528 | 0 | metad->btm_fastroot = rootblknum; |
2529 | 0 | metad->btm_fastlevel = rootopaque->btpo_level; |
2530 | | |
2531 | | /* |
2532 | | * Insert the left page pointer into the new root page. The root page is |
2533 | | * the rightmost page on its level so there is no "high key" in it; the |
2534 | | * two items will go into positions P_HIKEY and P_FIRSTKEY. |
2535 | | * |
2536 | | * Note: we *must* insert the two items in item-number order, for the |
2537 | | * benefit of _bt_restore_page(). |
2538 | | */ |
2539 | 0 | Assert(BTreeTupleGetNAtts(left_item, rel) == 0); |
2540 | 0 | if (PageAddItem(rootpage, (Item) left_item, left_item_sz, P_HIKEY, |
2541 | 0 | false, false) == InvalidOffsetNumber) |
2542 | 0 | elog(PANIC, "failed to add leftkey to new root page" |
2543 | 0 | " while splitting block %u of index \"%s\"", |
2544 | 0 | BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); |
2545 | | |
2546 | | /* |
2547 | | * insert the right page pointer into the new root page. |
2548 | | */ |
2549 | 0 | Assert(BTreeTupleGetNAtts(right_item, rel) > 0); |
2550 | 0 | Assert(BTreeTupleGetNAtts(right_item, rel) <= |
2551 | 0 | IndexRelationGetNumberOfKeyAttributes(rel)); |
2552 | 0 | if (PageAddItem(rootpage, (Item) right_item, right_item_sz, P_FIRSTKEY, |
2553 | 0 | false, false) == InvalidOffsetNumber) |
2554 | 0 | elog(PANIC, "failed to add rightkey to new root page" |
2555 | 0 | " while splitting block %u of index \"%s\"", |
2556 | 0 | BufferGetBlockNumber(lbuf), RelationGetRelationName(rel)); |
2557 | | |
2558 | | /* Clear the incomplete-split flag in the left child */ |
2559 | 0 | Assert(P_INCOMPLETE_SPLIT(lopaque)); |
2560 | 0 | lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT; |
2561 | 0 | MarkBufferDirty(lbuf); |
2562 | |
|
2563 | 0 | MarkBufferDirty(rootbuf); |
2564 | 0 | MarkBufferDirty(metabuf); |
2565 | | |
2566 | | /* XLOG stuff */ |
2567 | 0 | if (RelationNeedsWAL(rel)) |
2568 | 0 | { |
2569 | 0 | xl_btree_newroot xlrec; |
2570 | 0 | XLogRecPtr recptr; |
2571 | 0 | xl_btree_metadata md; |
2572 | |
|
2573 | 0 | xlrec.rootblk = rootblknum; |
2574 | 0 | xlrec.level = metad->btm_level; |
2575 | |
|
2576 | 0 | XLogBeginInsert(); |
2577 | 0 | XLogRegisterData(&xlrec, SizeOfBtreeNewroot); |
2578 | |
|
2579 | 0 | XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT); |
2580 | 0 | XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD); |
2581 | 0 | XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); |
2582 | |
|
2583 | 0 | Assert(metad->btm_version >= BTREE_NOVAC_VERSION); |
2584 | 0 | md.version = metad->btm_version; |
2585 | 0 | md.root = rootblknum; |
2586 | 0 | md.level = metad->btm_level; |
2587 | 0 | md.fastroot = rootblknum; |
2588 | 0 | md.fastlevel = metad->btm_level; |
2589 | 0 | md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages; |
2590 | 0 | md.allequalimage = metad->btm_allequalimage; |
2591 | |
|
2592 | 0 | XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata)); |
2593 | | |
2594 | | /* |
2595 | | * Direct access to the page is not ideal, but it is faster; we should
2596 | | * eventually add a dedicated function to the page API for this.
2597 | | */ |
2598 | 0 | XLogRegisterBufData(0, |
2599 | 0 | (char *) rootpage + ((PageHeader) rootpage)->pd_upper, |
2600 | 0 | ((PageHeader) rootpage)->pd_special - |
2601 | 0 | ((PageHeader) rootpage)->pd_upper); |
2602 | |
2603 | 0 | recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT); |
2604 | |
2605 | 0 | PageSetLSN(lpage, recptr); |
2606 | 0 | PageSetLSN(rootpage, recptr); |
2607 | 0 | PageSetLSN(metapg, recptr); |
2608 | 0 | } |
2609 | |
2610 | 0 | END_CRIT_SECTION(); |
2611 | | |
2612 | | /* done with metapage */ |
2613 | 0 | _bt_relbuf(rel, metabuf); |
2614 | |
2615 | 0 | pfree(left_item); |
2616 | 0 | pfree(right_item); |
2617 | |
2618 | 0 | return rootbuf; |
2619 | 0 | } |
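/*
 * Editorial sketch, not part of nbtinsert.c: once the code above finishes, the
 * new root holds exactly two items -- the minus-infinity downlink to the old
 * left page at P_HIKEY, and the truncated-high-key downlink to its new right
 * sibling at P_FIRSTKEY.  A minimal way to read those downlinks back, assuming
 * only the headers already included at the top of this file (the helper name
 * is hypothetical):
 */
static void
sketch_read_new_root_downlinks(Page rootpage,
							   BlockNumber *leftchild, BlockNumber *rightchild)
{
	IndexTuple	left_item;
	IndexTuple	right_item;

	/* the item at P_HIKEY is the downlink to the left (original) child */
	left_item = (IndexTuple) PageGetItem(rootpage,
										 PageGetItemId(rootpage, P_HIKEY));
	/* the item at P_FIRSTKEY is the downlink to the new right child */
	right_item = (IndexTuple) PageGetItem(rootpage,
										  PageGetItemId(rootpage, P_FIRSTKEY));

	*leftchild = BTreeTupleGetDownLink(left_item);
	*rightchild = BTreeTupleGetDownLink(right_item);
}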
2620 | | |
2621 | | /* |
2622 | | * _bt_pgaddtup() -- add a data item to a particular page during split. |
2623 | | * |
2624 | | * The difference between this routine and a bare PageAddItem call is |
2625 | | * that this code can deal with the first data item on an internal btree |
2626 | | * page in passing. This data item (which is called "firstright" within |
2627 | | * _bt_split()) has a key that must be treated as minus infinity after |
2628 | | * the split. Therefore, we truncate away all attributes when caller |
2629 | | * specifies it's the first data item on page (downlink is not changed, |
2630 | | * though). This extra step is only needed for the right page of an |
2631 | | * internal page split. There is no need to do this for the first data |
2632 | | * item on the existing/left page, since that will already have been |
2633 | | * truncated during an earlier page split. |
2634 | | * |
2635 | | * See _bt_split() for a high level explanation of why we truncate here. |
2636 | | * Note that this routine has nothing to do with suffix truncation, |
2637 | | * despite using some of the same infrastructure. |
2638 | | */ |
2639 | | static inline bool |
2640 | | _bt_pgaddtup(Page page, |
2641 | | Size itemsize, |
2642 | | IndexTuple itup, |
2643 | | OffsetNumber itup_off, |
2644 | | bool newfirstdataitem) |
2645 | 0 | { |
2646 | 0 | IndexTupleData trunctuple; |
2647 | |
2648 | 0 | if (newfirstdataitem) |
2649 | 0 | { |
2650 | 0 | trunctuple = *itup; |
2651 | 0 | trunctuple.t_info = sizeof(IndexTupleData); |
2652 | 0 | BTreeTupleSetNAtts(&trunctuple, 0, false); |
2653 | 0 | itup = &trunctuple; |
2654 | 0 | itemsize = sizeof(IndexTupleData); |
2655 | 0 | } |
2656 | |
2657 | 0 | if (unlikely(PageAddItem(page, (Item) itup, itemsize, itup_off, false, |
2658 | 0 | false) == InvalidOffsetNumber)) |
2659 | 0 | return false; |
2660 | | |
2661 | 0 | return true; |
2662 | 0 | } |
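/*
 * Editorial sketch, not PostgreSQL code: _bt_pgaddtup() is intended for the
 * loops that fill the two halves of a page split.  Callers pass
 * newfirstdataitem=true only for the first data item of the right half of an
 * *internal* page split, which is what triggers the minus-infinity truncation
 * above.  A simplified caller might look like this ('isleaf', 'rightoff' and
 * 'firstrightoff' are hypothetical stand-ins for the real _bt_split() state):
 */
static void
sketch_add_to_right_page(Page rightpage, bool isleaf,
						 IndexTuple itup, Size itemsz,
						 OffsetNumber rightoff, OffsetNumber firstrightoff)
{
	/* only an internal right page's first data item becomes "minus infinity" */
	bool		newfirstdataitem = (!isleaf && rightoff == firstrightoff);

	if (!_bt_pgaddtup(rightpage, itemsz, itup, rightoff, newfirstdataitem))
		elog(ERROR, "sketch: failed to add item to right page");
}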
2663 | | |
2664 | | /* |
2665 | | * _bt_delete_or_dedup_one_page - Try to avoid a leaf page split. |
2666 | | * |
2667 | | * There are three operations performed here: simple index deletion, bottom-up |
2668 | | * index deletion, and deduplication. If all three operations fail to free |
2669 | | * enough space for the incoming item then caller will go on to split the |
2670 | | * page. We always consider simple deletion first. If that doesn't work out |
2671 | | * we consider alternatives. Callers that only want us to consider simple |
2672 | | * deletion (without any fallback) ask for that using the 'simpleonly' |
2673 | | * argument. |
2674 | | * |
2675 | | * We usually pick only one alternative "complex" operation when simple |
2676 | | * deletion alone won't prevent a page split. The 'checkingunique', |
2677 | | * 'uniquedup', and 'indexUnchanged' arguments are used for that. |
2678 | | * |
2679 | | * Note: We used to only delete LP_DEAD items when the BTP_HAS_GARBAGE page |
2680 | | * level flag was found set. The flag was useful back when there wasn't |
2681 | | * necessarily one single page for a duplicate tuple to go on (before heap TID |
2682 | | * became a part of the key space in version 4 indexes). But we don't |
2683 | | * actually look at the flag anymore (it's not a gating condition for our |
2684 | | * caller). That would cause us to miss tuples that are safe to delete, |
2685 | | * without getting any benefit in return. We know that the alternative is to |
2686 | | * split the page; scanning the line pointer array in passing won't have |
2687 | | * noticeable overhead. (We still maintain the BTP_HAS_GARBAGE flag despite |
2688 | | * all this because !heapkeyspace indexes must still do a "getting tired" |
2689 | | * linear search, and so are likely to get some benefit from using it as a |
2690 | | * gating condition.) |
2691 | | */ |
2692 | | static void |
2693 | | _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, |
2694 | | BTInsertState insertstate, |
2695 | | bool simpleonly, bool checkingunique, |
2696 | | bool uniquedup, bool indexUnchanged) |
2697 | 0 | { |
2698 | 0 | OffsetNumber deletable[MaxIndexTuplesPerPage]; |
2699 | 0 | int ndeletable = 0; |
2700 | 0 | OffsetNumber offnum, |
2701 | 0 | minoff, |
2702 | 0 | maxoff; |
2703 | 0 | Buffer buffer = insertstate->buf; |
2704 | 0 | BTScanInsert itup_key = insertstate->itup_key; |
2705 | 0 | Page page = BufferGetPage(buffer); |
2706 | 0 | BTPageOpaque opaque = BTPageGetOpaque(page); |
2707 | |
2708 | 0 | Assert(P_ISLEAF(opaque)); |
2709 | 0 | Assert(simpleonly || itup_key->heapkeyspace); |
2710 | 0 | Assert(!simpleonly || (!checkingunique && !uniquedup && !indexUnchanged)); |
2711 | | |
2712 | | /* |
2713 | | * Scan over all items to see which ones need to be deleted according to |
2714 | | * LP_DEAD flags. We'll usually manage to delete a few extra items that |
2715 | | * are not marked LP_DEAD in passing. Often the extra items that actually |
2716 | | * end up getting deleted are items that would have had their LP_DEAD bit |
2717 | | * set before long anyway (if we opted not to include them as extras). |
2718 | | */ |
2719 | 0 | minoff = P_FIRSTDATAKEY(opaque); |
2720 | 0 | maxoff = PageGetMaxOffsetNumber(page); |
2721 | 0 | for (offnum = minoff; |
2722 | 0 | offnum <= maxoff; |
2723 | 0 | offnum = OffsetNumberNext(offnum)) |
2724 | 0 | { |
2725 | 0 | ItemId itemId = PageGetItemId(page, offnum); |
2726 | |
2727 | 0 | if (ItemIdIsDead(itemId)) |
2728 | 0 | deletable[ndeletable++] = offnum; |
2729 | 0 | } |
2730 | |
2731 | 0 | if (ndeletable > 0) |
2732 | 0 | { |
2733 | 0 | _bt_simpledel_pass(rel, buffer, heapRel, deletable, ndeletable, |
2734 | 0 | insertstate->itup, minoff, maxoff); |
2735 | 0 | insertstate->bounds_valid = false; |
2736 | | |
2737 | | /* Return when a page split has already been avoided */ |
2738 | 0 | if (PageGetFreeSpace(page) >= insertstate->itemsz) |
2739 | 0 | return; |
2740 | | |
2741 | | /* Might as well assume duplicates (if checkingunique) */ |
2742 | 0 | uniquedup = true; |
2743 | 0 | } |
2744 | | |
2745 | | /* |
2746 | | * We're done with simple deletion. Return early with callers that only |
2747 | | * call here so that simple deletion can be considered. This includes |
2748 | | * callers that explicitly ask for this and checkingunique callers that |
2749 | | * probably don't have any version churn duplicates on the page. |
2750 | | * |
2751 | | * Note: The page's BTP_HAS_GARBAGE hint flag may still be set when we |
2752 | | * return at this point (or when we go on to try either or both of our
2753 | | * other strategies and they also fail). We do not bother expending a |
2754 | | * separate write to clear it, however. Caller will definitely clear it |
2755 | | * when it goes on to split the page (note also that the deduplication |
2756 | | * process will clear the flag in passing, just to keep things tidy). |
2757 | | */ |
2758 | 0 | if (simpleonly || (checkingunique && !uniquedup)) |
2759 | 0 | { |
2760 | 0 | Assert(!indexUnchanged); |
2761 | 0 | return; |
2762 | 0 | } |
2763 | | |
2764 | | /* Assume bounds about to be invalidated (this is almost certain now) */ |
2765 | 0 | insertstate->bounds_valid = false; |
2766 | | |
2767 | | /* |
2768 | | * Perform bottom-up index deletion pass when executor hint indicated that |
2769 | | * incoming item is logically unchanged, or for a unique index that is |
2770 | | * known to have physical duplicates for some other reason. (There is a |
2771 | | * large overlap between these two cases for a unique index. It's worth |
2772 | | * having both triggering conditions in order to apply the optimization in |
2773 | | * the event of successive related INSERT and DELETE statements.) |
2774 | | * |
2775 | | * We'll go on to do a deduplication pass when a bottom-up pass fails to |
2776 | | * delete an acceptable amount of free space (a significant fraction of |
2777 | | * the page, or space for the new item, whichever is greater). |
2778 | | * |
2779 | | * Note: Bottom-up index deletion uses the same equality/equivalence |
2780 | | * routines as deduplication internally. However, it does not merge |
2781 | | * together index tuples, so the same correctness considerations do not |
2782 | | * apply. We deliberately omit an index-is-allequalimage test here. |
2783 | | */ |
2784 | 0 | if ((indexUnchanged || uniquedup) && |
2785 | 0 | _bt_bottomupdel_pass(rel, buffer, heapRel, insertstate->itemsz)) |
2786 | 0 | return; |
2787 | | |
2788 | | /* Perform deduplication pass (when enabled and index-is-allequalimage) */ |
2789 | 0 | if (BTGetDeduplicateItems(rel) && itup_key->allequalimage) |
2790 | 0 | _bt_dedup_pass(rel, buffer, insertstate->itup, insertstate->itemsz, |
2791 | 0 | (indexUnchanged || uniquedup)); |
2792 | 0 | } |
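/*
 * Editorial summary of the control flow above, derived directly from the
 * function body (added for readability only):
 *
 * 1. Run a simple deletion pass whenever the page has LP_DEAD items; return
 *    immediately if that frees at least insertstate->itemsz bytes.
 * 2. Return without trying anything else when 'simpleonly' was requested, or
 *    for a checkingunique caller with no known duplicates.
 * 3. Otherwise try a bottom-up deletion pass when the executor hinted that
 *    the incoming item is logically unchanged, or when duplicates are
 *    expected ('uniquedup'); return if that pass reports success.
 * 4. Finally fall back on a deduplication pass, provided deduplication is
 *    enabled for the index and the index is allequalimage.
 */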
2793 | | |
2794 | | /* |
2795 | | * _bt_simpledel_pass - Simple index tuple deletion pass. |
2796 | | * |
2797 | | * We delete all LP_DEAD-set index tuples on a leaf page. The offset numbers |
2798 | | * of all such tuples are determined by caller (caller passes these to us as |
2799 | | * its 'deletable' argument). |
2800 | | * |
2801 | | * We might also delete extra index tuples that turn out to be safe to delete |
2802 | | * in passing (though they must be cheap to check in passing to begin with). |
2803 | | * There is no certainty that any extra tuples will be deleted, though. The |
2804 | | * high level goal of the approach we take is to get the most out of each call |
2805 | | * here (without noticeably increasing the per-call overhead compared to what |
2806 | | * we need to do just to be able to delete the page's LP_DEAD-marked index |
2807 | | * tuples). |
2808 | | * |
2809 | | * The number of extra index tuples that turn out to be deletable might |
2810 | | * greatly exceed the number of LP_DEAD-marked index tuples due to various |
2811 | | * locality related effects. For example, it's possible that the total number |
2812 | | * of table blocks (pointed to by all TIDs on the leaf page) is naturally |
2813 | | * quite low, in which case we might end up checking if it's possible to |
2814 | | * delete _most_ index tuples on the page (without the tableam needing to |
2815 | | * access additional table blocks). The tableam will sometimes stumble upon |
2816 | | * _many_ extra deletable index tuples in indexes where this pattern is |
2817 | | * common. |
2818 | | * |
2819 | | * See nbtree/README for further details on simple index tuple deletion. |
2820 | | */ |
2821 | | static void |
2822 | | _bt_simpledel_pass(Relation rel, Buffer buffer, Relation heapRel, |
2823 | | OffsetNumber *deletable, int ndeletable, IndexTuple newitem, |
2824 | | OffsetNumber minoff, OffsetNumber maxoff) |
2825 | 0 | { |
2826 | 0 | Page page = BufferGetPage(buffer); |
2827 | 0 | BlockNumber *deadblocks; |
2828 | 0 | int ndeadblocks; |
2829 | 0 | TM_IndexDeleteOp delstate; |
2830 | 0 | OffsetNumber offnum; |
2831 | | |
2832 | | /* Get array of table blocks pointed to by LP_DEAD-set tuples */ |
2833 | 0 | deadblocks = _bt_deadblocks(page, deletable, ndeletable, newitem, |
2834 | 0 | &ndeadblocks); |
2835 | | |
2836 | | /* Initialize tableam state that describes index deletion operation */ |
2837 | 0 | delstate.irel = rel; |
2838 | 0 | delstate.iblknum = BufferGetBlockNumber(buffer); |
2839 | 0 | delstate.bottomup = false; |
2840 | 0 | delstate.bottomupfreespace = 0; |
2841 | 0 | delstate.ndeltids = 0; |
2842 | 0 | delstate.deltids = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexDelete)); |
2843 | 0 | delstate.status = palloc(MaxTIDsPerBTreePage * sizeof(TM_IndexStatus)); |
2844 | |
2845 | 0 | for (offnum = minoff; |
2846 | 0 | offnum <= maxoff; |
2847 | 0 | offnum = OffsetNumberNext(offnum)) |
2848 | 0 | { |
2849 | 0 | ItemId itemid = PageGetItemId(page, offnum); |
2850 | 0 | IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); |
2851 | 0 | TM_IndexDelete *odeltid = &delstate.deltids[delstate.ndeltids]; |
2852 | 0 | TM_IndexStatus *ostatus = &delstate.status[delstate.ndeltids]; |
2853 | 0 | BlockNumber tidblock; |
2854 | 0 | void *match; |
2855 | |
|
2856 | 0 | if (!BTreeTupleIsPosting(itup)) |
2857 | 0 | { |
2858 | 0 | tidblock = ItemPointerGetBlockNumber(&itup->t_tid); |
2859 | 0 | match = bsearch(&tidblock, deadblocks, ndeadblocks, |
2860 | 0 | sizeof(BlockNumber), _bt_blk_cmp); |
2861 | |
2862 | 0 | if (!match) |
2863 | 0 | { |
2864 | 0 | Assert(!ItemIdIsDead(itemid)); |
2865 | 0 | continue; |
2866 | 0 | } |
2867 | | |
2868 | | /* |
2869 | | * TID's table block is among those pointed to by the TIDs from |
2870 | | * LP_DEAD-bit set tuples on page -- add TID to deltids |
2871 | | */ |
2872 | 0 | odeltid->tid = itup->t_tid; |
2873 | 0 | odeltid->id = delstate.ndeltids; |
2874 | 0 | ostatus->idxoffnum = offnum; |
2875 | 0 | ostatus->knowndeletable = ItemIdIsDead(itemid); |
2876 | 0 | ostatus->promising = false; /* unused */ |
2877 | 0 | ostatus->freespace = 0; /* unused */ |
2878 | |
2879 | 0 | delstate.ndeltids++; |
2880 | 0 | } |
2881 | 0 | else |
2882 | 0 | { |
2883 | 0 | int nitem = BTreeTupleGetNPosting(itup); |
2884 | |
2885 | 0 | for (int p = 0; p < nitem; p++) |
2886 | 0 | { |
2887 | 0 | ItemPointer tid = BTreeTupleGetPostingN(itup, p); |
2888 | |
2889 | 0 | tidblock = ItemPointerGetBlockNumber(tid); |
2890 | 0 | match = bsearch(&tidblock, deadblocks, ndeadblocks, |
2891 | 0 | sizeof(BlockNumber), _bt_blk_cmp); |
2892 | |
2893 | 0 | if (!match) |
2894 | 0 | { |
2895 | 0 | Assert(!ItemIdIsDead(itemid)); |
2896 | 0 | continue; |
2897 | 0 | } |
2898 | | |
2899 | | /* |
2900 | | * TID's table block is among those pointed to by the TIDs |
2901 | | * from LP_DEAD-bit set tuples on page -- add TID to deltids |
2902 | | */ |
2903 | 0 | odeltid->tid = *tid; |
2904 | 0 | odeltid->id = delstate.ndeltids; |
2905 | 0 | ostatus->idxoffnum = offnum; |
2906 | 0 | ostatus->knowndeletable = ItemIdIsDead(itemid); |
2907 | 0 | ostatus->promising = false; /* unused */ |
2908 | 0 | ostatus->freespace = 0; /* unused */ |
2909 | |
2910 | 0 | odeltid++; |
2911 | 0 | ostatus++; |
2912 | 0 | delstate.ndeltids++; |
2913 | 0 | } |
2914 | 0 | } |
2915 | 0 | } |
2916 | |
2917 | 0 | pfree(deadblocks); |
2918 | |
2919 | 0 | Assert(delstate.ndeltids >= ndeletable); |
2920 | | |
2921 | | /* Physically delete LP_DEAD tuples (plus any delete-safe extra TIDs) */ |
2922 | 0 | _bt_delitems_delete_check(rel, buffer, heapRel, &delstate); |
2923 | |
2924 | 0 | pfree(delstate.deltids); |
2925 | 0 | pfree(delstate.status); |
2926 | 0 | } |
2927 | | |
2928 | | /* |
2929 | | * _bt_deadblocks() -- Get LP_DEAD related table blocks. |
2930 | | * |
2931 | | * Builds sorted and unique-ified array of table block numbers from index |
2932 | | * tuple TIDs whose line pointers are marked LP_DEAD. Also adds the table |
2933 | | * block from incoming newitem just in case it isn't among the LP_DEAD-related |
2934 | | * table blocks. |
2935 | | * |
2936 | | * Always counting the newitem's table block as an LP_DEAD related block makes |
2937 | | * sense because the cost is consistently low; it is practically certain that |
2938 | | * the table block will not incur a buffer miss in tableam. On the other hand |
2939 | | * the benefit is often quite high. There is a decent chance that there will |
2940 | | * be some deletable items from this block, since in general most garbage |
2941 | | * tuples became garbage in the recent past (in many cases this won't be the |
2942 | | * first logical row that core code added to/modified in the table block
2943 | | * recently). |
2944 | | * |
2945 | | * Returns final array, and sets *nblocks to its final size for caller. |
2946 | | */ |
2947 | | static BlockNumber * |
2948 | | _bt_deadblocks(Page page, OffsetNumber *deletable, int ndeletable, |
2949 | | IndexTuple newitem, int *nblocks) |
2950 | 0 | { |
2951 | 0 | int spacentids, |
2952 | 0 | ntids; |
2953 | 0 | BlockNumber *tidblocks; |
2954 | | |
2955 | | /* |
2956 | | * Accumulate each TID's block in array whose initial size has space for |
2957 | | * one table block per LP_DEAD-set tuple (plus space for the newitem table |
2958 | | * block). Array will only need to grow when there are LP_DEAD-marked |
2959 | | * posting list tuples (which is not that common). |
2960 | | */ |
2961 | 0 | spacentids = ndeletable + 1; |
2962 | 0 | ntids = 0; |
2963 | 0 | tidblocks = (BlockNumber *) palloc(sizeof(BlockNumber) * spacentids); |
2964 | | |
2965 | | /* |
2966 | | * First add the table block for the incoming newitem. This is the one |
2967 | | * case where simple deletion can visit a table block that doesn't have |
2968 | | * any known deletable items. |
2969 | | */ |
2970 | 0 | Assert(!BTreeTupleIsPosting(newitem) && !BTreeTupleIsPivot(newitem)); |
2971 | 0 | tidblocks[ntids++] = ItemPointerGetBlockNumber(&newitem->t_tid); |
2972 | |
2973 | 0 | for (int i = 0; i < ndeletable; i++) |
2974 | 0 | { |
2975 | 0 | ItemId itemid = PageGetItemId(page, deletable[i]); |
2976 | 0 | IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); |
2977 | |
2978 | 0 | Assert(ItemIdIsDead(itemid)); |
2979 | |
2980 | 0 | if (!BTreeTupleIsPosting(itup)) |
2981 | 0 | { |
2982 | 0 | if (ntids + 1 > spacentids) |
2983 | 0 | { |
2984 | 0 | spacentids *= 2; |
2985 | 0 | tidblocks = (BlockNumber *) |
2986 | 0 | repalloc(tidblocks, sizeof(BlockNumber) * spacentids); |
2987 | 0 | } |
2988 | |
2989 | 0 | tidblocks[ntids++] = ItemPointerGetBlockNumber(&itup->t_tid); |
2990 | 0 | } |
2991 | 0 | else |
2992 | 0 | { |
2993 | 0 | int nposting = BTreeTupleGetNPosting(itup); |
2994 | |
2995 | 0 | if (ntids + nposting > spacentids) |
2996 | 0 | { |
2997 | 0 | spacentids = Max(spacentids * 2, ntids + nposting); |
2998 | 0 | tidblocks = (BlockNumber *) |
2999 | 0 | repalloc(tidblocks, sizeof(BlockNumber) * spacentids); |
3000 | 0 | } |
3001 | |
3002 | 0 | for (int j = 0; j < nposting; j++) |
3003 | 0 | { |
3004 | 0 | ItemPointer tid = BTreeTupleGetPostingN(itup, j); |
3005 | |
3006 | 0 | tidblocks[ntids++] = ItemPointerGetBlockNumber(tid); |
3007 | 0 | } |
3008 | 0 | } |
3009 | 0 | } |
3010 | |
3011 | 0 | qsort(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp); |
3012 | 0 | *nblocks = qunique(tidblocks, ntids, sizeof(BlockNumber), _bt_blk_cmp); |
3013 | |
|
3014 | 0 | return tidblocks; |
3015 | 0 | } |
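/*
 * Editorial worked example for the sizing logic above (illustrative numbers
 * only): with ndeletable = 3, where two LP_DEAD items are plain tuples and one
 * is a posting list tuple holding 5 TIDs, the array starts at spacentids = 4
 * (ndeletable + 1 for newitem).  After adding newitem's block and the two
 * plain blocks, ntids = 3; the posting list then needs ntids + 5 = 8 slots, so
 * spacentids grows to Max(4 * 2, 8) = 8 and ntids ends up at 8.  The final
 * qsort + qunique step collapses duplicate block numbers, so *nblocks can be
 * much smaller than ntids.
 */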
3016 | | |
3017 | | /* |
3018 | | * _bt_blk_cmp() -- qsort comparison function for _bt_simpledel_pass |
3019 | | */ |
3020 | | static inline int |
3021 | | _bt_blk_cmp(const void *arg1, const void *arg2) |
3022 | 0 | { |
3023 | 0 | BlockNumber b1 = *((BlockNumber *) arg1); |
3024 | 0 | BlockNumber b2 = *((BlockNumber *) arg2); |
3025 | |
3026 | 0 | return pg_cmp_u32(b1, b2); |
3027 | 0 | } |
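/*
 * Editorial sketch, not PostgreSQL code: _bt_blk_cmp() is shared by the
 * qsort()/qunique() calls in _bt_deadblocks() and the bsearch() probes in
 * _bt_simpledel_pass().  The combined pattern looks roughly like this
 * (hypothetical helper; assumes only the headers included at the top of this
 * file, notably lib/qunique.h):
 */
static bool
sketch_block_is_member(BlockNumber probe, BlockNumber *blocks, int nblocks)
{
	/* sort and unique-ify the block array in place, as _bt_deadblocks() does */
	qsort(blocks, nblocks, sizeof(BlockNumber), _bt_blk_cmp);
	nblocks = (int) qunique(blocks, nblocks, sizeof(BlockNumber), _bt_blk_cmp);

	/* membership now reduces to a binary search, as in _bt_simpledel_pass() */
	return bsearch(&probe, blocks, nblocks, sizeof(BlockNumber),
				   _bt_blk_cmp) != NULL;
}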