/src/postgres/src/backend/access/nbtree/nbtsplitloc.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * nbtsplitloc.c |
4 | | * Choose split point code for Postgres btree implementation. |
5 | | * |
6 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/backend/access/nbtree/nbtsplitloc.c |
12 | | * |
13 | | *------------------------------------------------------------------------- |
14 | | */ |
15 | | #include "postgres.h" |
16 | | |
17 | | #include "access/nbtree.h" |
18 | | #include "access/tableam.h" |
19 | | #include "common/int.h" |
20 | | |
21 | | typedef enum |
22 | | { |
23 | | /* strategy for searching through materialized list of split points */ |
24 | | SPLIT_DEFAULT, /* give some weight to truncation */ |
25 | | SPLIT_MANY_DUPLICATES, /* find minimally distinguishing point */ |
26 | | SPLIT_SINGLE_VALUE, /* leave left page almost full */ |
27 | | } FindSplitStrat; |
28 | | |
29 | | typedef struct |
30 | | { |
31 | | /* details of free space left by split */ |
32 | | int16 curdelta; /* current leftfree/rightfree delta */ |
33 | | int16 leftfree; /* space left on left page post-split */ |
34 | | int16 rightfree; /* space left on right page post-split */ |
35 | | |
36 | | /* split point identifying fields (returned by _bt_findsplitloc) */ |
37 | | OffsetNumber firstrightoff; /* first origpage item on rightpage */ |
38 | | bool newitemonleft; /* new item goes on left, or right? */ |
39 | | } SplitPoint; |
40 | | |
41 | | typedef struct |
42 | | { |
43 | | /* context data for _bt_recsplitloc */ |
44 | | Relation rel; /* index relation */ |
45 | | Page origpage; /* page undergoing split */ |
46 | | IndexTuple newitem; /* new item (cause of page split) */ |
47 | | Size newitemsz; /* size of newitem (includes line pointer) */ |
48 | | bool is_leaf; /* T if splitting a leaf page */ |
49 | | bool is_rightmost; /* T if splitting rightmost page on level */ |
50 | | OffsetNumber newitemoff; /* where the new item is to be inserted */ |
51 | | int leftspace; /* space available for items on left page */ |
52 | | int rightspace; /* space available for items on right page */ |
53 | | int olddataitemstotal; /* space taken by old items */ |
54 | | Size minfirstrightsz; /* smallest firstright size */ |
55 | | |
56 | | /* candidate split point data */ |
57 | | int maxsplits; /* maximum number of splits */ |
58 | | int nsplits; /* current number of splits */ |
59 | | SplitPoint *splits; /* all candidate split points for page */ |
60 | | int interval; /* current range of acceptable split points */ |
61 | | } FindSplitData; |
62 | | |
63 | | static void _bt_recsplitloc(FindSplitData *state, |
64 | | OffsetNumber firstrightoff, bool newitemonleft, |
65 | | int olddataitemstoleft, |
66 | | Size firstrightofforigpagetuplesz); |
67 | | static void _bt_deltasortsplits(FindSplitData *state, double fillfactormult, |
68 | | bool usemult); |
69 | | static int _bt_splitcmp(const void *arg1, const void *arg2); |
70 | | static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, |
71 | | int leaffillfactor, bool *usemult); |
72 | | static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid); |
73 | | static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, |
74 | | bool *newitemonleft, FindSplitStrat strategy); |
75 | | static int _bt_defaultinterval(FindSplitData *state); |
76 | | static int _bt_strategy(FindSplitData *state, SplitPoint *leftpage, |
77 | | SplitPoint *rightpage, FindSplitStrat *strategy); |
78 | | static void _bt_interval_edges(FindSplitData *state, |
79 | | SplitPoint **leftinterval, SplitPoint **rightinterval); |
80 | | static inline int _bt_split_penalty(FindSplitData *state, SplitPoint *split); |
81 | | static inline IndexTuple _bt_split_lastleft(FindSplitData *state, |
82 | | SplitPoint *split); |
83 | | static inline IndexTuple _bt_split_firstright(FindSplitData *state, |
84 | | SplitPoint *split); |
85 | | |
86 | | |
87 | | /* |
88 | | * _bt_findsplitloc() -- find an appropriate place to split a page. |
89 | | * |
90 | | * The main goal here is to equalize the free space that will be on each |
91 | | * split page, *after accounting for the inserted tuple*. (If we fail to |
92 | | * account for it, we might find ourselves with too little room on the page |
93 | | * that it needs to go into!) |
94 | | * |
95 | | * If the page is the rightmost page on its level, we instead try to arrange |
96 | | * to leave the left split page fillfactor% full. In this way, when we are |
97 | | * inserting successively increasing keys (consider sequences, timestamps, |
98 | | * etc) we will end up with a tree whose pages are about fillfactor% full, |
99 | | * instead of the 50% full result that we'd get without this special case. |
100 | | * This is the same as nbtsort.c produces for a newly-created tree. Note |
101 | | * that leaf and nonleaf pages use different fillfactors. Note also that |
102 | | * there are a number of further special cases where fillfactor is not |
103 | | * applied in the standard way. |
104 | | * |
105 | | * We are passed the intended insert position of the new tuple, expressed as |
106 | | * the offsetnumber of the tuple it must go in front of (this could be |
107 | | * maxoff+1 if the tuple is to go at the end). The new tuple itself is also |
108 | | * passed, since it's needed to give some weight to how effective suffix |
109 | | * truncation will be. The implementation picks the split point that |
110 | | * maximizes the effectiveness of suffix truncation from a small list of |
111 | | * alternative candidate split points that leave each side of the split with |
112 | | * about the same share of free space. Suffix truncation is secondary to |
113 | | * equalizing free space, except in cases with large numbers of duplicates. |
114 | | * Note that it is always assumed that caller goes on to perform truncation, |
115 | | * even with pg_upgrade'd indexes where that isn't actually the case |
116 | | * (!heapkeyspace indexes). See nbtree/README for more information about |
117 | | * suffix truncation. |
118 | | * |
119 | | * We return the index of the first existing tuple that should go on the |
120 | | * righthand page (which is called firstrightoff), plus a boolean |
121 | | * indicating whether the new tuple goes on the left or right page. You |
122 | | * can think of the returned state as a point _between_ two adjacent data |
123 | | * items (lastleft and firstright data items) on an imaginary version of |
124 | | * origpage that already includes newitem. The bool is necessary to |
125 | | * disambiguate the case where firstrightoff == newitemoff (i.e. it is |
126 | | * sometimes needed to determine if the firstright tuple for the split is |
127 | | * newitem rather than the tuple from origpage at offset firstrightoff). |
128 | | */ |
129 | | OffsetNumber |
130 | | _bt_findsplitloc(Relation rel, |
131 | | Page origpage, |
132 | | OffsetNumber newitemoff, |
133 | | Size newitemsz, |
134 | | IndexTuple newitem, |
135 | | bool *newitemonleft) |
136 | 0 | { |
137 | 0 | BTPageOpaque opaque; |
138 | 0 | int leftspace, |
139 | 0 | rightspace, |
140 | 0 | olddataitemstotal, |
141 | 0 | olddataitemstoleft, |
142 | 0 | perfectpenalty, |
143 | 0 | leaffillfactor; |
144 | 0 | FindSplitData state; |
145 | 0 | FindSplitStrat strategy; |
146 | 0 | ItemId itemid; |
147 | 0 | OffsetNumber offnum, |
148 | 0 | maxoff, |
149 | 0 | firstrightoff; |
150 | 0 | double fillfactormult; |
151 | 0 | bool usemult; |
152 | 0 | SplitPoint leftpage, |
153 | 0 | rightpage; |
154 | |
155 | 0 | opaque = BTPageGetOpaque(origpage); |
156 | 0 | maxoff = PageGetMaxOffsetNumber(origpage); |
157 | | |
158 | | /* Total free space available on a btree page, after fixed overhead */ |
159 | 0 | leftspace = rightspace = |
160 | 0 | PageGetPageSize(origpage) - SizeOfPageHeaderData - |
161 | 0 | MAXALIGN(sizeof(BTPageOpaqueData)); |
162 | | |
163 | | /* The right page will have the same high key as the old page */ |
164 | 0 | if (!P_RIGHTMOST(opaque)) |
165 | 0 | { |
166 | 0 | itemid = PageGetItemId(origpage, P_HIKEY); |
167 | 0 | rightspace -= (int) (MAXALIGN(ItemIdGetLength(itemid)) + |
168 | 0 | sizeof(ItemIdData)); |
169 | 0 | } |
170 | | |
171 | | /* Count up total space in data items before actually scanning 'em */ |
172 | 0 | olddataitemstotal = rightspace - (int) PageGetExactFreeSpace(origpage); |
173 | 0 | leaffillfactor = BTGetFillFactor(rel); |
174 | | |
175 | | /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ |
176 | 0 | newitemsz += sizeof(ItemIdData); |
177 | 0 | state.rel = rel; |
178 | 0 | state.origpage = origpage; |
179 | 0 | state.newitem = newitem; |
180 | 0 | state.newitemsz = newitemsz; |
181 | 0 | state.is_leaf = P_ISLEAF(opaque); |
182 | 0 | state.is_rightmost = P_RIGHTMOST(opaque); |
183 | 0 | state.leftspace = leftspace; |
184 | 0 | state.rightspace = rightspace; |
185 | 0 | state.olddataitemstotal = olddataitemstotal; |
186 | 0 | state.minfirstrightsz = SIZE_MAX; |
187 | 0 | state.newitemoff = newitemoff; |
188 | | |
189 | | /* newitem cannot be a posting list item */ |
190 | 0 | Assert(!BTreeTupleIsPosting(newitem)); |
191 | | |
192 | | /* |
193 | | * nsplits should never exceed maxoff because there will be at most as |
194 | | * many candidate split points as there are points _between_ tuples, once |
195 | | * you imagine that the new item is already on the original page (the |
196 | | * final number of splits may be slightly lower because not all points |
197 | | * between tuples will be legal). |
198 | | */ |
199 | 0 | state.maxsplits = maxoff; |
200 | 0 | state.splits = palloc(sizeof(SplitPoint) * state.maxsplits); |
201 | 0 | state.nsplits = 0; |
202 | | |
203 | | /* |
204 | | * Scan through the data items and calculate space usage for a split at |
205 | | * each possible position |
206 | | */ |
207 | 0 | olddataitemstoleft = 0; |
208 | |
209 | 0 | for (offnum = P_FIRSTDATAKEY(opaque); |
210 | 0 | offnum <= maxoff; |
211 | 0 | offnum = OffsetNumberNext(offnum)) |
212 | 0 | { |
213 | 0 | Size itemsz; |
214 | |
215 | 0 | itemid = PageGetItemId(origpage, offnum); |
216 | 0 | itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); |
217 | | |
218 | | /* |
219 | | * When item offset number is not newitemoff, neither side of the |
220 | | * split can be newitem. Record a split after the previous data item |
221 | | * from original page, but before the current data item from original |
222 | | * page. (_bt_recsplitloc() will reject the split when there are no |
223 | | * previous items, which we rely on.) |
224 | | */ |
225 | 0 | if (offnum < newitemoff) |
226 | 0 | _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz); |
227 | 0 | else if (offnum > newitemoff) |
228 | 0 | _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz); |
229 | 0 | else |
230 | 0 | { |
231 | | /* |
232 | | * Record a split after all "offnum < newitemoff" original page |
233 | | * data items, but before newitem |
234 | | */ |
235 | 0 | _bt_recsplitloc(&state, offnum, false, olddataitemstoleft, itemsz); |
236 | | |
237 | | /* |
238 | | * Record a split after newitem, but before data item from |
239 | | * original page at offset newitemoff/current offset |
240 | | */ |
241 | 0 | _bt_recsplitloc(&state, offnum, true, olddataitemstoleft, itemsz); |
242 | 0 | } |
243 | |
244 | 0 | olddataitemstoleft += itemsz; |
245 | 0 | } |
246 | | |
247 | | /* |
248 | | * Record a split after all original page data items, but before newitem. |
249 | | * (Though only when it's possible that newitem will end up alone on new |
250 | | * right page.) |
251 | | */ |
252 | 0 | Assert(olddataitemstoleft == olddataitemstotal); |
253 | 0 | if (newitemoff > maxoff) |
254 | 0 | _bt_recsplitloc(&state, newitemoff, false, olddataitemstotal, 0); |
255 | | |
256 | | /* |
257 | | * I believe it is not possible to fail to find a feasible split, but just |
258 | | * in case ... |
259 | | */ |
260 | 0 | if (state.nsplits == 0) |
261 | 0 | elog(ERROR, "could not find a feasible split point for index \"%s\"", |
262 | 0 | RelationGetRelationName(rel)); |
263 | | |
264 | | /* |
265 | | * Start search for a split point among list of legal split points. Give |
266 | | * primary consideration to equalizing available free space in each half |
267 | | * of the split initially (start with default strategy), while applying |
268 | | * rightmost and split-after-new-item optimizations where appropriate. |
269 | | * Either of the two other fallback strategies may be required for cases |
270 | | * with a large number of duplicates around the original/space-optimal |
271 | | * split point. |
272 | | * |
273 | | * Default strategy gives some weight to suffix truncation in deciding a |
274 | | * split point on leaf pages. It attempts to select a split point where a |
275 | | * distinguishing attribute appears earlier in the new high key for the |
276 | | * left side of the split, in order to maximize the number of trailing |
277 | | * attributes that can be truncated away. Only candidate split points |
278 | | * that imply an acceptable balance of free space on each side are |
279 | | * considered. See _bt_defaultinterval(). |
280 | | */ |
281 | 0 | if (!state.is_leaf) |
282 | 0 | { |
283 | | /* fillfactormult only used on rightmost page */ |
284 | 0 | usemult = state.is_rightmost; |
285 | 0 | fillfactormult = BTREE_NONLEAF_FILLFACTOR / 100.0; |
286 | 0 | } |
287 | 0 | else if (state.is_rightmost) |
288 | 0 | { |
289 | | /* Rightmost leaf page -- fillfactormult always used */ |
290 | 0 | usemult = true; |
291 | 0 | fillfactormult = leaffillfactor / 100.0; |
292 | 0 | } |
293 | 0 | else if (_bt_afternewitemoff(&state, maxoff, leaffillfactor, &usemult)) |
294 | 0 | { |
295 | | /* |
296 | | * New item inserted at rightmost point among a localized grouping on |
297 | | * a leaf page -- apply "split after new item" optimization, either by |
298 | | * applying leaf fillfactor multiplier, or by choosing the exact split |
299 | | * point that leaves newitem as lastleft. (usemult is set for us.) |
300 | | */ |
301 | 0 | if (usemult) |
302 | 0 | { |
303 | | /* fillfactormult should be set based on leaf fillfactor */ |
304 | 0 | fillfactormult = leaffillfactor / 100.0; |
305 | 0 | } |
306 | 0 | else |
307 | 0 | { |
308 | | /* find precise split point after newitemoff */ |
309 | 0 | for (int i = 0; i < state.nsplits; i++) |
310 | 0 | { |
311 | 0 | SplitPoint *split = state.splits + i; |
312 | |
313 | 0 | if (split->newitemonleft && |
314 | 0 | newitemoff == split->firstrightoff) |
315 | 0 | { |
316 | 0 | pfree(state.splits); |
317 | 0 | *newitemonleft = true; |
318 | 0 | return newitemoff; |
319 | 0 | } |
320 | 0 | } |
321 | | |
322 | | /* |
323 | | * Cannot legally split after newitemoff; proceed with split |
324 | | * without using fillfactor multiplier. This is defensive, and |
325 | | * should never be needed in practice. |
326 | | */ |
327 | 0 | fillfactormult = 0.50; |
328 | 0 | } |
329 | 0 | } |
330 | 0 | else |
331 | 0 | { |
332 | | /* Other leaf page. 50:50 page split. */ |
333 | 0 | usemult = false; |
334 | | /* fillfactormult not used, but be tidy */ |
335 | 0 | fillfactormult = 0.50; |
336 | 0 | } |
337 | | |
338 | | /* |
339 | | * Save leftmost and rightmost splits for page before original ordinal |
340 | | * sort order is lost by delta/fillfactormult sort |
341 | | */ |
342 | 0 | leftpage = state.splits[0]; |
343 | 0 | rightpage = state.splits[state.nsplits - 1]; |
344 | | |
345 | | /* Give split points a fillfactormult-wise delta, and sort on deltas */ |
346 | 0 | _bt_deltasortsplits(&state, fillfactormult, usemult); |
347 | | |
348 | | /* Determine split interval for default strategy */ |
349 | 0 | state.interval = _bt_defaultinterval(&state); |
350 | | |
351 | | /* |
352 | | * Determine if default strategy/split interval will produce a |
353 | | * sufficiently distinguishing split, or if we should change strategies. |
354 | | * Alternative strategies change the range of split points that are |
355 | | * considered acceptable (split interval), and possibly change |
356 | | * fillfactormult, in order to deal with pages with a large number of |
357 | | * duplicates gracefully. |
358 | | * |
359 | | * Pass low and high splits for the entire page (actually, they're for an |
360 | | * imaginary version of the page that includes newitem). These are used |
361 | | * when the initial split interval encloses split points that are full of |
362 | | * duplicates, and we need to consider if it's even possible to avoid |
363 | | * appending a heap TID. |
364 | | */ |
365 | 0 | perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy); |
366 | |
367 | 0 | if (strategy == SPLIT_DEFAULT) |
368 | 0 | { |
369 | | /* |
370 | | * Default strategy worked out (always works out with internal page). |
371 | | * Original split interval still stands. |
372 | | */ |
373 | 0 | } |
374 | | |
375 | | /* |
376 | | * Many duplicates strategy is used when a heap TID would otherwise be |
377 | | * appended, but the page isn't completely full of logical duplicates. |
378 | | * |
379 | | * The split interval is widened to include all legal candidate split |
380 | | * points. There might be as few as two distinct values in the whole-page |
381 | | * split interval, though it's also possible that most of the values on |
382 | | * the page are unique. The final split point will either be to the |
383 | | * immediate left or to the immediate right of the group of duplicate |
384 | | * tuples that enclose the first/delta-optimal split point (perfect |
385 | | * penalty was set so that the lowest delta split point that avoids |
386 | | * appending a heap TID will be chosen). Maximizing the number of |
387 | | * attributes that can be truncated away is not a goal of the many |
388 | | * duplicates strategy. |
389 | | * |
390 | | * Single value strategy is used when it is impossible to avoid appending |
391 | | * a heap TID. It arranges to leave the left page very full. This |
392 | | * maximizes space utilization in cases where tuples with the same |
393 | | * attribute values span many pages. Newly inserted duplicates will tend |
394 | | * to have higher heap TID values, so we'll end up splitting to the right |
395 | | * consistently. (Single value strategy is harmless though not |
396 | | * particularly useful with !heapkeyspace indexes.) |
397 | | */ |
398 | 0 | else if (strategy == SPLIT_MANY_DUPLICATES) |
399 | 0 | { |
400 | 0 | Assert(state.is_leaf); |
401 | | /* Shouldn't try to truncate away extra user attributes */ |
402 | 0 | Assert(perfectpenalty == |
403 | 0 | IndexRelationGetNumberOfKeyAttributes(state.rel)); |
404 | | /* No need to resort splits -- no change in fillfactormult/deltas */ |
405 | 0 | state.interval = state.nsplits; |
406 | 0 | } |
407 | 0 | else if (strategy == SPLIT_SINGLE_VALUE) |
408 | 0 | { |
409 | 0 | Assert(state.is_leaf); |
410 | | /* Split near the end of the page */ |
411 | 0 | usemult = true; |
412 | 0 | fillfactormult = BTREE_SINGLEVAL_FILLFACTOR / 100.0; |
413 | | /* Resort split points with new delta */ |
414 | 0 | _bt_deltasortsplits(&state, fillfactormult, usemult); |
415 | | /* Appending a heap TID is unavoidable, so interval of 1 is fine */ |
416 | 0 | state.interval = 1; |
417 | 0 | } |
418 | | |
419 | | /* |
420 | | * Search among acceptable split points (using final split interval) for |
421 | | * the entry that has the lowest penalty, and is therefore expected to |
422 | | * maximize fan-out. Sets *newitemonleft for us. |
423 | | */ |
424 | 0 | firstrightoff = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft, |
425 | 0 | strategy); |
426 | 0 | pfree(state.splits); |
427 | |
428 | 0 | return firstrightoff; |
429 | 0 | } |
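
The heart of the routine above is enumerating every point between adjacent items on an imaginary page that already contains newitem, then preferring the point whose post-split free space is most balanced. Below is a minimal standalone sketch of just that core loop (not PostgreSQL code; the page size and item sizes are invented, and high-key accounting, fillfactor weighting, and suffix truncation are all ignored):

/* toy_findsplit.c -- not PostgreSQL code; sizes and names are invented */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SPACE 8192			/* usable bytes per post-split half (toy) */

int
main(void)
{
	/* item sizes on an imaginary page that already contains the new item */
	int		items[] = {400, 1200, 800, 800, 2400, 400, 900, 600};
	int		nitems = sizeof(items) / sizeof(items[0]);
	int		total = 0;
	int		bestsplit = -1;
	int		bestdelta = PAGE_SPACE;

	for (int i = 0; i < nitems; i++)
		total += items[i];

	/* split point i: items [0, i) go left, items [i, nitems) go right */
	for (int i = 1; i < nitems; i++)
	{
		int		lefttotal = 0;

		for (int j = 0; j < i; j++)
			lefttotal += items[j];

		int		leftfree = PAGE_SPACE - lefttotal;
		int		rightfree = PAGE_SPACE - (total - lefttotal);
		int		delta = abs(leftfree - rightfree);

		/* keep the legal split with the most balanced free space */
		if (leftfree >= 0 && rightfree >= 0 && delta < bestdelta)
		{
			bestdelta = delta;
			bestsplit = i;
		}
	}

	printf("best split before item %d (delta %d)\n", bestsplit, bestdelta);
	return 0;
}
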
430 | | |
431 | | /* |
432 | | * Subroutine to record a particular point between two tuples (possibly the |
433 | | * new item) on page (ie, combination of firstrightoff and newitemonleft |
434 | | * settings) in *state for later analysis. This is also a convenient point to |
435 | | * check if the split is legal (if it isn't, it won't be recorded). |
436 | | * |
437 | | * firstrightoff is the offset of the first item on the original page that |
438 | | * goes to the right page, and firstrightofforigpagetuplesz is the size of |
439 | | * that tuple. firstrightoff can be > max offset, which means that all the |
440 | | * old items go to the left page and only the new item goes to the right page. |
441 | | * We don't actually use firstrightofforigpagetuplesz in that case (actually, |
442 | | * we don't use it for _any_ split where the firstright tuple happens to be |
443 | | * newitem). |
444 | | * |
445 | | * olddataitemstoleft is the total size of all old items to the left of the |
446 | | * split point that is recorded here when legal. Should not include |
447 | | * newitemsz, since that is handled here. |
448 | | */ |
449 | | static void |
450 | | _bt_recsplitloc(FindSplitData *state, |
451 | | OffsetNumber firstrightoff, |
452 | | bool newitemonleft, |
453 | | int olddataitemstoleft, |
454 | | Size firstrightofforigpagetuplesz) |
455 | 0 | { |
456 | 0 | int16 leftfree, |
457 | 0 | rightfree; |
458 | 0 | Size firstrightsz; |
459 | 0 | Size postingsz = 0; |
460 | 0 | bool newitemisfirstright; |
461 | | |
462 | | /* Is the new item going to be split point's firstright tuple? */ |
463 | 0 | newitemisfirstright = (firstrightoff == state->newitemoff && |
464 | 0 | !newitemonleft); |
465 | |
466 | 0 | if (newitemisfirstright) |
467 | 0 | firstrightsz = state->newitemsz; |
468 | 0 | else |
469 | 0 | { |
470 | 0 | firstrightsz = firstrightofforigpagetuplesz; |
471 | | |
472 | | /* |
473 | | * Calculate suffix truncation space saving when firstright tuple is a |
474 | | * posting list tuple, though only when the tuple is over 64 bytes |
475 | | * including line pointer overhead (arbitrary). This avoids accessing |
476 | | * the tuple in cases where its posting list must be very small (if |
477 | | * tuple has one at all). |
478 | | * |
479 | | * Note: We don't do this in the case where firstright tuple is |
480 | | * newitem, since newitem cannot have a posting list. |
481 | | */ |
482 | 0 | if (state->is_leaf && firstrightsz > 64) |
483 | 0 | { |
484 | 0 | ItemId itemid; |
485 | 0 | IndexTuple newhighkey; |
486 | |
487 | 0 | itemid = PageGetItemId(state->origpage, firstrightoff); |
488 | 0 | newhighkey = (IndexTuple) PageGetItem(state->origpage, itemid); |
489 | |
490 | 0 | if (BTreeTupleIsPosting(newhighkey)) |
491 | 0 | postingsz = IndexTupleSize(newhighkey) - |
492 | 0 | BTreeTupleGetPostingOffset(newhighkey); |
493 | 0 | } |
494 | 0 | } |
495 | | |
496 | | /* Account for all the old tuples */ |
497 | 0 | leftfree = state->leftspace - olddataitemstoleft; |
498 | 0 | rightfree = state->rightspace - |
499 | 0 | (state->olddataitemstotal - olddataitemstoleft); |
500 | | |
501 | | /* |
502 | | * The first item on the right page becomes the high key of the left page; |
503 | | * therefore it counts against left space as well as right space (we |
504 | | * cannot assume that suffix truncation will make it any smaller). When |
505 | | * the index has included attributes, those attributes of the left page's |
506 | | * high key will be truncated, leaving that page with slightly more free space. |
507 | | * However, that shouldn't affect our ability to find valid split |
508 | | * location, since we err in the direction of being pessimistic about free |
509 | | * space on the left half. Besides, even when suffix truncation of |
510 | | * non-TID attributes occurs, the new high key often won't even be a |
511 | | * single MAXALIGN() quantum smaller than the firstright tuple it's based |
512 | | * on. |
513 | | * |
514 | | * If we are on the leaf level, assume that suffix truncation cannot avoid |
515 | | * adding a heap TID to the left half's new high key. In practice the new |
516 | | * high key will often be smaller and will rarely be larger, but |
517 | | * conservatively assume the worst case. We do |
518 | | * go to the trouble of subtracting away posting list overhead, though |
519 | | * only when it looks like it will make an appreciable difference. |
520 | | * (Posting lists are the only case where truncation will typically make |
521 | | * the final high key far smaller than firstright, so being a bit more |
522 | | * precise there noticeably improves the balance of free space.) |
523 | | */ |
524 | 0 | if (state->is_leaf) |
525 | 0 | leftfree -= (int16) (firstrightsz + |
526 | 0 | MAXALIGN(sizeof(ItemPointerData)) - |
527 | 0 | postingsz); |
528 | 0 | else |
529 | 0 | leftfree -= (int16) firstrightsz; |
530 | | |
531 | | /* account for the new item */ |
532 | 0 | if (newitemonleft) |
533 | 0 | leftfree -= (int16) state->newitemsz; |
534 | 0 | else |
535 | 0 | rightfree -= (int16) state->newitemsz; |
536 | | |
537 | | /* |
538 | | * If we are not on the leaf level, we will be able to discard the key |
539 | | * data from the first item that winds up on the right page. |
540 | | */ |
541 | 0 | if (!state->is_leaf) |
542 | 0 | rightfree += (int16) firstrightsz - |
543 | 0 | (int16) (MAXALIGN(sizeof(IndexTupleData)) + sizeof(ItemIdData)); |
544 | | |
545 | | /* Record split if legal */ |
546 | 0 | if (leftfree >= 0 && rightfree >= 0) |
547 | 0 | { |
548 | 0 | Assert(state->nsplits < state->maxsplits); |
549 | | |
550 | | /* Determine smallest firstright tuple size among legal splits */ |
551 | 0 | state->minfirstrightsz = Min(state->minfirstrightsz, firstrightsz); |
552 | |
553 | 0 | state->splits[state->nsplits].curdelta = 0; |
554 | 0 | state->splits[state->nsplits].leftfree = leftfree; |
555 | 0 | state->splits[state->nsplits].rightfree = rightfree; |
556 | 0 | state->splits[state->nsplits].firstrightoff = firstrightoff; |
557 | 0 | state->splits[state->nsplits].newitemonleft = newitemonleft; |
558 | 0 | state->nsplits++; |
559 | 0 | } |
560 | 0 | } |
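
To make the space accounting concrete, here is a worked example with invented numbers (not PostgreSQL code): the leaf-page branch charges the left half for the firstright tuple plus a pessimistic MAXALIGN'd heap TID, then charges whichever half receives the new item.

/* toy_recsplit.c -- not PostgreSQL code; every number below is invented */
#include <stdio.h>

#define PAGESPACE		8140	/* toy: page size minus fixed overhead */
#define PESSIMISTIC_TID	8		/* stand-in for MAXALIGN(sizeof(ItemPointerData)) */

int
main(void)
{
	int		olddataitemstotal = 6000;	/* all old items, incl. line pointers */
	int		olddataitemstoleft = 2800;	/* old items left of the split point */
	int		firstrightsz = 32;			/* firstright tuple + line pointer */
	int		newitemsz = 32;				/* new item + line pointer */
	int		newitemonleft = 1;

	/* account for all the old tuples */
	int		leftfree = PAGESPACE - olddataitemstoleft;
	int		rightfree = PAGESPACE - (olddataitemstotal - olddataitemstoleft);

	/* leaf case: firstright becomes the left high key; add a worst-case TID */
	leftfree -= firstrightsz + PESSIMISTIC_TID;

	/* charge whichever half receives the new item */
	if (newitemonleft)
		leftfree -= newitemsz;
	else
		rightfree -= newitemsz;

	printf("leftfree=%d rightfree=%d legal=%s\n",
		   leftfree, rightfree,
		   (leftfree >= 0 && rightfree >= 0) ? "yes" : "no");
	return 0;
}
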
561 | | |
562 | | /* |
563 | | * Subroutine to assign space deltas to materialized array of candidate split |
564 | | * points based on current fillfactor, and to sort array using that fillfactor |
565 | | */ |
566 | | static void |
567 | | _bt_deltasortsplits(FindSplitData *state, double fillfactormult, |
568 | | bool usemult) |
569 | 0 | { |
570 | 0 | for (int i = 0; i < state->nsplits; i++) |
571 | 0 | { |
572 | 0 | SplitPoint *split = state->splits + i; |
573 | 0 | int16 delta; |
574 | |
575 | 0 | if (usemult) |
576 | 0 | delta = fillfactormult * split->leftfree - |
577 | 0 | (1.0 - fillfactormult) * split->rightfree; |
578 | 0 | else |
579 | 0 | delta = split->leftfree - split->rightfree; |
580 | |
581 | 0 | if (delta < 0) |
582 | 0 | delta = -delta; |
583 | | |
584 | | /* Save delta */ |
585 | 0 | split->curdelta = delta; |
586 | 0 | } |
587 | |
588 | 0 | qsort(state->splits, state->nsplits, sizeof(SplitPoint), _bt_splitcmp); |
589 | 0 | } |
590 | | |
591 | | /* |
592 | | * qsort-style comparator used by _bt_deltasortsplits() |
593 | | */ |
594 | | static int |
595 | | _bt_splitcmp(const void *arg1, const void *arg2) |
596 | 0 | { |
597 | 0 | SplitPoint *split1 = (SplitPoint *) arg1; |
598 | 0 | SplitPoint *split2 = (SplitPoint *) arg2; |
599 | |
600 | 0 | return pg_cmp_s16(split1->curdelta, split2->curdelta); |
601 | 0 | } |
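
The effect of the weighted delta is easy to check in isolation. In this toy sketch (not PostgreSQL code; the free-space numbers are invented, and a generic comparator stands in for pg_cmp_s16), fillfactormult = 0.9 drives the delta toward zero as leftfree:rightfree approaches 1:9, so the split that leaves the left page about 90% full sorts first, as on a rightmost page:

/* toy_deltasort.c -- not PostgreSQL code; free-space numbers are invented */
#include <stdio.h>
#include <stdlib.h>

typedef struct
{
	int		leftfree;
	int		rightfree;
	int		curdelta;
} ToySplit;

static int
toy_splitcmp(const void *arg1, const void *arg2)
{
	const ToySplit *s1 = arg1;
	const ToySplit *s2 = arg2;

	return (s1->curdelta > s2->curdelta) - (s1->curdelta < s2->curdelta);
}

int
main(void)
{
	ToySplit	splits[] = {{4000, 400, 0}, {2200, 2200, 0}, {400, 4000, 0}};
	int			nsplits = 3;
	double		fillfactormult = 0.9;	/* e.g. rightmost leaf page split */

	for (int i = 0; i < nsplits; i++)
	{
		int		delta = (int) (fillfactormult * splits[i].leftfree -
							   (1.0 - fillfactormult) * splits[i].rightfree);

		splits[i].curdelta = abs(delta);
	}

	qsort(splits, nsplits, sizeof(ToySplit), toy_splitcmp);

	/* lowest delta first: {400, 4000} wins, leaving the left page ~90% full */
	for (int i = 0; i < nsplits; i++)
		printf("leftfree=%d rightfree=%d delta=%d\n",
			   splits[i].leftfree, splits[i].rightfree, splits[i].curdelta);
	return 0;
}
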
602 | | |
603 | | /* |
604 | | * Subroutine to determine whether or not a non-rightmost leaf page should be |
605 | | * split immediately after the would-be original page offset for the |
606 | | * new/incoming tuple (or should have leaf fillfactor applied when new item is |
607 | | * to the right on original page). This is appropriate when there is a |
608 | | * pattern of localized monotonically increasing insertions into a composite |
609 | | * index, where leading attribute values form local groupings, and we |
610 | | * anticipate further insertions of the same/current grouping (new item's |
611 | | * grouping) in the near future. This can be thought of as a variation on |
612 | | * applying leaf fillfactor during rightmost leaf page splits, since cases |
613 | | * that benefit will converge on packing leaf pages leaffillfactor% full over |
614 | | * time. |
615 | | * |
616 | | * We may leave extra free space remaining on the rightmost page of a "most |
617 | | * significant column" grouping of tuples if that grouping never ends up |
618 | | * having future insertions that use the free space. That effect is |
619 | | * self-limiting; a future grouping that becomes the "nearest on the right" |
620 | | * grouping of the affected grouping usually puts the extra free space to good |
621 | | * use. |
622 | | * |
623 | | * Caller uses optimization when routine returns true, though the exact action |
624 | | * taken by caller varies. Caller uses original leaf page fillfactor in |
625 | | * standard way rather than using the new item offset directly when *usemult |
626 | | * was also set to true here. Otherwise, caller applies optimization by |
627 | | * locating the legal split point that makes the new tuple the lastleft tuple |
628 | | * for the split. |
629 | | */ |
630 | | static bool |
631 | | _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, |
632 | | int leaffillfactor, bool *usemult) |
633 | 0 | { |
634 | 0 | int16 nkeyatts; |
635 | 0 | ItemId itemid; |
636 | 0 | IndexTuple tup; |
637 | 0 | int keepnatts; |
638 | |
639 | 0 | Assert(state->is_leaf && !state->is_rightmost); |
640 | |
641 | 0 | nkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); |
642 | | |
643 | | /* Single key indexes not considered here */ |
644 | 0 | if (nkeyatts == 1) |
645 | 0 | return false; |
646 | | |
647 | | /* Ascending insertion pattern never inferred when new item is first */ |
648 | 0 | if (state->newitemoff == P_FIRSTKEY) |
649 | 0 | return false; |
650 | | |
651 | | /* |
652 | | * Only apply optimization on pages with equisized tuples, since ordinal |
653 | | * keys are likely to be fixed-width. Testing if the new tuple is |
654 | | * variable width directly might also work, but that fails to apply the |
655 | | * optimization to indexes with a numeric_ops attribute. |
656 | | * |
657 | | * Conclude that page has equisized tuples when the new item is the same |
658 | | * width as the smallest item observed during pass over page, and other |
659 | | * non-pivot tuples must be the same width as well. (Note that the |
660 | | * possibly-truncated existing high key isn't counted in |
661 | | * olddataitemstotal, and must be subtracted from maxoff.) |
662 | | */ |
663 | 0 | if (state->newitemsz != state->minfirstrightsz) |
664 | 0 | return false; |
665 | 0 | if (state->newitemsz * (maxoff - 1) != state->olddataitemstotal) |
666 | 0 | return false; |
667 | | |
668 | | /* |
669 | | * Avoid applying optimization when tuples are wider than a tuple |
670 | | * consisting of two non-NULL int8/int64 attributes (or four non-NULL |
671 | | * int4/int32 attributes) |
672 | | */ |
673 | 0 | if (state->newitemsz > |
674 | 0 | MAXALIGN(sizeof(IndexTupleData) + sizeof(int64) * 2) + |
675 | 0 | sizeof(ItemIdData)) |
676 | 0 | return false; |
677 | | |
678 | | /* |
679 | | * At least the first attribute's value must be equal to the corresponding |
680 | | * value in previous tuple to apply optimization. New item cannot be a |
681 | | * duplicate, either. |
682 | | * |
683 | | * Handle case where new item is to the right of all items on the existing |
684 | | * page. This is suggestive of monotonically increasing insertions in |
685 | | * itself, so the "heap TID adjacency" test is not applied here. |
686 | | */ |
687 | 0 | if (state->newitemoff > maxoff) |
688 | 0 | { |
689 | 0 | itemid = PageGetItemId(state->origpage, maxoff); |
690 | 0 | tup = (IndexTuple) PageGetItem(state->origpage, itemid); |
691 | 0 | keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); |
692 | |
693 | 0 | if (keepnatts > 1 && keepnatts <= nkeyatts) |
694 | 0 | { |
695 | 0 | *usemult = true; |
696 | 0 | return true; |
697 | 0 | } |
698 | | |
699 | 0 | return false; |
700 | 0 | } |
701 | | |
702 | | /* |
703 | | * "Low cardinality leading column, high cardinality suffix column" |
704 | | * indexes with a random insertion pattern (e.g., an index with a boolean |
705 | | * column, such as an index on '(book_is_in_print, book_isbn)') present us |
706 | | * with a risk of consistently misapplying the optimization. We're |
707 | | * willing to accept very occasional misapplication of the optimization, |
708 | | * provided the cases where we get it wrong are rare and self-limiting. |
709 | | * |
710 | | * Heap TID adjacency strongly suggests that the item just to the left was |
711 | | * inserted very recently, which limits overapplication of the |
712 | | * optimization. Besides, all inappropriate cases triggered here will |
713 | | * still split in the middle of the page on average. |
714 | | */ |
715 | 0 | itemid = PageGetItemId(state->origpage, OffsetNumberPrev(state->newitemoff)); |
716 | 0 | tup = (IndexTuple) PageGetItem(state->origpage, itemid); |
717 | | /* Do cheaper test first */ |
718 | 0 | if (BTreeTupleIsPosting(tup) || |
719 | 0 | !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) |
720 | 0 | return false; |
721 | | /* Check same conditions as rightmost item case, too */ |
722 | 0 | keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); |
723 | |
724 | 0 | if (keepnatts > 1 && keepnatts <= nkeyatts) |
725 | 0 | { |
726 | 0 | double interp = (double) state->newitemoff / ((double) maxoff + 1); |
727 | 0 | double leaffillfactormult = (double) leaffillfactor / 100.0; |
728 | | |
729 | | /* |
730 | | * Don't allow caller to split after a new item when it will result in |
731 | | * a split point to the right of the point that a leaf fillfactor |
732 | | * split would use -- have caller apply leaf fillfactor instead |
733 | | */ |
734 | 0 | *usemult = interp > leaffillfactormult; |
735 | |
736 | 0 | return true; |
737 | 0 | } |
738 | | |
739 | 0 | return false; |
740 | 0 | } |
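
The closing *usemult decision interpolates the new item's relative position on the page and compares it against the leaf fillfactor. A toy illustration with invented offsets (not PostgreSQL code):

/* toy_afternewitem.c -- not PostgreSQL code; offsets are invented */
#include <stdbool.h>
#include <stdio.h>

int
main(void)
{
	int		maxoff = 200;			/* items already on the page */
	int		leaffillfactor = 90;	/* default leaf fillfactor */
	double	leaffillfactormult = leaffillfactor / 100.0;

	for (int newitemoff = 60; newitemoff <= 200; newitemoff += 70)
	{
		/* interpolate the new item's relative position on the page */
		double	interp = (double) newitemoff / ((double) maxoff + 1);
		bool	usemult = interp > leaffillfactormult;

		printf("newitemoff=%3d interp=%.2f -> %s\n", newitemoff, interp,
			   usemult ? "apply leaf fillfactor" : "split after new item");
	}
	return 0;
}
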
741 | | |
742 | | /* |
743 | | * Subroutine for determining if two heap TIDs are "adjacent". |
744 | | * |
745 | | * Adjacent means that the high TID is very likely to have been inserted into |
746 | | * heap relation immediately after the low TID, probably during the current |
747 | | * transaction. |
748 | | */ |
749 | | static bool |
750 | | _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid) |
751 | 0 | { |
752 | 0 | BlockNumber lowblk, |
753 | 0 | highblk; |
754 | |
755 | 0 | lowblk = ItemPointerGetBlockNumber(lowhtid); |
756 | 0 | highblk = ItemPointerGetBlockNumber(highhtid); |
757 | | |
758 | | /* Make optimistic assumption of adjacency when heap blocks match */ |
759 | 0 | if (lowblk == highblk) |
760 | 0 | return true; |
761 | | |
762 | | /* When heap block one up, second offset should be FirstOffsetNumber */ |
763 | 0 | if (lowblk + 1 == highblk && |
764 | 0 | ItemPointerGetOffsetNumber(highhtid) == FirstOffsetNumber) |
765 | 0 | return true; |
766 | | |
767 | 0 | return false; |
768 | 0 | } |
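
The adjacency test needs nothing from nbtree and can be restated over plain (block, offset) pairs. A self-contained sketch (not PostgreSQL code), with 1-based offsets standing in for OffsetNumber:

/* toy_adjacenthtid.c -- not PostgreSQL code; a TID is just (block, offset) */
#include <stdbool.h>
#include <stdio.h>

typedef struct
{
	unsigned	block;
	unsigned	offset;			/* 1-based, like OffsetNumber */
} ToyTid;

static bool
toy_adjacent(ToyTid low, ToyTid high)
{
	/* same heap block: optimistically assume adjacency */
	if (low.block == high.block)
		return true;

	/* next heap block: adjacent only when high is that block's first item */
	if (low.block + 1 == high.block && high.offset == 1)
		return true;

	return false;
}

int
main(void)
{
	ToyTid	a = {10, 42},
			b = {10, 43},
			c = {11, 1},
			d = {11, 7};

	printf("%d %d %d\n",
		   toy_adjacent(a, b),	/* 1: same block */
		   toy_adjacent(a, c),	/* 1: next block, first offset */
		   toy_adjacent(a, d));	/* 0: next block, mid-block offset */
	return 0;
}
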
769 | | |
770 | | /* |
771 | | * Subroutine to find the "best" split point among candidate split points. |
772 | | * The best split point is the split point with the lowest penalty among split |
773 | | * points that fall within current/final split interval. Penalty is an |
774 | | * abstract score, with a definition that varies depending on whether we're |
775 | | * splitting a leaf page or an internal page. See _bt_split_penalty() for |
776 | | * details. |
777 | | * |
778 | | * "perfectpenalty" is assumed to be the lowest possible penalty among |
779 | | * candidate split points. This allows us to return early without wasting |
780 | | * cycles on calculating the first differing attribute for all candidate |
781 | | * splits when that clearly cannot improve our choice (or when we only want a |
782 | | * minimally distinguishing split point, and don't want to make the split any |
783 | | * more unbalanced than is necessary). |
784 | | * |
785 | | * We return the index of the first existing tuple that should go on the right |
786 | | * page, plus a boolean indicating if new item is on left of split point. |
787 | | */ |
788 | | static OffsetNumber |
789 | | _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, |
790 | | bool *newitemonleft, FindSplitStrat strategy) |
791 | 0 | { |
792 | 0 | int bestpenalty, |
793 | 0 | lowsplit; |
794 | 0 | int highsplit = Min(state->interval, state->nsplits); |
795 | 0 | SplitPoint *final; |
796 | |
797 | 0 | bestpenalty = INT_MAX; |
798 | 0 | lowsplit = 0; |
799 | 0 | for (int i = lowsplit; i < highsplit; i++) |
800 | 0 | { |
801 | 0 | int penalty; |
802 | |
803 | 0 | penalty = _bt_split_penalty(state, state->splits + i); |
804 | |
805 | 0 | if (penalty < bestpenalty) |
806 | 0 | { |
807 | 0 | bestpenalty = penalty; |
808 | 0 | lowsplit = i; |
809 | 0 | } |
810 | |
811 | 0 | if (penalty <= perfectpenalty) |
812 | 0 | break; |
813 | 0 | } |
814 | |
815 | 0 | final = &state->splits[lowsplit]; |
816 | | |
817 | | /* |
818 | | * There is a risk that the "many duplicates" strategy will repeatedly do |
819 | | * the wrong thing when there are monotonically decreasing insertions to |
820 | | * the right of a large group of duplicates. Repeated splits could leave |
821 | | * a succession of right half pages with free space that can never be |
822 | | * used. This must be avoided. |
823 | | * |
824 | | * Consider the example of the leftmost page in a single integer attribute |
825 | | * NULLS FIRST index which is almost filled with NULLs. Monotonically |
826 | | * decreasing integer insertions might cause the same leftmost page to |
827 | | * split repeatedly at the same point. Each split derives its new high |
828 | | * key from the lowest current value to the immediate right of the large |
829 | | * group of NULLs, which will always be higher than all future integer |
830 | | * insertions, directing all future integer insertions to the same |
831 | | * leftmost page. |
832 | | */ |
833 | 0 | if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost && |
834 | 0 | !final->newitemonleft && final->firstrightoff >= state->newitemoff && |
835 | 0 | final->firstrightoff < state->newitemoff + 9) |
836 | 0 | { |
837 | | /* |
838 | | * Avoid the problem by performing a 50:50 split when the new item is |
839 | | * just to the right of the would-be "many duplicates" split point. |
840 | | * (Note that the test used for an insert that is "just to the right" |
841 | | * of the split point is conservative.) |
842 | | */ |
843 | 0 | final = &state->splits[0]; |
844 | 0 | } |
845 | |
846 | 0 | *newitemonleft = final->newitemonleft; |
847 | 0 | return final->firstrightoff; |
848 | 0 | } |
849 | | |
850 | 0 | #define LEAF_SPLIT_DISTANCE 0.050 |
851 | 0 | #define INTERNAL_SPLIT_DISTANCE 0.075 |
852 | | |
853 | | /* |
854 | | * Return a split interval to use for the default strategy. This is a limit |
855 | | * on the number of candidate split points to give further consideration to. |
856 | | * Only a fraction of all candidate split points (those located at the start |
857 | | * of the now-sorted splits array) fall within the split interval. Split |
858 | | * interval is applied within _bt_bestsplitloc(). |
859 | | * |
860 | | * Split interval represents an acceptable range of split points -- those that |
861 | | * have leftfree and rightfree values that are acceptably balanced. The final |
862 | | * split point chosen is the split point with the lowest "penalty" among split |
863 | | * points in this split interval (unless we change our entire strategy, in |
864 | | * which case the interval also changes -- see _bt_strategy()). |
865 | | * |
866 | | * The "Prefix B-Trees" paper calls split interval sigma l for leaf splits, |
867 | | * and sigma b for internal ("branch") splits. It's hard to provide a |
868 | | * theoretical justification for the size of the split interval, though it's |
869 | | * clear that a small split interval can make tuples on level L+1 much smaller |
870 | | * on average, without noticeably affecting space utilization on level L. |
871 | | * (Note that the way that we calculate split interval might need to change if |
872 | | * suffix truncation is taught to truncate tuples "within" the last |
873 | | * attribute/datum for data types like text, which is more or less how it is |
874 | | * assumed to work in the paper.) |
875 | | */ |
876 | | static int |
877 | | _bt_defaultinterval(FindSplitData *state) |
878 | 0 | { |
879 | 0 | SplitPoint *spaceoptimal; |
880 | 0 | int16 tolerance, |
881 | 0 | lowleftfree, |
882 | 0 | lowrightfree, |
883 | 0 | highleftfree, |
884 | 0 | highrightfree; |
885 | | |
886 | | /* |
887 | | * Determine leftfree and rightfree values that are higher and lower than |
888 | | * we're willing to tolerate. Note that the final split interval will be |
889 | | * about 10% of nsplits in the common case where all non-pivot tuples |
890 | | * (data items) from a leaf page are uniformly sized. We're a bit more |
891 | | * aggressive when splitting internal pages. |
892 | | */ |
893 | 0 | if (state->is_leaf) |
894 | 0 | tolerance = state->olddataitemstotal * LEAF_SPLIT_DISTANCE; |
895 | 0 | else |
896 | 0 | tolerance = state->olddataitemstotal * INTERNAL_SPLIT_DISTANCE; |
897 | | |
898 | | /* First candidate split point is the most evenly balanced */ |
899 | 0 | spaceoptimal = state->splits; |
900 | 0 | lowleftfree = spaceoptimal->leftfree - tolerance; |
901 | 0 | lowrightfree = spaceoptimal->rightfree - tolerance; |
902 | 0 | highleftfree = spaceoptimal->leftfree + tolerance; |
903 | 0 | highrightfree = spaceoptimal->rightfree + tolerance; |
904 | | |
905 | | /* |
906 | | * Iterate through split points, starting from the split immediately after |
907 | | * 'spaceoptimal'. Find the first split point that divides free space so |
908 | | * unevenly that including it in the split interval would be unacceptable. |
909 | | */ |
910 | 0 | for (int i = 1; i < state->nsplits; i++) |
911 | 0 | { |
912 | 0 | SplitPoint *split = state->splits + i; |
913 | | |
914 | | /* Cannot use curdelta here, since its value is often weighted */ |
915 | 0 | if (split->leftfree < lowleftfree || split->rightfree < lowrightfree || |
916 | 0 | split->leftfree > highleftfree || split->rightfree > highrightfree) |
917 | 0 | return i; |
918 | 0 | } |
919 | | |
920 | 0 | return state->nsplits; |
921 | 0 | } |
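
With the splits array already sorted by delta, the interval computation reduces to a tolerance band around the space-optimal split at index zero. A toy run with invented numbers (not PostgreSQL code):

/* toy_interval.c -- not PostgreSQL code; splits[] is invented and assumed
 * to be sorted by delta already, as after _bt_deltasortsplits() */
#include <stdio.h>

typedef struct
{
	int		leftfree;
	int		rightfree;
} ToySplit;

int
main(void)
{
	ToySplit	splits[] = {{2000, 2010}, {2100, 1910}, {1880, 2130}, {2600, 1400}};
	int			nsplits = 4;
	int			olddataitemstotal = 6000;
	int			tolerance = olddataitemstotal * 0.050;	/* LEAF_SPLIT_DISTANCE */
	int			interval = nsplits;

	/* stop at the first split that strays too far from the optimal one */
	for (int i = 1; i < nsplits; i++)
	{
		if (splits[i].leftfree < splits[0].leftfree - tolerance ||
			splits[i].rightfree < splits[0].rightfree - tolerance ||
			splits[i].leftfree > splits[0].leftfree + tolerance ||
			splits[i].rightfree > splits[0].rightfree + tolerance)
		{
			interval = i;
			break;
		}
	}

	printf("tolerance=%d interval=%d\n", tolerance, interval);	/* 300, 3 */
	return 0;
}
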
922 | | |
923 | | /* |
924 | | * Subroutine to decide whether split should use default strategy/initial |
925 | | * split interval, or whether it should finish splitting the page using |
926 | | * alternative strategies (this is only possible with leaf pages). |
927 | | * |
928 | | * Caller uses alternative strategy (or sticks with default strategy) based |
929 | | * on how *strategy is set here. Return value is "perfect penalty", which is |
930 | | * passed to _bt_bestsplitloc() as a final constraint on how far caller is |
931 | | * willing to go to avoid appending a heap TID when using the many duplicates |
932 | | * strategy (it also saves _bt_bestsplitloc() useless cycles). |
933 | | */ |
934 | | static int |
935 | | _bt_strategy(FindSplitData *state, SplitPoint *leftpage, |
936 | | SplitPoint *rightpage, FindSplitStrat *strategy) |
937 | 0 | { |
938 | 0 | IndexTuple leftmost, |
939 | 0 | rightmost; |
940 | 0 | SplitPoint *leftinterval, |
941 | 0 | *rightinterval; |
942 | 0 | int perfectpenalty; |
943 | 0 | int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); |
944 | | |
945 | | /* Assume that alternative strategy won't be used for now */ |
946 | 0 | *strategy = SPLIT_DEFAULT; |
947 | | |
948 | | /* |
949 | | * Use smallest observed firstright item size for entire page (actually, |
950 | | * entire imaginary version of page that includes newitem) as perfect |
951 | | * penalty on internal pages. This can save cycles in the common case |
952 | | * where most or all splits (not just splits within interval) have |
953 | | * firstright tuples that are the same size. |
954 | | */ |
955 | 0 | if (!state->is_leaf) |
956 | 0 | return state->minfirstrightsz; |
957 | | |
958 | | /* |
959 | | * Use leftmost and rightmost tuples from leftmost and rightmost splits in |
960 | | * current split interval |
961 | | */ |
962 | 0 | _bt_interval_edges(state, &leftinterval, &rightinterval); |
963 | 0 | leftmost = _bt_split_lastleft(state, leftinterval); |
964 | 0 | rightmost = _bt_split_firstright(state, rightinterval); |
965 | | |
966 | | /* |
967 | | * If initial split interval can produce a split point that will at least |
968 | | * avoid appending a heap TID in new high key, we're done. Finish split |
969 | | * with default strategy and initial split interval. |
970 | | */ |
971 | 0 | perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); |
972 | 0 | if (perfectpenalty <= indnkeyatts) |
973 | 0 | return perfectpenalty; |
974 | | |
975 | | /* |
976 | | * Work out how caller should finish split when even their "perfect" |
977 | | * penalty for initial/default split interval indicates that the interval |
978 | | * does not contain even a single split that avoids appending a heap TID. |
979 | | * |
980 | | * Use the leftmost split's lastleft tuple and the rightmost split's |
981 | | * firstright tuple to assess every possible split. |
982 | | */ |
983 | 0 | leftmost = _bt_split_lastleft(state, leftpage); |
984 | 0 | rightmost = _bt_split_firstright(state, rightpage); |
985 | | |
986 | | /* |
987 | | * If page (including new item) has many duplicates but is not entirely |
988 | | * full of duplicates, a many duplicates strategy split will be performed. |
989 | | * If page is entirely full of duplicates, a single value strategy split |
990 | | * will be performed. |
991 | | */ |
992 | 0 | perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); |
993 | 0 | if (perfectpenalty <= indnkeyatts) |
994 | 0 | { |
995 | 0 | *strategy = SPLIT_MANY_DUPLICATES; |
996 | | |
997 | | /* |
998 | | * Many duplicates strategy should split at either side of the group of |
999 | | * duplicates that enclose the delta-optimal split point. Return |
1000 | | * indnkeyatts rather than the true perfect penalty to make that |
1001 | | * happen. (If perfectpenalty was returned here then low cardinality |
1002 | | * composite indexes could have continual unbalanced splits.) |
1003 | | * |
1004 | | * Note that caller won't go through with a many duplicates split in |
1005 | | * rare cases where it looks like there are ever-decreasing insertions |
1006 | | * to the immediate right of the split point. This must happen just |
1007 | | * before a final decision is made, within _bt_bestsplitloc(). |
1008 | | */ |
1009 | 0 | return indnkeyatts; |
1010 | 0 | } |
1011 | | |
1012 | | /* |
1013 | | * Single value strategy is only appropriate with ever-increasing heap |
1014 | | * TIDs; otherwise, original default strategy split should proceed to |
1015 | | * avoid pathological performance. Use page high key to infer if this is |
1016 | | * the rightmost page among pages that store the same duplicate value. |
1017 | | * This should not prevent insertions of heap TIDs that are slightly out |
1018 | | * of order from using single value strategy, since that's expected with |
1019 | | * concurrent inserters of the same duplicate value. |
1020 | | */ |
1021 | 0 | else if (state->is_rightmost) |
1022 | 0 | *strategy = SPLIT_SINGLE_VALUE; |
1023 | 0 | else |
1024 | 0 | { |
1025 | 0 | ItemId itemid; |
1026 | 0 | IndexTuple hikey; |
1027 | |
1028 | 0 | itemid = PageGetItemId(state->origpage, P_HIKEY); |
1029 | 0 | hikey = (IndexTuple) PageGetItem(state->origpage, itemid); |
1030 | 0 | perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, |
1031 | 0 | state->newitem); |
1032 | 0 | if (perfectpenalty <= indnkeyatts) |
1033 | 0 | *strategy = SPLIT_SINGLE_VALUE; |
1034 | 0 | else |
1035 | 0 | { |
1036 | | /* |
1037 | | * Have caller finish split using default strategy, since page |
1038 | | * does not appear to be the rightmost page for duplicates of the |
1039 | | * value the page is filled with |
1040 | | */ |
1041 | 0 | } |
1042 | 0 | } |
1043 | | |
1044 | 0 | return perfectpenalty; |
1045 | 0 | } |
1046 | | |
1047 | | /* |
1048 | | * Subroutine to locate leftmost and rightmost splits for current/default |
1049 | | * split interval. Note that it will be the same split iff there is only one |
1050 | | * split in interval. |
1051 | | */ |
1052 | | static void |
1053 | | _bt_interval_edges(FindSplitData *state, SplitPoint **leftinterval, |
1054 | | SplitPoint **rightinterval) |
1055 | 0 | { |
1056 | 0 | int highsplit = Min(state->interval, state->nsplits); |
1057 | 0 | SplitPoint *deltaoptimal; |
1058 | |
1059 | 0 | deltaoptimal = state->splits; |
1060 | 0 | *leftinterval = NULL; |
1061 | 0 | *rightinterval = NULL; |
1062 | | |
1063 | | /* |
1064 | | * Delta is an absolute distance to optimal split point, so both the |
1065 | | * leftmost and rightmost split point will usually be at the end of the |
1066 | | * array |
1067 | | */ |
1068 | 0 | for (int i = highsplit - 1; i >= 0; i--) |
1069 | 0 | { |
1070 | 0 | SplitPoint *distant = state->splits + i; |
1071 | |
1072 | 0 | if (distant->firstrightoff < deltaoptimal->firstrightoff) |
1073 | 0 | { |
1074 | 0 | if (*leftinterval == NULL) |
1075 | 0 | *leftinterval = distant; |
1076 | 0 | } |
1077 | 0 | else if (distant->firstrightoff > deltaoptimal->firstrightoff) |
1078 | 0 | { |
1079 | 0 | if (*rightinterval == NULL) |
1080 | 0 | *rightinterval = distant; |
1081 | 0 | } |
1082 | 0 | else if (!distant->newitemonleft && deltaoptimal->newitemonleft) |
1083 | 0 | { |
1084 | | /* |
1085 | | * "incoming tuple will become firstright" (distant) is to the |
1086 | | * left of "incoming tuple will become lastleft" (delta-optimal) |
1087 | | */ |
1088 | 0 | Assert(distant->firstrightoff == state->newitemoff); |
1089 | 0 | if (*leftinterval == NULL) |
1090 | 0 | *leftinterval = distant; |
1091 | 0 | } |
1092 | 0 | else if (distant->newitemonleft && !deltaoptimal->newitemonleft) |
1093 | 0 | { |
1094 | | /* |
1095 | | * "incoming tuple will become lastleft" (distant) is to the right |
1096 | | * of "incoming tuple will become firstright" (delta-optimal) |
1097 | | */ |
1098 | 0 | Assert(distant->firstrightoff == state->newitemoff); |
1099 | 0 | if (*rightinterval == NULL) |
1100 | 0 | *rightinterval = distant; |
1101 | 0 | } |
1102 | 0 | else |
1103 | 0 | { |
1104 | | /* There were only one or two splits in the initial split interval */ |
1105 | 0 | Assert(distant == deltaoptimal); |
1106 | 0 | if (*leftinterval == NULL) |
1107 | 0 | *leftinterval = distant; |
1108 | 0 | if (*rightinterval == NULL) |
1109 | 0 | *rightinterval = distant; |
1110 | 0 | } |
1111 | |
1112 | 0 | if (*leftinterval && *rightinterval) |
1113 | 0 | return; |
1114 | 0 | } |
1115 | | |
1116 | 0 | Assert(false); |
1117 | 0 | } |
1118 | | |
1119 | | /* |
1120 | | * Subroutine to find penalty for caller's candidate split point. |
1121 | | * |
1122 | | * On leaf pages, penalty is the attribute number that distinguishes each side |
1123 | | * of a split. It's the last attribute that needs to be included in new high |
1124 | | * key for left page. It can be greater than the number of key attributes in |
1125 | | * cases where a heap TID will need to be appended during truncation. |
1126 | | * |
1127 | | * On internal pages, penalty is simply the size of the firstright tuple for |
1128 | | * the split (including line pointer overhead). This tuple will become the |
1129 | | * new high key for the left page. |
1130 | | */ |
1131 | | static inline int |
1132 | | _bt_split_penalty(FindSplitData *state, SplitPoint *split) |
1133 | 0 | { |
1134 | 0 | IndexTuple lastleft; |
1135 | 0 | IndexTuple firstright; |
1136 | |
1137 | 0 | if (!state->is_leaf) |
1138 | 0 | { |
1139 | 0 | ItemId itemid; |
1140 | |
1141 | 0 | if (!split->newitemonleft && |
1142 | 0 | split->firstrightoff == state->newitemoff) |
1143 | 0 | return state->newitemsz; |
1144 | | |
1145 | 0 | itemid = PageGetItemId(state->origpage, split->firstrightoff); |
1146 | |
1147 | 0 | return MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData); |
1148 | 0 | } |
1149 | | |
1150 | 0 | lastleft = _bt_split_lastleft(state, split); |
1151 | 0 | firstright = _bt_split_firstright(state, split); |
1152 | |
1153 | 0 | return _bt_keep_natts_fast(state->rel, lastleft, firstright); |
1154 | 0 | } |
1155 | | |
1156 | | /* |
1157 | | * Subroutine to get a lastleft IndexTuple for a split point |
1158 | | */ |
1159 | | static inline IndexTuple |
1160 | | _bt_split_lastleft(FindSplitData *state, SplitPoint *split) |
1161 | 0 | { |
1162 | 0 | ItemId itemid; |
1163 | |
1164 | 0 | if (split->newitemonleft && split->firstrightoff == state->newitemoff) |
1165 | 0 | return state->newitem; |
1166 | | |
1167 | 0 | itemid = PageGetItemId(state->origpage, |
1168 | 0 | OffsetNumberPrev(split->firstrightoff)); |
1169 | 0 | return (IndexTuple) PageGetItem(state->origpage, itemid); |
1170 | 0 | } |
1171 | | |
1172 | | /* |
1173 | | * Subroutine to get a firstright IndexTuple for a split point |
1174 | | */ |
1175 | | static inline IndexTuple |
1176 | | _bt_split_firstright(FindSplitData *state, SplitPoint *split) |
1177 | 0 | { |
1178 | 0 | ItemId itemid; |
1179 | |
1180 | 0 | if (!split->newitemonleft && split->firstrightoff == state->newitemoff) |
1181 | 0 | return state->newitem; |
1182 | | |
1183 | 0 | itemid = PageGetItemId(state->origpage, split->firstrightoff); |
1184 | 0 | return (IndexTuple) PageGetItem(state->origpage, itemid); |
1185 | 0 | } |
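
On leaf pages, the penalty computed by _bt_split_penalty() is the _bt_keep_natts_fast() result for lastleft versus firstright: the number of leading key attributes that the new high key must keep to distinguish the two sides. A standalone analogue over all-integer toy tuples (not PostgreSQL code):

/* toy_penalty.c -- not PostgreSQL code; an all-integer stand-in for
 * _bt_keep_natts_fast() over a three-attribute toy tuple */
#include <stdio.h>

#define NKEYATTS 3

/* first attribute where the tuples differ (1-based); NKEYATTS + 1 if equal */
static int
toy_keep_natts(const int *lastleft, const int *firstright)
{
	for (int att = 0; att < NKEYATTS; att++)
	{
		if (lastleft[att] != firstright[att])
			return att + 1;
	}
	return NKEYATTS + 1;		/* full duplicates: a heap TID would be needed */
}

int
main(void)
{
	int		lastleft[NKEYATTS] = {7, 3, 9};
	int		firstright1[NKEYATTS] = {8, 0, 0};	/* differs on attribute 1 */
	int		firstright2[NKEYATTS] = {7, 3, 9};	/* full duplicate */

	printf("penalty=%d\n", toy_keep_natts(lastleft, firstright1));	/* 1 */
	printf("penalty=%d\n", toy_keep_natts(lastleft, firstright2));	/* 4 */
	return 0;
}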