/src/postgres/src/backend/access/nbtree/nbtutils.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * nbtutils.c |
4 | | * Utility code for Postgres btree implementation. |
5 | | * |
6 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/backend/access/nbtree/nbtutils.c |
12 | | * |
13 | | *------------------------------------------------------------------------- |
14 | | */ |
15 | | |
16 | | #include "postgres.h" |
17 | | |
18 | | #include <time.h> |
19 | | |
20 | | #include "access/nbtree.h" |
21 | | #include "access/reloptions.h" |
22 | | #include "commands/progress.h" |
23 | | #include "miscadmin.h" |
24 | | #include "utils/datum.h" |
25 | | #include "utils/lsyscache.h" |
26 | | |
27 | 0 | #define LOOK_AHEAD_REQUIRED_RECHECKS 3 |
28 | 0 | #define LOOK_AHEAD_DEFAULT_DISTANCE 5 |
29 | 0 | #define NSKIPADVANCES_THRESHOLD 3 |
30 | | |
31 | | static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, |
32 | | Datum tupdatum, bool tupnull, |
33 | | Datum arrdatum, ScanKey cur); |
34 | | static void _bt_binsrch_skiparray_skey(bool cur_elem_trig, ScanDirection dir, |
35 | | Datum tupdatum, bool tupnull, |
36 | | BTArrayKeyInfo *array, ScanKey cur, |
37 | | int32 *set_elem_result); |
38 | | static void _bt_skiparray_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, |
39 | | int32 set_elem_result, Datum tupdatum, bool tupnull); |
40 | | static void _bt_skiparray_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array); |
41 | | static void _bt_array_set_low_or_high(Relation rel, ScanKey skey, |
42 | | BTArrayKeyInfo *array, bool low_not_high); |
43 | | static bool _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array); |
44 | | static bool _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array); |
45 | | static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, |
46 | | bool *skip_array_set); |
47 | | static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir); |
48 | | static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, |
49 | | IndexTuple tuple, TupleDesc tupdesc, int tupnatts, |
50 | | bool readpagetup, int sktrig, bool *scanBehind); |
51 | | static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, |
52 | | IndexTuple tuple, int tupnatts, TupleDesc tupdesc, |
53 | | int sktrig, bool sktrig_required); |
54 | | #ifdef USE_ASSERT_CHECKING |
55 | | static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir); |
56 | | static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan); |
57 | | #endif |
58 | | static bool _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, |
59 | | IndexTuple finaltup); |
60 | | static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir, |
61 | | IndexTuple tuple, int tupnatts, TupleDesc tupdesc, |
62 | | bool advancenonrequired, bool forcenonrequired, |
63 | | bool *continuescan, int *ikey); |
64 | | static bool _bt_check_rowcompare(ScanKey skey, |
65 | | IndexTuple tuple, int tupnatts, TupleDesc tupdesc, |
66 | | ScanDirection dir, bool forcenonrequired, bool *continuescan); |
67 | | static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, |
68 | | int tupnatts, TupleDesc tupdesc); |
69 | | static int _bt_keep_natts(Relation rel, IndexTuple lastleft, |
70 | | IndexTuple firstright, BTScanInsert itup_key); |
71 | | |
72 | | |
73 | | /* |
74 | | * _bt_mkscankey |
75 | | * Build an insertion scan key that contains comparison data from itup |
76 | | * as well as comparator routines appropriate to the key datatypes. |
77 | | * |
78 | | * The result is intended for use with _bt_compare() and _bt_truncate(). |
79 | | * Callers that don't need to fill out the insertion scankey arguments |
80 | | * (e.g. they use an ad-hoc comparison routine, or only need a scankey |
81 | | * for _bt_truncate()) can pass a NULL index tuple. The scankey will |
82 | | * be initialized as if an "all truncated" pivot tuple was passed |
83 | | * instead. |
84 | | * |
85 | | * Note that we may occasionally have to share lock the metapage to |
86 | | * determine whether or not the keys in the index are expected to be |
87 | | * unique (i.e. if this is a "heapkeyspace" index). We assume a |
88 | | * heapkeyspace index when caller passes a NULL tuple, allowing index |
89 | | * build callers to avoid accessing the non-existent metapage. We |
90 | | * also assume that the index is _not_ allequalimage when a NULL tuple |
91 | | * is passed; CREATE INDEX callers call _bt_allequalimage() to set the |
92 | | * field themselves. |
93 | | */ |
94 | | BTScanInsert |
95 | | _bt_mkscankey(Relation rel, IndexTuple itup) |
96 | 0 | { |
97 | 0 | BTScanInsert key; |
98 | 0 | ScanKey skey; |
99 | 0 | TupleDesc itupdesc; |
100 | 0 | int indnkeyatts; |
101 | 0 | int16 *indoption; |
102 | 0 | int tupnatts; |
103 | 0 | int i; |
104 | |
105 | 0 | itupdesc = RelationGetDescr(rel); |
106 | 0 | indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); |
107 | 0 | indoption = rel->rd_indoption; |
108 | 0 | tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0; |
109 | |
110 | 0 | Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); |
111 | | |
112 | | /* |
113 | | * We'll execute search using scan key constructed on key columns. |
114 | | * Truncated attributes and non-key attributes are omitted from the final |
115 | | * scan key. |
116 | | */ |
117 | 0 | key = palloc(offsetof(BTScanInsertData, scankeys) + |
118 | 0 | sizeof(ScanKeyData) * indnkeyatts); |
119 | 0 | if (itup) |
120 | 0 | _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage); |
121 | 0 | else |
122 | 0 | { |
123 | | /* Utility statement callers can set these fields themselves */ |
124 | 0 | key->heapkeyspace = true; |
125 | 0 | key->allequalimage = false; |
126 | 0 | } |
127 | 0 | key->anynullkeys = false; /* initial assumption */ |
128 | 0 | key->nextkey = false; /* usual case, required by btinsert */ |
129 | 0 | key->backward = false; /* usual case, required by btinsert */ |
130 | 0 | key->keysz = Min(indnkeyatts, tupnatts); |
131 | 0 | key->scantid = key->heapkeyspace && itup ? |
132 | 0 | BTreeTupleGetHeapTID(itup) : NULL; |
133 | 0 | skey = key->scankeys; |
134 | 0 | for (i = 0; i < indnkeyatts; i++) |
135 | 0 | { |
136 | 0 | FmgrInfo *procinfo; |
137 | 0 | Datum arg; |
138 | 0 | bool null; |
139 | 0 | int flags; |
140 | | |
141 | | /* |
142 | | * We can use the cached (default) support procs since no cross-type |
143 | | * comparison can be needed. |
144 | | */ |
145 | 0 | procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); |
146 | | |
147 | | /* |
148 | | * Key arguments built from truncated attributes (or when caller |
149 | | * provides no tuple) are defensively represented as NULL values. They |
150 | | * should never be used. |
151 | | */ |
152 | 0 | if (i < tupnatts) |
153 | 0 | arg = index_getattr(itup, i + 1, itupdesc, &null); |
154 | 0 | else |
155 | 0 | { |
156 | 0 | arg = (Datum) 0; |
157 | 0 | null = true; |
158 | 0 | } |
159 | 0 | flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT); |
160 | 0 | ScanKeyEntryInitializeWithInfo(&skey[i], |
161 | 0 | flags, |
162 | 0 | (AttrNumber) (i + 1), |
163 | 0 | InvalidStrategy, |
164 | 0 | InvalidOid, |
165 | 0 | rel->rd_indcollation[i], |
166 | 0 | procinfo, |
167 | 0 | arg); |
168 | | /* Record if any key attribute is NULL (or truncated) */ |
169 | 0 | if (null) |
170 | 0 | key->anynullkeys = true; |
171 | 0 | } |
172 | | |
173 | | /* |
174 | | * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so |
175 | | * that full uniqueness check is done. |
176 | | */ |
177 | 0 | if (rel->rd_index->indnullsnotdistinct) |
178 | 0 | key->anynullkeys = false; |
179 | |
180 | 0 | return key; |
181 | 0 | } |
182 | | |
183 | | /* |
184 | | * free a retracement stack made by _bt_search. |
185 | | */ |
186 | | void |
187 | | _bt_freestack(BTStack stack) |
188 | 0 | { |
189 | 0 | BTStack ostack; |
190 | |
191 | 0 | while (stack != NULL) |
192 | 0 | { |
193 | 0 | ostack = stack; |
194 | 0 | stack = stack->bts_parent; |
195 | 0 | pfree(ostack); |
196 | 0 | } |
197 | 0 | } |
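
As an illustration of the lifecycle described above (not part of this file): a caller builds the insertion scan key, descends the tree with it, then frees the retracement stack and the palloc'd key. This is a minimal sketch; the _bt_search call and its argument list are assumptions modeled on nbtsearch.c, and example_descend is a hypothetical name.

    /* Hypothetical caller, shown for illustration only */
    static void
    example_descend(Relation rel, IndexTuple itup)
    {
        BTScanInsert key = _bt_mkscankey(rel, itup);
        Buffer      buf;
        BTStack     stack;

        /* assumed _bt_search signature; see nbtsearch.c for the real one */
        stack = _bt_search(rel, key, &buf, BT_READ);

        /* ... locate the insertion point on the leaf page in buf ... */

        _bt_freestack(stack);   /* free the retracement stack */
        _bt_relbuf(rel, buf);   /* drop pin/lock on the leaf page */
        pfree(key);             /* key was palloc'd by _bt_mkscankey */
    }
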
198 | | |
199 | | /* |
200 | | * _bt_compare_array_skey() -- apply array comparison function |
201 | | * |
202 | | * Compares caller's tuple attribute value to a scan key/array element. |
203 | | * Helper function used during binary searches of SK_SEARCHARRAY arrays. |
204 | | * |
205 | | * This routine returns: |
206 | | * <0 if tupdatum < arrdatum; |
207 | | * 0 if tupdatum == arrdatum; |
208 | | * >0 if tupdatum > arrdatum. |
209 | | * |
210 | | * This is essentially the same interface as _bt_compare: both functions |
211 | | * compare the value that they're searching for to a binary search pivot. |
212 | | * However, unlike _bt_compare, this function's "tuple argument" comes first, |
213 | | * while its "array/scankey argument" comes second. |
214 | | */ |
215 | | static inline int32 |
216 | | _bt_compare_array_skey(FmgrInfo *orderproc, |
217 | | Datum tupdatum, bool tupnull, |
218 | | Datum arrdatum, ScanKey cur) |
219 | 0 | { |
220 | 0 | int32 result = 0; |
221 | |
222 | 0 | Assert(cur->sk_strategy == BTEqualStrategyNumber); |
223 | 0 | Assert(!(cur->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL))); |
224 | |
225 | 0 | if (tupnull) /* NULL tupdatum */ |
226 | 0 | { |
227 | 0 | if (cur->sk_flags & SK_ISNULL) |
228 | 0 | result = 0; /* NULL "=" NULL */ |
229 | 0 | else if (cur->sk_flags & SK_BT_NULLS_FIRST) |
230 | 0 | result = -1; /* NULL "<" NOT_NULL */ |
231 | 0 | else |
232 | 0 | result = 1; /* NULL ">" NOT_NULL */ |
233 | 0 | } |
234 | 0 | else if (cur->sk_flags & SK_ISNULL) /* NOT_NULL tupdatum, NULL arrdatum */ |
235 | 0 | { |
236 | 0 | if (cur->sk_flags & SK_BT_NULLS_FIRST) |
237 | 0 | result = 1; /* NOT_NULL ">" NULL */ |
238 | 0 | else |
239 | 0 | result = -1; /* NOT_NULL "<" NULL */ |
240 | 0 | } |
241 | 0 | else |
242 | 0 | { |
243 | | /* |
244 | | * Like _bt_compare, we need to be careful of cross-type comparisons, |
245 | | * so the left value has to be the value that came from an index tuple |
246 | | */ |
247 | 0 | result = DatumGetInt32(FunctionCall2Coll(orderproc, cur->sk_collation, |
248 | 0 | tupdatum, arrdatum)); |
249 | | |
250 | | /* |
251 | | * We flip the sign by following the obvious rule: flip whenever the |
252 | | * column is a DESC column. |
253 | | * |
254 | | * _bt_compare does it the wrong way around (flip when *ASC*) in order |
255 | | * to compensate for passing its orderproc arguments backwards. We |
256 | | * don't need to play these games because we find it natural to pass |
257 | | * tupdatum as the left value (and arrdatum as the right value). |
258 | | */ |
259 | 0 | if (cur->sk_flags & SK_BT_DESC) |
260 | 0 | INVERT_COMPARE_RESULT(result); |
261 | 0 | } |
262 | |
263 | 0 | return result; |
264 | 0 | } |
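
A self-contained sketch of the comparison conventions just described: NULLs sort before or after all non-NULL values depending on NULLS FIRST, and a DESC column simply inverts the three-way result. All names below are illustrative, not nbtree code.

    /* Illustrative three-way comparison; NULL is modeled as a NULL pointer */
    static int
    example_3way_cmp(const int *tup, const int *arr,
                     bool nulls_first, bool desc)
    {
        int     result;

        if (tup == NULL && arr == NULL)
            result = 0;                     /* NULL "=" NULL */
        else if (tup == NULL)
            result = nulls_first ? -1 : 1;  /* NULL vs NOT_NULL */
        else if (arr == NULL)
            result = nulls_first ? 1 : -1;  /* NOT_NULL vs NULL */
        else
            result = (*tup > *arr) - (*tup < *arr);

        /* flip the sign whenever the column is DESC */
        return desc ? -result : result;
    }
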
265 | | |
266 | | /* |
267 | | * _bt_binsrch_array_skey() -- Binary search for next matching array key |
268 | | * |
269 | | * Returns an index to the first array element >= caller's tupdatum argument. |
270 | | * This convention is more natural for forwards scan callers, but that can't |
271 | | * really matter to backwards scan callers. Both callers require handling for |
272 | | * the case where the match we return is < tupdatum, and symmetric handling |
273 | | * for the case where our best match is > tupdatum. |
274 | | * |
275 | | * Also sets *set_elem_result to the result _bt_compare_array_skey returned |
276 | | * when we used it to compare the matching array element to tupdatum/tupnull. |
277 | | * |
278 | | * cur_elem_trig indicates if array advancement was triggered by this array's |
279 | | * scan key, and that the array is for a required scan key. We can apply this |
280 | | * information to find the next matching array element in the current scan |
281 | | * direction using far fewer comparisons (fewer on average, compared to naive |
282 | | * binary search). This scheme takes advantage of an important property of |
283 | | * required arrays: required arrays always advance in lockstep with the index |
284 | | * scan's progress through the index's key space. |
285 | | */ |
286 | | int |
287 | | _bt_binsrch_array_skey(FmgrInfo *orderproc, |
288 | | bool cur_elem_trig, ScanDirection dir, |
289 | | Datum tupdatum, bool tupnull, |
290 | | BTArrayKeyInfo *array, ScanKey cur, |
291 | | int32 *set_elem_result) |
292 | 0 | { |
293 | 0 | int low_elem = 0, |
294 | 0 | mid_elem = -1, |
295 | 0 | high_elem = array->num_elems - 1, |
296 | 0 | result = 0; |
297 | 0 | Datum arrdatum; |
298 | |
299 | 0 | Assert(cur->sk_flags & SK_SEARCHARRAY); |
300 | 0 | Assert(!(cur->sk_flags & SK_BT_SKIP)); |
301 | 0 | Assert(!(cur->sk_flags & SK_ISNULL)); /* SAOP arrays never have NULLs */ |
302 | 0 | Assert(cur->sk_strategy == BTEqualStrategyNumber); |
303 | |
304 | 0 | if (cur_elem_trig) |
305 | 0 | { |
306 | 0 | Assert(!ScanDirectionIsNoMovement(dir)); |
307 | 0 | Assert(cur->sk_flags & SK_BT_REQFWD); |
308 | | |
309 | | /* |
310 | | * When the scan key that triggered array advancement is a required |
311 | | * array scan key, it is now certain that the current array element |
312 | | * (plus all prior elements relative to the current scan direction) |
313 | | * cannot possibly be at or ahead of the corresponding tuple value. |
314 | | * (_bt_checkkeys must have called _bt_tuple_before_array_skeys, which |
315 | | * makes sure this is true as a condition of advancing the arrays.) |
316 | | * |
317 | | * This makes it safe to exclude array elements up to and including |
318 | | * the former-current array element from our search. |
319 | | * |
320 | | * Separately, when array advancement was triggered by a required scan |
321 | | * key, the array element immediately after the former-current element |
322 | | * is often either an exact tupdatum match, or a "close by" near-match |
323 | | * (a near-match tupdatum is one whose key space falls _between_ the |
324 | | * former-current and new-current array elements). We'll detect both |
325 | | * cases via an optimistic comparison of the new search lower bound |
326 | | * (or new search upper bound in the case of backwards scans). |
327 | | */ |
328 | 0 | if (ScanDirectionIsForward(dir)) |
329 | 0 | { |
330 | 0 | low_elem = array->cur_elem + 1; /* old cur_elem exhausted */ |
331 | | |
332 | | /* Compare prospective new cur_elem (also the new lower bound) */ |
333 | 0 | if (high_elem >= low_elem) |
334 | 0 | { |
335 | 0 | arrdatum = array->elem_values[low_elem]; |
336 | 0 | result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, |
337 | 0 | arrdatum, cur); |
338 | |
|
339 | 0 | if (result <= 0) |
340 | 0 | { |
341 | | /* Optimistic comparison optimization worked out */ |
342 | 0 | *set_elem_result = result; |
343 | 0 | return low_elem; |
344 | 0 | } |
345 | 0 | mid_elem = low_elem; |
346 | 0 | low_elem++; /* this cur_elem exhausted, too */ |
347 | 0 | } |
348 | | |
349 | 0 | if (high_elem < low_elem) |
350 | 0 | { |
351 | | /* Caller needs to perform "beyond end" array advancement */ |
352 | 0 | *set_elem_result = 1; |
353 | 0 | return high_elem; |
354 | 0 | } |
355 | 0 | } |
356 | 0 | else |
357 | 0 | { |
358 | 0 | high_elem = array->cur_elem - 1; /* old cur_elem exhausted */ |
359 | | |
360 | | /* Compare prospective new cur_elem (also the new upper bound) */ |
361 | 0 | if (high_elem >= low_elem) |
362 | 0 | { |
363 | 0 | arrdatum = array->elem_values[high_elem]; |
364 | 0 | result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, |
365 | 0 | arrdatum, cur); |
366 | |
367 | 0 | if (result >= 0) |
368 | 0 | { |
369 | | /* Optimistic comparison optimization worked out */ |
370 | 0 | *set_elem_result = result; |
371 | 0 | return high_elem; |
372 | 0 | } |
373 | 0 | mid_elem = high_elem; |
374 | 0 | high_elem--; /* this cur_elem exhausted, too */ |
375 | 0 | } |
376 | | |
377 | 0 | if (high_elem < low_elem) |
378 | 0 | { |
379 | | /* Caller needs to perform "beyond end" array advancement */ |
380 | 0 | *set_elem_result = -1; |
381 | 0 | return low_elem; |
382 | 0 | } |
383 | 0 | } |
384 | 0 | } |
385 | | |
386 | 0 | while (high_elem > low_elem) |
387 | 0 | { |
388 | 0 | mid_elem = low_elem + ((high_elem - low_elem) / 2); |
389 | 0 | arrdatum = array->elem_values[mid_elem]; |
390 | |
391 | 0 | result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, |
392 | 0 | arrdatum, cur); |
393 | |
394 | 0 | if (result == 0) |
395 | 0 | { |
396 | | /* |
397 | | * It's safe to quit as soon as we see an equal array element. |
398 | | * This often saves an extra comparison or two... |
399 | | */ |
400 | 0 | low_elem = mid_elem; |
401 | 0 | break; |
402 | 0 | } |
403 | | |
404 | 0 | if (result > 0) |
405 | 0 | low_elem = mid_elem + 1; |
406 | 0 | else |
407 | 0 | high_elem = mid_elem; |
408 | 0 | } |
409 | | |
410 | | /* |
411 | | * ...but our caller also cares about how its searched-for tuple datum |
412 | | * compares to the low_elem datum. Must always set *set_elem_result with |
413 | | * the result of that comparison specifically. |
414 | | */ |
415 | 0 | if (low_elem != mid_elem) |
416 | 0 | result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, |
417 | 0 | array->elem_values[low_elem], cur); |
418 | |
419 | 0 | *set_elem_result = result; |
420 | |
421 | 0 | return low_elem; |
422 | 0 | } |
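
The scheme used by _bt_binsrch_array_skey can be sketched with a plain sorted int array and a forward scan: probe the element immediately after cur_elem first, and only fall back to a conventional lower-bound binary search when the probe shows that the target lies further ahead. Hypothetical names throughout; this is a sketch, not the nbtree implementation.

    /* Illustrative lower-bound search with the optimistic probe */
    static int
    example_binsrch_next(const int *elems, int nelems, int cur_elem,
                         int target, int *set_elem_result)
    {
        int     low = cur_elem + 1;     /* old cur_elem is already exhausted */
        int     high = nelems - 1;

        if (low > high)
        {
            *set_elem_result = 1;   /* caller handles "beyond end" advancement */
            return high;
        }

        /* optimistic probe of the very next element often avoids the search */
        if (target <= elems[low])
        {
            *set_elem_result = (target < elems[low]) ? -1 : 0;
            return low;
        }
        low++;                      /* probed element is exhausted, too */

        while (low < high)
        {
            int     mid = low + (high - low) / 2;

            if (target > elems[mid])
                low = mid + 1;
            else
                high = mid;
        }

        if (low > high)
        {
            *set_elem_result = 1;   /* target is past the final element */
            return high;
        }
        *set_elem_result = (target > elems[low]) ? 1 :
            (target < elems[low]) ? -1 : 0;
        return low;
    }
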
423 | | |
424 | | /* |
425 | | * _bt_binsrch_skiparray_skey() -- "Binary search" within a skip array |
426 | | * |
427 | | * Does not return an index into the array, since skip arrays don't really |
428 | | * contain elements (they generate their array elements procedurally instead). |
429 | | * Our interface matches that of _bt_binsrch_array_skey in every other way. |
430 | | * |
431 | | * Sets *set_elem_result just like _bt_binsrch_array_skey would with a true |
432 | | * array. The value 0 indicates that tupdatum/tupnull is within the range of |
433 | | * the skip array. We return -1 when tupdatum/tupnull is lower than any value |
434 | | * within the range of the array, and 1 when it is higher than every value. |
435 | | * Caller should pass *set_elem_result to _bt_skiparray_set_element to advance |
436 | | * the array. |
437 | | * |
438 | | * cur_elem_trig indicates if array advancement was triggered by this array's |
439 | | * scan key. We use this to optimize-away comparisons that are known by our |
440 | | * caller to be unnecessary from context, just like _bt_binsrch_array_skey. |
441 | | */ |
442 | | static void |
443 | | _bt_binsrch_skiparray_skey(bool cur_elem_trig, ScanDirection dir, |
444 | | Datum tupdatum, bool tupnull, |
445 | | BTArrayKeyInfo *array, ScanKey cur, |
446 | | int32 *set_elem_result) |
447 | 0 | { |
448 | 0 | Assert(cur->sk_flags & SK_BT_SKIP); |
449 | 0 | Assert(cur->sk_flags & SK_SEARCHARRAY); |
450 | 0 | Assert(cur->sk_flags & SK_BT_REQFWD); |
451 | 0 | Assert(array->num_elems == -1); |
452 | 0 | Assert(!ScanDirectionIsNoMovement(dir)); |
453 | |
454 | 0 | if (array->null_elem) |
455 | 0 | { |
456 | 0 | Assert(!array->low_compare && !array->high_compare); |
457 | |
458 | 0 | *set_elem_result = 0; |
459 | 0 | return; |
460 | 0 | } |
461 | | |
462 | 0 | if (tupnull) /* NULL tupdatum */ |
463 | 0 | { |
464 | 0 | if (cur->sk_flags & SK_BT_NULLS_FIRST) |
465 | 0 | *set_elem_result = -1; /* NULL "<" NOT_NULL */ |
466 | 0 | else |
467 | 0 | *set_elem_result = 1; /* NULL ">" NOT_NULL */ |
468 | 0 | return; |
469 | 0 | } |
470 | | |
471 | | /* |
472 | | * Array inequalities determine whether tupdatum is within the range of |
473 | | * caller's skip array |
474 | | */ |
475 | 0 | *set_elem_result = 0; |
476 | 0 | if (ScanDirectionIsForward(dir)) |
477 | 0 | { |
478 | | /* |
479 | | * Evaluate low_compare first (unless cur_elem_trig tells us that it |
480 | | * cannot possibly fail to be satisfied), then evaluate high_compare |
481 | | */ |
482 | 0 | if (!cur_elem_trig && array->low_compare && |
483 | 0 | !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, |
484 | 0 | array->low_compare->sk_collation, |
485 | 0 | tupdatum, |
486 | 0 | array->low_compare->sk_argument))) |
487 | 0 | *set_elem_result = -1; |
488 | 0 | else if (array->high_compare && |
489 | 0 | !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, |
490 | 0 | array->high_compare->sk_collation, |
491 | 0 | tupdatum, |
492 | 0 | array->high_compare->sk_argument))) |
493 | 0 | *set_elem_result = 1; |
494 | 0 | } |
495 | 0 | else |
496 | 0 | { |
497 | | /* |
498 | | * Evaluate high_compare first (unless cur_elem_trig tells us that it |
499 | | * cannot possibly fail to be satisfied), then evaluate low_compare |
500 | | */ |
501 | 0 | if (!cur_elem_trig && array->high_compare && |
502 | 0 | !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, |
503 | 0 | array->high_compare->sk_collation, |
504 | 0 | tupdatum, |
505 | 0 | array->high_compare->sk_argument))) |
506 | 0 | *set_elem_result = 1; |
507 | 0 | else if (array->low_compare && |
508 | 0 | !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, |
509 | 0 | array->low_compare->sk_collation, |
510 | 0 | tupdatum, |
511 | 0 | array->low_compare->sk_argument))) |
512 | 0 | *set_elem_result = -1; |
513 | 0 | } |
514 | | |
515 | | /* |
516 | | * Assert that any keys that were assumed to be satisfied already (due to |
517 | | * caller passing cur_elem_trig=true) really are satisfied as expected |
518 | | */ |
519 | | #ifdef USE_ASSERT_CHECKING |
520 | | if (cur_elem_trig) |
521 | | { |
522 | | if (ScanDirectionIsForward(dir) && array->low_compare) |
523 | | Assert(DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, |
524 | | array->low_compare->sk_collation, |
525 | | tupdatum, |
526 | | array->low_compare->sk_argument))); |
527 | | |
528 | | if (ScanDirectionIsBackward(dir) && array->high_compare) |
529 | | Assert(DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, |
530 | | array->high_compare->sk_collation, |
531 | | tupdatum, |
532 | | array->high_compare->sk_argument))); |
533 | | } |
534 | | #endif |
535 | 0 | } |
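
A self-contained sketch of the -1/0/1 classification performed above, with int bounds standing in for the optional low_compare/high_compare inequality scan keys. Illustrative names only.

    /* Bounds stand in for the optional low_compare/high_compare keys */
    static int
    example_range_classify(int tupdatum,
                           bool has_low, int low_bound,
                           bool has_high, int high_bound)
    {
        if (has_low && tupdatum < low_bound)
            return -1;              /* below every value in the array's range */
        if (has_high && tupdatum > high_bound)
            return 1;               /* above every value in the array's range */
        return 0;                   /* in range: advance array to tupdatum */
    }
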
536 | | |
537 | | /* |
538 | | * _bt_skiparray_set_element() -- Set skip array scan key's sk_argument |
539 | | * |
540 | | * Caller passes set_elem_result returned by _bt_binsrch_skiparray_skey for |
541 | | * caller's tupdatum/tupnull. |
542 | | * |
543 | | * We copy tupdatum/tupnull into skey's sk_argument iff set_elem_result == 0. |
544 | | * Otherwise, we set skey to either the lowest or highest value that's within |
545 | | * the range of caller's skip array (whichever is the best available match to |
546 | | * tupdatum/tupnull that is still within the range of the skip array according |
547 | | * to _bt_binsrch_skiparray_skey/set_elem_result). |
548 | | */ |
549 | | static void |
550 | | _bt_skiparray_set_element(Relation rel, ScanKey skey, BTArrayKeyInfo *array, |
551 | | int32 set_elem_result, Datum tupdatum, bool tupnull) |
552 | 0 | { |
553 | 0 | Assert(skey->sk_flags & SK_BT_SKIP); |
554 | 0 | Assert(skey->sk_flags & SK_SEARCHARRAY); |
555 | |
556 | 0 | if (set_elem_result) |
557 | 0 | { |
558 | | /* tupdatum/tupnull is out of the range of the skip array */ |
559 | 0 | Assert(!array->null_elem); |
560 | |
561 | 0 | _bt_array_set_low_or_high(rel, skey, array, set_elem_result < 0); |
562 | 0 | return; |
563 | 0 | } |
564 | | |
565 | | /* Advance skip array to tupdatum (or tupnull) value */ |
566 | 0 | if (unlikely(tupnull)) |
567 | 0 | { |
568 | 0 | _bt_skiparray_set_isnull(rel, skey, array); |
569 | 0 | return; |
570 | 0 | } |
571 | | |
572 | | /* Free memory previously allocated for sk_argument if needed */ |
573 | 0 | if (!array->attbyval && skey->sk_argument) |
574 | 0 | pfree(DatumGetPointer(skey->sk_argument)); |
575 | | |
576 | | /* tupdatum becomes new sk_argument/new current element */ |
577 | 0 | skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | |
578 | 0 | SK_BT_MINVAL | SK_BT_MAXVAL | |
579 | 0 | SK_BT_NEXT | SK_BT_PRIOR); |
580 | 0 | skey->sk_argument = datumCopy(tupdatum, array->attbyval, array->attlen); |
581 | 0 | } |
582 | | |
583 | | /* |
584 | | * _bt_skiparray_set_isnull() -- set skip array scan key to NULL |
585 | | */ |
586 | | static void |
587 | | _bt_skiparray_set_isnull(Relation rel, ScanKey skey, BTArrayKeyInfo *array) |
588 | 0 | { |
589 | 0 | Assert(skey->sk_flags & SK_BT_SKIP); |
590 | 0 | Assert(skey->sk_flags & SK_SEARCHARRAY); |
591 | 0 | Assert(array->null_elem && !array->low_compare && !array->high_compare); |
592 | | |
593 | | /* Free memory previously allocated for sk_argument if needed */ |
594 | 0 | if (!array->attbyval && skey->sk_argument) |
595 | 0 | pfree(DatumGetPointer(skey->sk_argument)); |
596 | | |
597 | | /* NULL becomes new sk_argument/new current element */ |
598 | 0 | skey->sk_argument = (Datum) 0; |
599 | 0 | skey->sk_flags &= ~(SK_BT_MINVAL | SK_BT_MAXVAL | |
600 | 0 | SK_BT_NEXT | SK_BT_PRIOR); |
601 | 0 | skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); |
602 | 0 | } |
603 | | |
604 | | /* |
605 | | * _bt_start_array_keys() -- Initialize array keys at start of a scan |
606 | | * |
607 | | * Set up the cur_elem counters and fill in the first sk_argument value for |
608 | | * each array scankey. |
609 | | */ |
610 | | void |
611 | | _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) |
612 | 0 | { |
613 | 0 | Relation rel = scan->indexRelation; |
614 | 0 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
615 | |
616 | 0 | Assert(so->numArrayKeys); |
617 | 0 | Assert(so->qual_ok); |
618 | |
619 | 0 | for (int i = 0; i < so->numArrayKeys; i++) |
620 | 0 | { |
621 | 0 | BTArrayKeyInfo *array = &so->arrayKeys[i]; |
622 | 0 | ScanKey skey = &so->keyData[array->scan_key]; |
623 | |
624 | 0 | Assert(skey->sk_flags & SK_SEARCHARRAY); |
625 | |
626 | 0 | _bt_array_set_low_or_high(rel, skey, array, |
627 | 0 | ScanDirectionIsForward(dir)); |
628 | 0 | } |
629 | 0 | so->scanBehind = so->oppositeDirCheck = false; /* reset */ |
630 | 0 | } |
631 | | |
632 | | /* |
633 | | * _bt_array_set_low_or_high() -- Set array scan key to lowest/highest element |
634 | | * |
635 | | * Caller also passes associated scan key, which will have its argument set to |
636 | | * the lowest/highest array value in passing. |
637 | | */ |
638 | | static void |
639 | | _bt_array_set_low_or_high(Relation rel, ScanKey skey, BTArrayKeyInfo *array, |
640 | | bool low_not_high) |
641 | 0 | { |
642 | 0 | Assert(skey->sk_flags & SK_SEARCHARRAY); |
643 | |
644 | 0 | if (array->num_elems != -1) |
645 | 0 | { |
646 | | /* set low or high element for SAOP array */ |
647 | 0 | int set_elem = 0; |
648 | |
649 | 0 | Assert(!(skey->sk_flags & SK_BT_SKIP)); |
650 | |
651 | 0 | if (!low_not_high) |
652 | 0 | set_elem = array->num_elems - 1; |
653 | | |
654 | | /* |
655 | | * Just copy over array datum (only skip arrays require freeing and |
656 | | * allocating memory for sk_argument) |
657 | | */ |
658 | 0 | array->cur_elem = set_elem; |
659 | 0 | skey->sk_argument = array->elem_values[set_elem]; |
660 | |
661 | 0 | return; |
662 | 0 | } |
663 | | |
664 | | /* set low or high element for skip array */ |
665 | 0 | Assert(skey->sk_flags & SK_BT_SKIP); |
666 | 0 | Assert(array->num_elems == -1); |
667 | | |
668 | | /* Free memory previously allocated for sk_argument if needed */ |
669 | 0 | if (!array->attbyval && skey->sk_argument) |
670 | 0 | pfree(DatumGetPointer(skey->sk_argument)); |
671 | | |
672 | | /* Reset flags */ |
673 | 0 | skey->sk_argument = (Datum) 0; |
674 | 0 | skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL | |
675 | 0 | SK_BT_MINVAL | SK_BT_MAXVAL | |
676 | 0 | SK_BT_NEXT | SK_BT_PRIOR); |
677 | |
678 | 0 | if (array->null_elem && |
679 | 0 | (low_not_high == ((skey->sk_flags & SK_BT_NULLS_FIRST) != 0))) |
680 | 0 | { |
681 | | /* Requested element (either lowest or highest) has the value NULL */ |
682 | 0 | skey->sk_flags |= (SK_SEARCHNULL | SK_ISNULL); |
683 | 0 | } |
684 | 0 | else if (low_not_high) |
685 | 0 | { |
686 | | /* Setting array to lowest element (according to low_compare) */ |
687 | 0 | skey->sk_flags |= SK_BT_MINVAL; |
688 | 0 | } |
689 | 0 | else |
690 | 0 | { |
691 | | /* Setting array to highest element (according to high_compare) */ |
692 | 0 | skey->sk_flags |= SK_BT_MAXVAL; |
693 | 0 | } |
694 | 0 | } |
695 | | |
696 | | /* |
697 | | * _bt_array_decrement() -- decrement array scan key's sk_argument |
698 | | * |
699 | | * Return value indicates whether caller's array was successfully decremented. |
700 | | * Cannot decrement an array whose current element is already the first one. |
701 | | */ |
702 | | static bool |
703 | | _bt_array_decrement(Relation rel, ScanKey skey, BTArrayKeyInfo *array) |
704 | 0 | { |
705 | 0 | bool uflow = false; |
706 | 0 | Datum dec_sk_argument; |
707 | |
708 | 0 | Assert(skey->sk_flags & SK_SEARCHARRAY); |
709 | 0 | Assert(!(skey->sk_flags & (SK_BT_MAXVAL | SK_BT_NEXT | SK_BT_PRIOR))); |
710 | | |
711 | | /* SAOP array? */ |
712 | 0 | if (array->num_elems != -1) |
713 | 0 | { |
714 | 0 | Assert(!(skey->sk_flags & (SK_BT_SKIP | SK_BT_MINVAL | SK_BT_MAXVAL))); |
715 | 0 | if (array->cur_elem > 0) |
716 | 0 | { |
717 | | /* |
718 | | * Just decrement current element, and assign its datum to skey |
719 | | * (only skip arrays need us to free existing sk_argument memory) |
720 | | */ |
721 | 0 | array->cur_elem--; |
722 | 0 | skey->sk_argument = array->elem_values[array->cur_elem]; |
723 | | |
724 | | /* Successfully decremented array */ |
725 | 0 | return true; |
726 | 0 | } |
727 | | |
728 | | /* Cannot decrement to before first array element */ |
729 | 0 | return false; |
730 | 0 | } |
731 | | |
732 | | /* Nope, this is a skip array */ |
733 | 0 | Assert(skey->sk_flags & SK_BT_SKIP); |
734 | | |
735 | | /* |
736 | | * The sentinel value that represents the minimum value within the range |
737 | | * of a skip array (often just -inf) is never decrementable |
738 | | */ |
739 | 0 | if (skey->sk_flags & SK_BT_MINVAL) |
740 | 0 | return false; |
741 | | |
742 | | /* |
743 | | * When the current array element is NULL, and the lowest sorting value in |
744 | | * the index is also NULL, we cannot decrement before first array element |
745 | | */ |
746 | 0 | if ((skey->sk_flags & SK_ISNULL) && (skey->sk_flags & SK_BT_NULLS_FIRST)) |
747 | 0 | return false; |
748 | | |
749 | | /* |
750 | | * Opclasses without skip support "decrement" the scan key's current |
751 | | * element by setting the PRIOR flag. The true prior value is determined |
752 | | * by repositioning to the last index tuple < existing sk_argument/current |
753 | | * array element. Note that this works in the usual way when the scan key |
754 | | * is already marked ISNULL (i.e. when the current element is NULL). |
755 | | */ |
756 | 0 | if (!array->sksup) |
757 | 0 | { |
758 | | /* Successfully "decremented" array */ |
759 | 0 | skey->sk_flags |= SK_BT_PRIOR; |
760 | 0 | return true; |
761 | 0 | } |
762 | | |
763 | | /* |
764 | | * Opclasses with skip support directly decrement sk_argument |
765 | | */ |
766 | 0 | if (skey->sk_flags & SK_ISNULL) |
767 | 0 | { |
768 | 0 | Assert(!(skey->sk_flags & SK_BT_NULLS_FIRST)); |
769 | |
771 | | * Existing sk_argument/array element is NULL (for an IS NULL qual). |
772 | | * |
773 | | * "Decrement" from NULL to the high_elem value provided by opclass |
774 | | * skip support routine. |
775 | | */ |
776 | 0 | skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL); |
777 | 0 | skey->sk_argument = datumCopy(array->sksup->high_elem, |
778 | 0 | array->attbyval, array->attlen); |
779 | 0 | return true; |
780 | 0 | } |
781 | | |
782 | | /* |
783 | | * Ask opclass support routine to provide decremented copy of existing |
784 | | * non-NULL sk_argument |
785 | | */ |
786 | 0 | dec_sk_argument = array->sksup->decrement(rel, skey->sk_argument, &uflow); |
787 | 0 | if (unlikely(uflow)) |
788 | 0 | { |
789 | | /* dec_sk_argument has undefined value (so no pfree) */ |
790 | 0 | if (array->null_elem && (skey->sk_flags & SK_BT_NULLS_FIRST)) |
791 | 0 | { |
792 | 0 | _bt_skiparray_set_isnull(rel, skey, array); |
793 | |
795 | 0 | return true; |
796 | 0 | } |
797 | | |
798 | | /* Cannot decrement to before first array element */ |
799 | 0 | return false; |
800 | 0 | } |
801 | | |
802 | | /* |
803 | | * Successfully decremented sk_argument to a non-NULL value. Make sure |
804 | | * that the decremented value is still within the range of the array. |
805 | | */ |
806 | 0 | if (array->low_compare && |
807 | 0 | !DatumGetBool(FunctionCall2Coll(&array->low_compare->sk_func, |
808 | 0 | array->low_compare->sk_collation, |
809 | 0 | dec_sk_argument, |
810 | 0 | array->low_compare->sk_argument))) |
811 | 0 | { |
812 | | /* Keep existing sk_argument after all */ |
813 | 0 | if (!array->attbyval) |
814 | 0 | pfree(DatumGetPointer(dec_sk_argument)); |
815 | |
817 | 0 | return false; |
818 | 0 | } |
819 | | |
820 | | /* Accept value returned by opclass decrement callback */ |
821 | 0 | if (!array->attbyval && skey->sk_argument) |
822 | 0 | pfree(DatumGetPointer(skey->sk_argument)); |
823 | 0 | skey->sk_argument = dec_sk_argument; |
824 | |
826 | 0 | return true; |
827 | 0 | } |
828 | | |
829 | | /* |
830 | | * _bt_array_increment() -- increment array scan key's sk_argument |
831 | | * |
832 | | * Return value indicates whether caller's array was successfully incremented. |
833 | | * Cannot increment an array whose current element is already the final one. |
834 | | */ |
835 | | static bool |
836 | | _bt_array_increment(Relation rel, ScanKey skey, BTArrayKeyInfo *array) |
837 | 0 | { |
838 | 0 | bool oflow = false; |
839 | 0 | Datum inc_sk_argument; |
840 | |
841 | 0 | Assert(skey->sk_flags & SK_SEARCHARRAY); |
842 | 0 | Assert(!(skey->sk_flags & (SK_BT_MINVAL | SK_BT_NEXT | SK_BT_PRIOR))); |
843 | | |
844 | | /* SAOP array? */ |
845 | 0 | if (array->num_elems != -1) |
846 | 0 | { |
847 | 0 | Assert(!(skey->sk_flags & (SK_BT_SKIP | SK_BT_MINVAL | SK_BT_MAXVAL))); |
848 | 0 | if (array->cur_elem < array->num_elems - 1) |
849 | 0 | { |
850 | | /* |
851 | | * Just increment current element, and assign its datum to skey |
852 | | * (only skip arrays need us to free existing sk_argument memory) |
853 | | */ |
854 | 0 | array->cur_elem++; |
855 | 0 | skey->sk_argument = array->elem_values[array->cur_elem]; |
856 | | |
857 | | /* Successfully incremented array */ |
858 | 0 | return true; |
859 | 0 | } |
860 | | |
861 | | /* Cannot increment past final array element */ |
862 | 0 | return false; |
863 | 0 | } |
864 | | |
865 | | /* Nope, this is a skip array */ |
866 | 0 | Assert(skey->sk_flags & SK_BT_SKIP); |
867 | | |
868 | | /* |
869 | | * The sentinel value that represents the maximum value within the range |
870 | | * of a skip array (often just +inf) is never incrementable |
871 | | */ |
872 | 0 | if (skey->sk_flags & SK_BT_MAXVAL) |
873 | 0 | return false; |
874 | | |
875 | | /* |
876 | | * When the current array element is NULL, and the highest sorting value |
877 | | * in the index is also NULL, we cannot increment past the final element |
878 | | */ |
879 | 0 | if ((skey->sk_flags & SK_ISNULL) && !(skey->sk_flags & SK_BT_NULLS_FIRST)) |
880 | 0 | return false; |
881 | | |
882 | | /* |
883 | | * Opclasses without skip support "increment" the scan key's current |
884 | | * element by setting the NEXT flag. The true next value is determined by |
885 | | * repositioning to the first index tuple > existing sk_argument/current |
886 | | * array element. Note that this works in the usual way when the scan key |
887 | | * is already marked ISNULL (i.e. when the current element is NULL). |
888 | | */ |
889 | 0 | if (!array->sksup) |
890 | 0 | { |
891 | | /* Successfully "incremented" array */ |
892 | 0 | skey->sk_flags |= SK_BT_NEXT; |
893 | 0 | return true; |
894 | 0 | } |
895 | | |
896 | | /* |
897 | | * Opclasses with skip support directly increment sk_argument |
898 | | */ |
899 | 0 | if (skey->sk_flags & SK_ISNULL) |
900 | 0 | { |
901 | 0 | Assert(skey->sk_flags & SK_BT_NULLS_FIRST); |
902 | |
904 | | * Existing sk_argument/array element is NULL (for an IS NULL qual). |
905 | | * |
906 | | * "Increment" from NULL to the low_elem value provided by opclass |
907 | | * skip support routine. |
908 | | */ |
909 | 0 | skey->sk_flags &= ~(SK_SEARCHNULL | SK_ISNULL); |
910 | 0 | skey->sk_argument = datumCopy(array->sksup->low_elem, |
911 | 0 | array->attbyval, array->attlen); |
912 | 0 | return true; |
913 | 0 | } |
914 | | |
915 | | /* |
916 | | * Ask opclass support routine to provide incremented copy of existing |
917 | | * non-NULL sk_argument |
918 | | */ |
919 | 0 | inc_sk_argument = array->sksup->increment(rel, skey->sk_argument, &oflow); |
920 | 0 | if (unlikely(oflow)) |
921 | 0 | { |
922 | | /* inc_sk_argument has undefined value (so no pfree) */ |
923 | 0 | if (array->null_elem && !(skey->sk_flags & SK_BT_NULLS_FIRST)) |
924 | 0 | { |
925 | 0 | _bt_skiparray_set_isnull(rel, skey, array); |
926 | |
928 | 0 | return true; |
929 | 0 | } |
930 | | |
931 | | /* Cannot increment past final array element */ |
932 | 0 | return false; |
933 | 0 | } |
934 | | |
935 | | /* |
936 | | * Successfully incremented sk_argument to a non-NULL value. Make sure |
937 | | * that the incremented value is still within the range of the array. |
938 | | */ |
939 | 0 | if (array->high_compare && |
940 | 0 | !DatumGetBool(FunctionCall2Coll(&array->high_compare->sk_func, |
941 | 0 | array->high_compare->sk_collation, |
942 | 0 | inc_sk_argument, |
943 | 0 | array->high_compare->sk_argument))) |
944 | 0 | { |
945 | | /* Keep existing sk_argument after all */ |
946 | 0 | if (!array->attbyval) |
947 | 0 | pfree(DatumGetPointer(inc_sk_argument)); |
948 | | |
949 | | /* Cannot increment past final array element */ |
950 | 0 | return false; |
951 | 0 | } |
952 | |
954 | 0 | if (!array->attbyval && skey->sk_argument) |
955 | 0 | pfree(DatumGetPointer(skey->sk_argument)); |
956 | 0 | skey->sk_argument = inc_sk_argument; |
957 | |
959 | 0 | return true; |
960 | 0 | } |
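
The overflow protocol expected of an opclass skip support increment callback can be sketched for plain ints as follows; decrement callbacks mirror it with an underflow flag. Illustrative only, assuming the surrounding PostgreSQL headers (limits.h comes in via postgres.h).

    /* Illustrative callback; real opclasses register theirs via skip support */
    static int
    example_increment_cb(int current, bool *oflow)
    {
        if (current == INT_MAX)
        {
            *oflow = true;          /* no next value; return value undefined */
            return 0;
        }
        *oflow = false;
        return current + 1;
    }
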
961 | | |
962 | | /* |
963 | | * _bt_advance_array_keys_increment() -- Advance to next set of array elements |
964 | | * |
965 | | * Advances the array keys by a single increment in the current scan |
966 | | * direction. When there are multiple array keys this can roll over from the |
967 | | * lowest order array to higher order arrays. |
968 | | * |
969 | | * Returns true if there is another set of values to consider, false if not. |
970 | | * On true result, the scankeys are initialized with the next set of values. |
971 | | * On false result, the scankeys stay the same, and the array keys are not |
972 | | * advanced (every array remains at its final element for the current scan direction). |
973 | | */ |
974 | | static bool |
975 | | _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir, |
976 | | bool *skip_array_set) |
977 | 0 | { |
978 | 0 | Relation rel = scan->indexRelation; |
979 | 0 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
980 | | |
981 | | /* |
982 | | * We must advance the last array key most quickly, since it will |
983 | | * correspond to the lowest-order index column among the available |
984 | | * qualifications |
985 | | */ |
986 | 0 | for (int i = so->numArrayKeys - 1; i >= 0; i--) |
987 | 0 | { |
988 | 0 | BTArrayKeyInfo *array = &so->arrayKeys[i]; |
989 | 0 | ScanKey skey = &so->keyData[array->scan_key]; |
990 | |
991 | 0 | if (array->num_elems == -1) |
992 | 0 | *skip_array_set = true; |
993 | |
994 | 0 | if (ScanDirectionIsForward(dir)) |
995 | 0 | { |
996 | 0 | if (_bt_array_increment(rel, skey, array)) |
997 | 0 | return true; |
998 | 0 | } |
999 | 0 | else |
1000 | 0 | { |
1001 | 0 | if (_bt_array_decrement(rel, skey, array)) |
1002 | 0 | return true; |
1003 | 0 | } |
1004 | | |
1005 | | /* |
1006 | | * Couldn't increment (or decrement) array. Handle array roll over. |
1007 | | * |
1008 | | * Start over at the array's lowest sorting value (or its highest |
1009 | | * value, for backward scans)... |
1010 | | */ |
1011 | 0 | _bt_array_set_low_or_high(rel, skey, array, |
1012 | 0 | ScanDirectionIsForward(dir)); |
1013 | | |
1014 | | /* ...then increment (or decrement) next most significant array */ |
1015 | 0 | } |
1016 | | |
1017 | | /* |
1018 | | * The array keys are now exhausted. |
1019 | | * |
1020 | | * Restore the array keys to the state they were in immediately before we |
1021 | | * were called. This ensures that the arrays only ever ratchet in the |
1022 | | * current scan direction. |
1023 | | * |
1024 | | * Without this, scans could overlook matching tuples when the scan |
1025 | | * direction gets reversed just before btgettuple runs out of items to |
1026 | | * return, but just after _bt_readpage prepares all the items from the |
1027 | | * scan's final page in so->currPos. When we're on the final page it is |
1028 | | * typical for so->currPos to get invalidated once btgettuple finally |
1029 | | * returns false, which'll effectively invalidate the scan's array keys. |
1030 | | * That hasn't happened yet, though -- and in general it may never happen. |
1031 | | */ |
1032 | 0 | _bt_start_array_keys(scan, -dir); |
1033 | |
1034 | 0 | return false; |
1035 | 0 | } |
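
The roll-over loop above behaves like an odometer: advance the least significant array first, resetting each exhausted array to its start element and carrying into the next most significant one. A self-contained sketch with hypothetical names:

    /* Returns false when every array was already at its final element */
    static bool
    example_odometer_increment(int *cur_elems, const int *num_elems, int narrays)
    {
        for (int i = narrays - 1; i >= 0; i--)
        {
            if (cur_elems[i] < num_elems[i] - 1)
            {
                cur_elems[i]++;     /* successfully incremented this array */
                return true;
            }
            cur_elems[i] = 0;       /* roll over, carry into next array */
        }
        return false;               /* all arrays exhausted */
    }
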
1036 | | |
1037 | | /* |
1038 | | * _bt_rewind_nonrequired_arrays() -- Rewind SAOP arrays not marked required |
1039 | | * |
1040 | | * Called when _bt_advance_array_keys decides to start a new primitive index |
1041 | | * scan on the basis of the current scan position being before the position |
1042 | | * that _bt_first is capable of repositioning the scan to by applying an |
1043 | | * inequality operator required in the opposite-to-scan direction only. |
1044 | | * |
1045 | | * Although equality strategy scan keys (for both arrays and non-arrays alike) |
1046 | | * are either marked required in both directions or in neither direction, |
1047 | | * there is a sense in which non-required arrays behave like required arrays. |
1048 | | * With a qual such as "WHERE a IN (100, 200) AND b >= 3 AND c IN (5, 6, 7)", |
1049 | | * the scan key on "c" is non-required, but nevertheless enables positioning |
1050 | | * the scan at the first tuple >= "(100, 3, 5)" on the leaf level during the |
1051 | | * first descent of the tree by _bt_first. Later on, there could also be a |
1052 | | * second descent, that places the scan right before tuples >= "(200, 3, 5)". |
1053 | | * _bt_first must never be allowed to build an insertion scan key whose "c" |
1054 | | * entry is set to a value other than 5, the "c" array's first element/value. |
1055 | | * (Actually, it's the first in the current scan direction. This example uses |
1056 | | * a forward scan.) |
1057 | | * |
1058 | | * Calling here resets the array scan key elements for the scan's non-required |
1059 | | * arrays. This is strictly necessary for correctness in a subset of cases |
1060 | | * involving "required in opposite direction"-triggered primitive index scans. |
1061 | | * Not all callers are at risk of _bt_first using a non-required array like |
1062 | | * this, but advancement always resets the arrays when another primitive scan |
1063 | | * is scheduled, just to keep things simple. Array advancement even makes |
1064 | | * sure to reset non-required arrays during scans that have no inequalities. |
1065 | | * (Advancement still won't call here when there are no inequalities, though |
1066 | | * that's just because it's all handled indirectly instead.) |
1067 | | * |
1068 | | * Note: _bt_verify_arrays_bt_first is called by an assertion to enforce that |
1069 | | * everybody got this right. |
1070 | | * |
1071 | | * Note: In practice almost all SAOP arrays are marked required during |
1072 | | * preprocessing (if necessary by generating skip arrays). It is hardly ever |
1073 | | * truly necessary to call here, but consistently doing so is simpler. |
1074 | | */ |
1075 | | static void |
1076 | | _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir) |
1077 | 0 | { |
1078 | 0 | Relation rel = scan->indexRelation; |
1079 | 0 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
1080 | 0 | int arrayidx = 0; |
1081 | |
1082 | 0 | for (int ikey = 0; ikey < so->numberOfKeys; ikey++) |
1083 | 0 | { |
1084 | 0 | ScanKey cur = so->keyData + ikey; |
1085 | 0 | BTArrayKeyInfo *array = NULL; |
1086 | |
1087 | 0 | if (!(cur->sk_flags & SK_SEARCHARRAY) || |
1088 | 0 | cur->sk_strategy != BTEqualStrategyNumber) |
1089 | 0 | continue; |
1090 | | |
1091 | 0 | array = &so->arrayKeys[arrayidx++]; |
1092 | 0 | Assert(array->scan_key == ikey); |
1093 | |
1094 | 0 | if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) |
1095 | 0 | continue; |
1096 | | |
1097 | 0 | Assert(array->num_elems != -1); /* No non-required skip arrays */ |
1098 | |
1099 | 0 | _bt_array_set_low_or_high(rel, cur, array, |
1100 | 0 | ScanDirectionIsForward(dir)); |
1101 | 0 | } |
1102 | 0 | } |
1103 | | |
1104 | | /* |
1105 | | * _bt_tuple_before_array_skeys() -- too early to advance required arrays? |
1106 | | * |
1107 | | * We always compare the tuple using the current array keys (which we assume |
1108 | | * are already set in so->keyData[]). readpagetup indicates if tuple is the |
1109 | | * scan's current _bt_readpage-wise tuple. |
1110 | | * |
1111 | | * readpagetup callers must only call here when _bt_check_compare already set |
1112 | | * continuescan=false. We help these callers deal with _bt_check_compare's |
1113 | | * inability to distinguish between the < and > cases (it uses equality |
1114 | | * operator scan keys, whereas we use 3-way ORDER procs). These callers pass |
1115 | | * a _bt_check_compare-set sktrig value that indicates which scan key |
1116 | | * triggered the call (!readpagetup callers just pass us sktrig=0 instead). |
1117 | | * This information allows us to avoid wastefully checking earlier scan keys |
1118 | | * that were already deemed to have been satisfied inside _bt_check_compare. |
1119 | | * |
1120 | | * Returns false when caller's tuple is >= the current required equality scan |
1121 | | * keys (or <=, in the case of backwards scans). This happens to readpagetup |
1122 | | * callers when the scan has reached the point of needing its array keys |
1123 | | * advanced; caller will need to advance required and non-required arrays at |
1124 | | * scan key offsets >= sktrig, plus scan keys < sktrig iff sktrig rolls over. |
1125 | | * (When we return false to readpagetup callers, tuple can only be == current |
1126 | | * required equality scan keys when caller's sktrig indicates that the arrays |
1127 | | * need to be advanced due to an unsatisfied required inequality key trigger.) |
1128 | | * |
1129 | | * Returns true when caller passes a tuple that is < the current set of |
1130 | | * equality keys for the most significant non-equal required scan key/column |
1131 | | * (or > the keys, during backwards scans). This happens to readpagetup |
1132 | | * callers when tuple is still before the start of matches for the scan's |
1133 | | * required equality strategy scan keys. (sktrig can't have indicated that an |
1134 | | * inequality strategy scan key wasn't satisfied in _bt_check_compare when we |
1135 | | * return true. In fact, we automatically return false when passed such an |
1136 | | * inequality sktrig by readpagetup callers -- _bt_check_compare's initial |
1137 | | * continuescan=false doesn't really need to be confirmed here by us.) |
1138 | | * |
1139 | | * !readpagetup callers optionally pass us *scanBehind, which tracks whether |
1140 | | * any missing truncated attributes might have affected array advancement |
1141 | | * (compared to what would happen if it was shown the first non-pivot tuple on |
1142 | | * the page to the right of caller's finaltup/high key tuple instead). It's |
1143 | | * only possible that we'll set *scanBehind to true when caller passes us a |
1144 | | * pivot tuple (with truncated -inf attributes) that we return false for. |
1145 | | */ |
1146 | | static bool |
1147 | | _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir, |
1148 | | IndexTuple tuple, TupleDesc tupdesc, int tupnatts, |
1149 | | bool readpagetup, int sktrig, bool *scanBehind) |
1150 | 0 | { |
1151 | 0 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
1152 | |
1153 | 0 | Assert(so->numArrayKeys); |
1154 | 0 | Assert(so->numberOfKeys); |
1155 | 0 | Assert(sktrig == 0 || readpagetup); |
1156 | 0 | Assert(!readpagetup || scanBehind == NULL); |
1157 | |
1158 | 0 | if (scanBehind) |
1159 | 0 | *scanBehind = false; |
1160 | |
1161 | 0 | for (int ikey = sktrig; ikey < so->numberOfKeys; ikey++) |
1162 | 0 | { |
1163 | 0 | ScanKey cur = so->keyData + ikey; |
1164 | 0 | Datum tupdatum; |
1165 | 0 | bool tupnull; |
1166 | 0 | int32 result; |
1167 | | |
1168 | | /* readpagetup calls require one ORDER proc comparison (at most) */ |
1169 | 0 | Assert(!readpagetup || ikey == sktrig); |
1170 | | |
1171 | | /* |
1172 | | * Once we reach a non-required scan key, we're completely done. |
1173 | | * |
1174 | | * Note: we deliberately don't consider the scan direction here. |
1175 | | * _bt_advance_array_keys caller requires that we track *scanBehind |
1176 | | * without concern for scan direction. |
1177 | | */ |
1178 | 0 | if ((cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) == 0) |
1179 | 0 | { |
1180 | 0 | Assert(!readpagetup); |
1181 | 0 | Assert(ikey > sktrig || ikey == 0); |
1182 | 0 | return false; |
1183 | 0 | } |
1184 | | |
1185 | 0 | if (cur->sk_attno > tupnatts) |
1186 | 0 | { |
1187 | 0 | Assert(!readpagetup); |
1188 | | |
1189 | | /* |
1190 | | * When we reach a high key's truncated attribute, assume that the |
1191 | | * tuple attribute's value is >= the scan's equality constraint |
1192 | | * scan keys (but set *scanBehind to let interested callers know |
1193 | | * that a truncated attribute might have affected our answer). |
1194 | | */ |
1195 | 0 | if (scanBehind) |
1196 | 0 | *scanBehind = true; |
1197 | |
1198 | 0 | return false; |
1199 | 0 | } |
1200 | | |
1201 | | /* |
1202 | | * Deal with inequality strategy scan keys that _bt_check_compare set |
1203 | | * continuescan=false for |
1204 | | */ |
1205 | 0 | if (cur->sk_strategy != BTEqualStrategyNumber) |
1206 | 0 | { |
1207 | | /* |
1208 | | * When _bt_check_compare indicated that a required inequality |
1209 | | * scan key wasn't satisfied, there's no need to verify anything; |
1210 | | * caller always calls _bt_advance_array_keys with this sktrig. |
1211 | | */ |
1212 | 0 | if (readpagetup) |
1213 | 0 | return false; |
1214 | | |
1215 | | /* |
1216 | | * Otherwise we can't give up, since we must check all required |
1217 | | * scan keys (required in either direction) in order to correctly |
1218 | | * track *scanBehind for caller |
1219 | | */ |
1220 | 0 | continue; |
1221 | 0 | } |
1222 | | |
1223 | 0 | tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); |
1224 | |
1225 | 0 | if (likely(!(cur->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL)))) |
1226 | 0 | { |
1227 | | /* Scankey has a valid/comparable sk_argument value */ |
1228 | 0 | result = _bt_compare_array_skey(&so->orderProcs[ikey], |
1229 | 0 | tupdatum, tupnull, |
1230 | 0 | cur->sk_argument, cur); |
1231 | |
1232 | 0 | if (result == 0) |
1233 | 0 | { |
1234 | | /* |
1235 | | * Interpret result in a way that takes NEXT/PRIOR into |
1236 | | * account |
1237 | | */ |
1238 | 0 | if (cur->sk_flags & SK_BT_NEXT) |
1239 | 0 | result = -1; |
1240 | 0 | else if (cur->sk_flags & SK_BT_PRIOR) |
1241 | 0 | result = 1; |
1242 | |
1243 | 0 | Assert(result == 0 || (cur->sk_flags & SK_BT_SKIP)); |
1244 | 0 | } |
1245 | 0 | } |
1246 | 0 | else |
1247 | 0 | { |
1248 | 0 | BTArrayKeyInfo *array = NULL; |
1249 | | |
1250 | | /* |
1251 | | * Current array element/array = scan key value is a sentinel |
1252 | | * value that represents the lowest (or highest) possible value |
1253 | | * that's still within the range of the array. |
1254 | | * |
1255 | | * Like _bt_first, we only see MINVAL keys during forwards scans |
1256 | | * (and similarly only see MAXVAL keys during backwards scans). |
1257 | | * Even if the scan's direction changes, we'll stop at some higher |
1258 | | * order key before we can ever reach any MAXVAL (or MINVAL) keys. |
1259 | | * (However, unlike _bt_first we _can_ get to keys marked either |
1260 | | * NEXT or PRIOR, regardless of the scan's current direction.) |
1261 | | */ |
1262 | 0 | Assert(ScanDirectionIsForward(dir) ? |
1263 | 0 | !(cur->sk_flags & SK_BT_MAXVAL) : |
1264 | 0 | !(cur->sk_flags & SK_BT_MINVAL)); |
1265 | | |
1266 | | /* |
1267 | | * There are no valid sk_argument values in MINVAL/MAXVAL keys. |
1268 | | * Check if tupdatum is within the range of skip array instead. |
1269 | | */ |
1270 | 0 | for (int arrayidx = 0; arrayidx < so->numArrayKeys; arrayidx++) |
1271 | 0 | { |
1272 | 0 | array = &so->arrayKeys[arrayidx]; |
1273 | 0 | if (array->scan_key == ikey) |
1274 | 0 | break; |
1275 | 0 | } |
1276 | |
1277 | 0 | _bt_binsrch_skiparray_skey(false, dir, tupdatum, tupnull, |
1278 | 0 | array, cur, &result); |
1279 | |
1280 | 0 | if (result == 0) |
1281 | 0 | { |
1282 | | /* |
1283 | | * tupdatum satisfies both low_compare and high_compare, so |
1284 | | * it's time to advance the array keys. |
1285 | | * |
1286 | | * Note: It's possible that the skip array will "advance" from |
1287 | | * its MINVAL (or MAXVAL) representation to an alternative, |
1288 | | * logically equivalent representation of the same value: a |
1289 | | * representation where the = key gets a valid datum in its |
1290 | | * sk_argument. This is only possible when low_compare uses |
1291 | | * the >= strategy (or high_compare uses the <= strategy). |
1292 | | */ |
1293 | 0 | return false; |
1294 | 0 | } |
1295 | 0 | } |
1296 | | |
1297 | | /* |
1298 | | * Does this comparison indicate that caller must _not_ advance the |
1299 | | * scan's arrays just yet? |
1300 | | */ |
1301 | 0 | if ((ScanDirectionIsForward(dir) && result < 0) || |
1302 | 0 | (ScanDirectionIsBackward(dir) && result > 0)) |
1303 | 0 | return true; |
1304 | | |
1305 | | /* |
1306 | | * Does this comparison indicate that caller should now advance the |
1307 | | * scan's arrays? (Must be if we get here during a readpagetup call.) |
1308 | | */ |
1309 | 0 | if (readpagetup || result != 0) |
1310 | 0 | { |
1311 | 0 | Assert(result != 0); |
1312 | 0 | return false; |
1313 | 0 | } |
1314 | | |
1315 | | /* |
1316 | | * Inconclusive -- need to check later scan keys, too. |
1317 | | * |
1318 | | * This must be a finaltup precheck, or a call made from an assertion. |
1319 | | */ |
1320 | 0 | Assert(result == 0); |
1321 | 0 | } |
1322 | | |
1323 | 0 | Assert(!readpagetup); |
1324 | |
1325 | 0 | return false; |
1326 | 0 | } |
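
For a forward scan over int equality keys, the three-way outcome described above reduces to the following self-contained sketch: report "tuple still before the array keys" only when the tuple is strictly below the current keys on the most significant unequal column. Hypothetical names; real code must also handle NULLs, scan direction, and inequality keys.

    static bool
    example_tuple_before_keys(const int *tupvals, const int *curkeys, int nkeys)
    {
        for (int i = 0; i < nkeys; i++)
        {
            if (tupvals[i] < curkeys[i])
                return true;        /* too early to advance the arrays */
            if (tupvals[i] > curkeys[i])
                return false;       /* time to advance the arrays */
            /* equal: inconclusive, check the next key */
        }
        return false;               /* tuple matches all current keys */
    }
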
1327 | | |
1328 | | /* |
1329 | | * _bt_start_prim_scan() -- start scheduled primitive index scan? |
1330 | | * |
1331 | | * Returns true if _bt_checkkeys scheduled another primitive index scan, just |
1332 | | * as the last one ended. Otherwise returns false, indicating that the array |
1333 | | * keys are now fully exhausted. |
1334 | | * |
1335 | | * Only call here during scans with one or more equality type array scan keys, |
1336 | | * after _bt_first or _bt_next return false. |
1337 | | */ |
1338 | | bool |
1339 | | _bt_start_prim_scan(IndexScanDesc scan, ScanDirection dir) |
1340 | 0 | { |
1341 | 0 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
1342 | |
|
1343 | 0 | Assert(so->numArrayKeys); |
1344 | |
|
1345 | 0 | so->scanBehind = so->oppositeDirCheck = false; /* reset */ |
1346 | | |
1347 | | /* |
1348 | | * Array keys are advanced within _bt_checkkeys when the scan reaches the |
1349 | | * leaf level (more precisely, they're advanced when the scan reaches the |
1350 | | * end of each distinct set of array elements). This process avoids |
1351 | | * repeat access to leaf pages (across multiple primitive index scans) by |
1352 | | * advancing the scan's array keys when it allows the primitive index scan |
1353 | | * to find nearby matching tuples (or when it eliminates ranges of array |
1354 | | * key space that can't possibly be satisfied by any index tuple). |
1355 | | * |
1356 | | * _bt_checkkeys sets a simple flag variable to schedule another primitive |
1357 | | * index scan. The flag tells us what to do. |
1358 | | * |
1359 | | * We cannot rely on _bt_first always reaching _bt_checkkeys. There are |
1360 | | * various cases where that won't happen. For example, if the index is |
1361 | | * completely empty, then _bt_first won't call _bt_readpage/_bt_checkkeys. |
1362 | | * We also don't expect a call to _bt_checkkeys during searches for a |
1363 | | * non-existent value that happens to be lower/higher than any existing |
1364 | | * value in the index. |
1365 | | * |
1366 | | * We don't require special handling for these cases -- we don't need to |
1367 | | * be explicitly instructed to _not_ perform another primitive index scan. |
1368 | | * It's up to code under the control of _bt_first to always set the flag |
1369 | | * when another primitive index scan will be required. |
1370 | | * |
1371 | | * This works correctly, even with the tricky cases listed above, which |
1372 | | * all involve access to leaf pages "near the boundaries of the key space" |
1373 | | * (whether it's from a leftmost/rightmost page, or an imaginary empty |
1374 | | * leaf root page). If _bt_checkkeys cannot be reached by a primitive |
1375 | | * index scan for one set of array keys, then it also won't be reached for |
1376 | | * any later set ("later" in terms of the direction that we scan the index |
1377 | | * and advance the arrays). The array keys won't have advanced in these |
1378 | | * cases, but that's the correct behavior (even _bt_advance_array_keys |
1379 | | * won't always advance the arrays at the point they become "exhausted"). |
1380 | | */ |
1381 | 0 | if (so->needPrimScan) |
1382 | 0 | { |
1383 | 0 | Assert(_bt_verify_arrays_bt_first(scan, dir)); |
1384 | | |
1385 | | /* |
1386 | | * Flag was set -- must call _bt_first again, which will reset the |
1387 | | * scan's needPrimScan flag |
1388 | | */ |
1389 | 0 | return true; |
1390 | 0 | } |
1391 | | |
1392 | | /* The top-level index scan ran out of tuples in this scan direction */ |
1393 | 0 | if (scan->parallel_scan != NULL) |
1394 | 0 | _bt_parallel_done(scan); |
1395 | |
1396 | 0 | return false; |
1397 | 0 | } |
1398 | | |
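As a rough sketch of the contract just described, a top-level caller in the style of btgettuple (nbtree.c) keeps performing primitive index scans until either a matching tuple is found or _bt_start_prim_scan reports that the array keys are exhausted. The function below is a simplified, assumption-laden model (example_gettuple_loop is an invented name), not the actual btgettuple code, which also deals with concerns such as killed tuples:

static bool
example_gettuple_loop(IndexScanDesc scan, ScanDirection dir)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;
	bool		res;

	/* Each loop iteration performs another primitive index scan */
	do
	{
		/* _bt_first starts a primitive scan; _bt_next continues one */
		if (!BTScanPosIsValid(so->currPos))
			res = _bt_first(scan, dir);
		else
			res = _bt_next(scan, dir);

		if (res)
			break;				/* found a matching tuple */

		/* Primitive scan exhausted -- did _bt_checkkeys schedule another? */
	} while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));

	return res;
}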
1399 | | /* |
1400 | | * _bt_advance_array_keys() -- Advance array elements using a tuple |
1401 | | * |
1402 | | * The scan always gets a new qual as a consequence of calling here (except |
1403 | | * when we determine that the top-level scan has run out of matching tuples). |
1404 | | * All later _bt_check_compare calls also use the same new qual that was first |
1405 | | * used here (at least until the next call here advances the keys once again). |
1406 | | * It's convenient to structure _bt_check_compare rechecks of caller's tuple |
1407 | | * (using the new qual) as one of the steps of advancing the scan's array keys,
1408 | | * so this function works as a wrapper around _bt_check_compare. |
1409 | | * |
1410 | | * Like _bt_check_compare, we'll set pstate.continuescan on behalf of the |
1411 | | * caller, and return a boolean indicating if caller's tuple satisfies the |
1412 | | * scan's new qual. But unlike _bt_check_compare, we set so->needPrimScan |
1413 | | * when we set continuescan=false, indicating if a new primitive index scan |
1414 | | * has been scheduled (otherwise, the top-level scan has run out of tuples in |
1415 | | * the current scan direction). |
1416 | | * |
1417 | | * Caller must use _bt_tuple_before_array_skeys to determine if the current |
1418 | | * place in the scan is >= the current array keys _before_ calling here. |
1419 | | * We're responsible for ensuring that caller's tuple is <= the newly advanced |
1420 | | * required array keys once we return. We try to find an exact match, but |
1421 | | * failing that we'll advance the array keys to whatever set of array elements |
1422 | | * comes next in the key space for the current scan direction. Required array |
1423 | | * keys "ratchet forwards" (or backwards). They can only advance as the scan |
1424 | | * itself advances through the index/key space. |
1425 | | * |
1426 | | * (The rules are the same for backwards scans, except that the operators are |
1427 | | * flipped: just replace the precondition's >= operator with a <=, and the |
1428 | | * postcondition's <= operator with a >=. In other words, just swap the |
1429 | | * precondition with the postcondition.) |
1430 | | * |
1431 | | * We also deal with "advancing" non-required arrays here (or arrays that are |
1432 | | * treated as non-required for the duration of a _bt_readpage call). Callers |
1433 | | * whose sktrig scan key is non-required specify sktrig_required=false. These |
1434 | | * calls are the only exception to the general rule about always advancing the |
1435 | | * required array keys (the scan may not even have a required array). These |
1436 | | * callers should just pass a NULL pstate (since there is never any question |
1437 | | * of stopping the scan). No call to _bt_tuple_before_array_skeys is required |
1438 | | * ahead of these calls (it's already clear that any required scan keys must |
1439 | | * be satisfied by caller's tuple). |
1440 | | * |
1441 | | * Note that we deal with non-array required equality strategy scan keys as |
1442 | | * degenerate single element arrays here. Obviously, they can never really |
1443 | | * advance in the way that real arrays can, but they must still affect how we |
1444 | | * advance real array scan keys (exactly like true array equality scan keys). |
1445 | | * We have to keep around a 3-way ORDER proc for these (using the "=" operator |
1446 | | * won't do), since in general whether the tuple is < or > _any_ unsatisfied |
1447 | | * required equality key influences how the scan's real arrays must advance. |
1448 | | * |
1449 | | * Note also that we may sometimes need to advance the array keys when the |
1450 | | * existing required array keys (and other required equality keys) are already |
1451 | | * an exact match for every corresponding value from caller's tuple. We must |
1452 | | * do this for inequalities that _bt_check_compare set continuescan=false for. |
1453 | | * They'll advance the array keys here, just like any other scan key that |
1454 | | * _bt_check_compare stops on. (This can even happen _after_ we advance the |
1455 | | * array keys, in which case we'll advance the array keys a second time. That |
1456 | | * way _bt_checkkeys caller always has its required arrays advance to the |
1457 | | * maximum possible extent that its tuple will allow.) |
1458 | | */ |
1459 | | static bool |
1460 | | _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, |
1461 | | IndexTuple tuple, int tupnatts, TupleDesc tupdesc, |
1462 | | int sktrig, bool sktrig_required) |
1463 | 0 | { |
1464 | 0 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
1465 | 0 | Relation rel = scan->indexRelation; |
1466 | 0 | ScanDirection dir = so->currPos.dir; |
1467 | 0 | int arrayidx = 0; |
1468 | 0 | bool beyond_end_advance = false, |
1469 | 0 | skip_array_advanced = false, |
1470 | 0 | has_required_opposite_direction_only = false, |
1471 | 0 | all_required_satisfied = true, |
1472 | 0 | all_satisfied = true; |
1473 | |
1474 | 0 | Assert(!so->needPrimScan && !so->scanBehind && !so->oppositeDirCheck); |
1475 | 0 | Assert(_bt_verify_keys_with_arraykeys(scan)); |
1476 | |
1477 | 0 | if (sktrig_required) |
1478 | 0 | { |
1479 | | /* |
1480 | | * Precondition array state assertion |
1481 | | */ |
1482 | 0 | Assert(!_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, |
1483 | 0 | tupnatts, false, 0, NULL)); |
1484 | | |
1485 | | /* |
1486 | | * Once we return we'll have a new set of required array keys, so |
1487 | | * reset state used by "look ahead" optimization |
1488 | | */ |
1489 | 0 | pstate->rechecks = 0; |
1490 | 0 | pstate->targetdistance = 0; |
1491 | 0 | } |
1492 | 0 | else if (sktrig < so->numberOfKeys - 1 && |
1493 | 0 | !(so->keyData[so->numberOfKeys - 1].sk_flags & SK_SEARCHARRAY)) |
1494 | 0 | { |
1495 | 0 | int least_sign_ikey = so->numberOfKeys - 1; |
1496 | 0 | bool continuescan; |
1497 | | |
1498 | | /* |
1499 | | * Optimization: perform a precheck of the least significant key |
1500 | | * during !sktrig_required calls when it isn't already our sktrig |
1501 | | * (provided the precheck key is not itself an array). |
1502 | | * |
1503 | | * When the precheck works out we'll avoid an expensive binary search |
1504 | | * of sktrig's array (plus any other arrays before least_sign_ikey). |
1505 | | */ |
1506 | 0 | Assert(so->keyData[sktrig].sk_flags & SK_SEARCHARRAY); |
1507 | 0 | if (!_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, false, |
1508 | 0 | false, &continuescan, |
1509 | 0 | &least_sign_ikey)) |
1510 | 0 | return false; |
1511 | 0 | } |
1512 | | |
1513 | 0 | for (int ikey = 0; ikey < so->numberOfKeys; ikey++) |
1514 | 0 | { |
1515 | 0 | ScanKey cur = so->keyData + ikey; |
1516 | 0 | BTArrayKeyInfo *array = NULL; |
1517 | 0 | Datum tupdatum; |
1518 | 0 | bool required = false, |
1519 | 0 | required_opposite_direction_only = false, |
1520 | 0 | tupnull; |
1521 | 0 | int32 result; |
1522 | 0 | int set_elem = 0; |
1523 | |
1524 | 0 | if (cur->sk_strategy == BTEqualStrategyNumber) |
1525 | 0 | { |
1526 | | /* Manage array state */ |
1527 | 0 | if (cur->sk_flags & SK_SEARCHARRAY) |
1528 | 0 | { |
1529 | 0 | array = &so->arrayKeys[arrayidx++]; |
1530 | 0 | Assert(array->scan_key == ikey); |
1531 | 0 | } |
1532 | 0 | } |
1533 | 0 | else |
1534 | 0 | { |
1535 | | /* |
1536 | | * Are any inequalities required in the opposite direction only |
1537 | | * present here? |
1538 | | */ |
1539 | 0 | if (((ScanDirectionIsForward(dir) && |
1540 | 0 | (cur->sk_flags & (SK_BT_REQBKWD))) || |
1541 | 0 | (ScanDirectionIsBackward(dir) && |
1542 | 0 | (cur->sk_flags & (SK_BT_REQFWD))))) |
1543 | 0 | has_required_opposite_direction_only = |
1544 | 0 | required_opposite_direction_only = true; |
1545 | 0 | } |
1546 | | |
1547 | | /* Optimization: skip over known-satisfied scan keys */ |
1548 | 0 | if (ikey < sktrig) |
1549 | 0 | continue; |
1550 | | |
1551 | 0 | if (cur->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) |
1552 | 0 | { |
1553 | 0 | required = true; |
1554 | |
1555 | 0 | if (cur->sk_attno > tupnatts) |
1556 | 0 | { |
1557 | | /* Set this just like _bt_tuple_before_array_skeys */ |
1558 | 0 | Assert(sktrig < ikey); |
1559 | 0 | so->scanBehind = true; |
1560 | 0 | } |
1561 | 0 | } |
1562 | | |
1563 | | /* |
1564 | | * Handle a required non-array scan key that the initial call to |
1565 | | * _bt_check_compare indicated triggered array advancement, if any. |
1566 | | * |
1567 | | * The non-array scan key's strategy will be <, <=, or = during a |
1568 | | * forwards scan (or any one of =, >=, or > during a backwards scan). |
1569 | | * It follows that the corresponding tuple attribute's value must now |
1570 | | * be either > or >= the scan key value (for backwards scans it must |
1571 | | * be either < or <= that value). |
1572 | | * |
1573 | | * If this is a required equality strategy scan key, this is just an |
1574 | | * optimization; _bt_tuple_before_array_skeys already confirmed that |
1575 | | * this scan key places us ahead of caller's tuple. There's no need |
1576 | | * to repeat that work now. (The same underlying principle also gets |
1577 | | * applied by the cur_elem_trig optimization used to speed up searches |
1578 | | * for the next array element.) |
1579 | | * |
1580 | | * If this is a required inequality strategy scan key, we _must_ rely |
1581 | | * on _bt_check_compare like this; we aren't capable of directly |
1582 | | * evaluating required inequality strategy scan keys here, on our own. |
1583 | | */ |
1584 | 0 | if (ikey == sktrig && !array) |
1585 | 0 | { |
1586 | 0 | Assert(sktrig_required && required && all_required_satisfied); |
1587 | | |
1588 | | /* Use "beyond end" advancement. See below for an explanation. */ |
1589 | 0 | beyond_end_advance = true; |
1590 | 0 | all_satisfied = all_required_satisfied = false; |
1591 | |
1592 | 0 | continue; |
1593 | 0 | } |
1594 | | |
1595 | | /* |
1596 | | * Nothing more for us to do with an inequality strategy scan key that |
1597 | | * wasn't the one that _bt_check_compare stopped on, though. |
1598 | | * |
1599 | | * Note: if our later call to _bt_check_compare (to recheck caller's |
1600 | | * tuple) sets continuescan=false due to finding this same inequality |
1601 | | * unsatisfied (possible when it's required in the scan direction), |
1602 | | * we'll deal with it via a recursive "second pass" call. |
1603 | | */ |
1604 | 0 | else if (cur->sk_strategy != BTEqualStrategyNumber) |
1605 | 0 | continue; |
1606 | | |
1607 | | /* |
1608 | | * Nothing for us to do with an equality strategy scan key that isn't |
1609 | | * marked required, either -- unless it's a non-required array |
1610 | | */ |
1611 | 0 | else if (!required && !array) |
1612 | 0 | continue; |
1613 | | |
1614 | | /* |
1615 | | * Here we perform steps for all array scan keys after a required |
1616 | | * array scan key whose binary search triggered "beyond end of array |
1617 | | * element" array advancement due to encountering a tuple attribute |
1618 | | * value > the closest matching array key (or < for backwards scans). |
1619 | | */ |
1620 | 0 | if (beyond_end_advance) |
1621 | 0 | { |
1622 | 0 | if (array) |
1623 | 0 | _bt_array_set_low_or_high(rel, cur, array, |
1624 | 0 | ScanDirectionIsBackward(dir)); |
1625 | |
1626 | 0 | continue; |
1627 | 0 | } |
1628 | | |
1629 | | /* |
1630 | | * Here we perform steps for all array scan keys after a required |
1631 | | * array scan key whose tuple attribute was < the closest matching |
1632 | | * array key when we dealt with it (or > for backwards scans). |
1633 | | * |
1634 | | * This earlier required array key already puts us ahead of caller's |
1635 | | * tuple in the key space (for the current scan direction). We must |
1636 | | * make sure that subsequent lower-order array keys do not put us too |
1637 | | * far ahead (ahead of tuples that have yet to be seen by our caller). |
1638 | | * For example, when a tuple "(a, b) = (42, 5)" advances the array |
1639 | | * keys on "a" from 40 to 45, we must also set "b" to whatever the |
1640 | | * first array element for "b" is. It would be wrong to allow "b" to |
1641 | | * be set based on the tuple value. |
1642 | | * |
1643 | | * Perform the same steps with truncated high key attributes. You can |
1644 | | * think of this as a "binary search" for the element closest to the |
1645 | | * value -inf. Again, the arrays must never get ahead of the scan. |
1646 | | */ |
1647 | 0 | if (!all_required_satisfied || cur->sk_attno > tupnatts) |
1648 | 0 | { |
1649 | 0 | if (array) |
1650 | 0 | _bt_array_set_low_or_high(rel, cur, array, |
1651 | 0 | ScanDirectionIsForward(dir)); |
1652 | |
1653 | 0 | continue; |
1654 | 0 | } |
1655 | | |
1656 | | /* |
1657 | | * Search in scankey's array for the corresponding tuple attribute |
1658 | | * value from caller's tuple |
1659 | | */ |
1660 | 0 | tupdatum = index_getattr(tuple, cur->sk_attno, tupdesc, &tupnull); |
1661 | |
1662 | 0 | if (array) |
1663 | 0 | { |
1664 | 0 | bool cur_elem_trig = (sktrig_required && ikey == sktrig); |
1665 | | |
1666 | | /* |
1667 | | * "Binary search" by checking if tupdatum/tupnull are within the |
1668 | | * range of the skip array |
1669 | | */ |
1670 | 0 | if (array->num_elems == -1) |
1671 | 0 | _bt_binsrch_skiparray_skey(cur_elem_trig, dir, |
1672 | 0 | tupdatum, tupnull, array, cur, |
1673 | 0 | &result); |
1674 | | |
1675 | | /* |
1676 | | * Binary search for the closest match from the SAOP array |
1677 | | */ |
1678 | 0 | else |
1679 | 0 | set_elem = _bt_binsrch_array_skey(&so->orderProcs[ikey], |
1680 | 0 | cur_elem_trig, dir, |
1681 | 0 | tupdatum, tupnull, array, cur, |
1682 | 0 | &result); |
1683 | 0 | } |
1684 | 0 | else |
1685 | 0 | { |
1686 | 0 | Assert(required); |
1687 | | |
1688 | | /* |
1689 | | * This is a required non-array equality strategy scan key, which |
1690 | | * we'll treat as a degenerate single element array. |
1691 | | * |
1692 | | * This scan key's imaginary "array" can't really advance, but it |
1693 | | * can still roll over like any other array. (Actually, this is |
1694 | | * no different to real single value arrays, which never advance |
1695 | | * without rolling over -- they can never truly advance, either.) |
1696 | | */ |
1697 | 0 | result = _bt_compare_array_skey(&so->orderProcs[ikey], |
1698 | 0 | tupdatum, tupnull, |
1699 | 0 | cur->sk_argument, cur); |
1700 | 0 | } |
1701 | | |
1702 | | /* |
1703 | | * Consider "beyond end of array element" array advancement. |
1704 | | * |
1705 | | * When the tuple attribute value is > the closest matching array key |
1706 | | * (or < in the backwards scan case), we need to ratchet this array |
1707 | | * forward (backward) by one increment, so that caller's tuple ends up |
1708 | | * being < final array value instead (or > final array value instead). |
1709 | | * This process has to work for all of the arrays, not just this one: |
1710 | | * it must "carry" to higher-order arrays when the set_elem that we |
1711 | | * just found happens to be the final one for the scan's direction. |
1712 | | * Incrementing (decrementing) set_elem itself isn't good enough. |
1713 | | * |
1714 | | * Our approach is to provisionally use set_elem as if it was an exact |
1715 | | * match now, then set each later/less significant array to whatever |
1716 | | * its final element is. Once outside the loop we'll then "increment |
1717 | | * this array's set_elem" by calling _bt_advance_array_keys_increment. |
1718 | | * That way the process rolls over to higher order arrays as needed. |
1719 | | * |
1720 | | * Under this scheme any required arrays only ever ratchet forwards |
1721 | | * (or backwards), and always do so to the maximum possible extent |
1722 | | * that we can know will be safe without seeing the scan's next tuple. |
1723 | | * We don't need any special handling for required scan keys that lack |
1724 | | * a real array to advance, nor for redundant scan keys that couldn't |
1725 | | * be eliminated by _bt_preprocess_keys. It won't matter if some of |
1726 | | * our "true" array scan keys (or even all of them) are non-required. |
1727 | | */ |
1728 | 0 | if (sktrig_required && required && |
1729 | 0 | ((ScanDirectionIsForward(dir) && result > 0) || |
1730 | 0 | (ScanDirectionIsBackward(dir) && result < 0))) |
1731 | 0 | beyond_end_advance = true; |
1732 | |
1733 | 0 | Assert(all_required_satisfied && all_satisfied); |
1734 | 0 | if (result != 0) |
1735 | 0 | { |
1736 | | /* |
1737 | | * Track whether caller's tuple satisfies our new post-advancement |
1738 | | * qual, for required scan keys, as well as for the entire set of |
1739 | | * interesting scan keys (all required scan keys plus non-required |
1740 | | * array scan keys are considered interesting.) |
1741 | | */ |
1742 | 0 | all_satisfied = false; |
1743 | 0 | if (sktrig_required && required) |
1744 | 0 | all_required_satisfied = false; |
1745 | 0 | else |
1746 | 0 | { |
1747 | | /* |
1748 | | * There's no need to advance the arrays using the best |
1749 | | * available match for a non-required array. Give up now. |
1750 | | * (Though note that sktrig_required calls still have to do |
1751 | | * all the usual post-advancement steps, including the recheck |
1752 | | * call to _bt_check_compare.) |
1753 | | */ |
1754 | 0 | break; |
1755 | 0 | } |
1756 | 0 | } |
1757 | | |
1758 | | /* Advance array keys, even when we don't have an exact match */ |
1759 | 0 | if (array) |
1760 | 0 | { |
1761 | 0 | if (array->num_elems == -1) |
1762 | 0 | { |
1763 | | /* Skip array's new element is tupdatum (or MINVAL/MAXVAL) */ |
1764 | 0 | _bt_skiparray_set_element(rel, cur, array, result, |
1765 | 0 | tupdatum, tupnull); |
1766 | 0 | skip_array_advanced = true; |
1767 | 0 | } |
1768 | 0 | else if (array->cur_elem != set_elem) |
1769 | 0 | { |
1770 | | /* SAOP array's new element is set_elem datum */ |
1771 | 0 | array->cur_elem = set_elem; |
1772 | 0 | cur->sk_argument = array->elem_values[set_elem]; |
1773 | 0 | } |
1774 | 0 | } |
1775 | 0 | } |
1776 | | |
1777 | | /* |
1778 | | * Advance the array keys incrementally whenever "beyond end of array |
1779 | | * element" array advancement happens, so that advancement will carry to |
1780 | | * higher-order arrays (might exhaust all the scan's arrays instead, which |
1781 | | * ends the top-level scan). |
1782 | | */ |
1783 | 0 | if (beyond_end_advance && |
1784 | 0 | !_bt_advance_array_keys_increment(scan, dir, &skip_array_advanced)) |
1785 | 0 | goto end_toplevel_scan; |
1786 | | |
1787 | 0 | Assert(_bt_verify_keys_with_arraykeys(scan)); |
1788 | | |
1789 | | /* |
1790 | | * Maintain a page-level count of the number of times the scan's array |
1791 | | * keys advanced in a way that affected at least one skip array |
1792 | | */ |
1793 | 0 | if (sktrig_required && skip_array_advanced) |
1794 | 0 | pstate->nskipadvances++; |
1795 | | |
1796 | | /* |
1797 | | * Does tuple now satisfy our new qual? Recheck with _bt_check_compare. |
1798 | | * |
1799 | | * Calls triggered by an unsatisfied required scan key, whose tuple now |
1800 | | * satisfies all required scan keys, but not all nonrequired array keys, |
1801 | | * will still require a recheck call to _bt_check_compare. They'll still |
1802 | | * need its "second pass" handling of required inequality scan keys. |
1803 | | * (Might have missed a still-unsatisfied required inequality scan key |
1804 | | * that caller didn't detect as the sktrig scan key during its initial |
1805 | | * _bt_check_compare call that used the old/original qual.) |
1806 | | * |
1807 | | * Calls triggered by an unsatisfied nonrequired array scan key never need |
1808 | | * "second pass" handling of required inequalities (nor any other handling |
1809 | | * of any required scan key). All that matters is whether caller's tuple |
1810 | | * satisfies the new qual, so it's safe to just skip the _bt_check_compare |
1811 | | * recheck when we've already determined that it can only return 'false'. |
1812 | | * |
1813 | | * Note: In practice most scan keys are marked required by preprocessing, |
1814 | | * if necessary by generating a preceding skip array. We nevertheless |
1815 | | * often handle array keys marked required as if they were nonrequired. |
1816 | | * This behavior is requested by our _bt_check_compare caller, though only |
1817 | | * when it is passed "forcenonrequired=true" by _bt_checkkeys. |
1818 | | */ |
1819 | 0 | if ((sktrig_required && all_required_satisfied) || |
1820 | 0 | (!sktrig_required && all_satisfied)) |
1821 | 0 | { |
1822 | 0 | int nsktrig = sktrig + 1; |
1823 | 0 | bool continuescan; |
1824 | |
1825 | 0 | Assert(all_required_satisfied); |
1826 | | |
1827 | | /* Recheck _bt_check_compare on behalf of caller */ |
1828 | 0 | if (_bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, false, |
1829 | 0 | !sktrig_required, &continuescan, |
1830 | 0 | &nsktrig) && |
1831 | 0 | !so->scanBehind) |
1832 | 0 | { |
1833 | | /* This tuple satisfies the new qual */ |
1834 | 0 | Assert(all_satisfied && continuescan); |
1835 | |
1836 | 0 | if (pstate) |
1837 | 0 | pstate->continuescan = true; |
1838 | |
1839 | 0 | return true; |
1840 | 0 | } |
1841 | | |
1842 | | /* |
1843 | | * Consider "second pass" handling of required inequalities. |
1844 | | * |
1845 | | * It's possible that our _bt_check_compare call indicated that the |
1846 | | * scan should end due to some unsatisfied inequality that wasn't |
1847 | | * initially recognized as such by us. Handle this by calling |
1848 | | * ourselves recursively, this time indicating that the trigger is the |
1849 | | * inequality that we missed first time around (and using a set of |
1850 | | * required array/equality keys that are now exact matches for tuple). |
1851 | | * |
1852 | | * We make a strong, general guarantee that every _bt_checkkeys call |
1853 | | * here will advance the array keys to the maximum possible extent |
1854 | | * that we can know to be safe based on caller's tuple alone. If we |
1855 | | * didn't perform this step, then that guarantee wouldn't quite hold. |
1856 | | */ |
1857 | 0 | if (unlikely(!continuescan)) |
1858 | 0 | { |
1859 | 0 | bool satisfied PG_USED_FOR_ASSERTS_ONLY; |
1860 | |
1861 | 0 | Assert(sktrig_required); |
1862 | 0 | Assert(so->keyData[nsktrig].sk_strategy != BTEqualStrategyNumber); |
1863 | | |
1864 | | /* |
1865 | | * The tuple must use "beyond end" advancement during the |
1866 | | * recursive call, so we cannot possibly end up back here when |
1867 | | * recursing. We'll consume a small, fixed amount of stack space. |
1868 | | */ |
1869 | 0 | Assert(!beyond_end_advance); |
1870 | | |
1871 | | /* Advance the array keys a second time using same tuple */ |
1872 | 0 | satisfied = _bt_advance_array_keys(scan, pstate, tuple, tupnatts, |
1873 | 0 | tupdesc, nsktrig, true); |
1874 | | |
1875 | | /* This tuple doesn't satisfy the inequality */ |
1876 | 0 | Assert(!satisfied); |
1877 | 0 | return false; |
1878 | 0 | } |
1879 | | |
1880 | | /* |
1881 | | * Some non-required scan key (from new qual) still not satisfied. |
1882 | | * |
1883 | | * All scan keys required in the current scan direction must still be |
1884 | | * satisfied, though, so we can trust all_required_satisfied below. |
1885 | | */ |
1886 | 0 | } |
1887 | | |
1888 | | /* |
1889 | | * When we were called just to deal with "advancing" non-required arrays, |
1890 | | * this is as far as we can go (cannot stop the scan for these callers) |
1891 | | */ |
1892 | 0 | if (!sktrig_required) |
1893 | 0 | { |
1894 | | /* Caller's tuple doesn't match any qual */ |
1895 | 0 | return false; |
1896 | 0 | } |
1897 | | |
1898 | | /* |
1899 | | * Postcondition array state assertion (for still-unsatisfied tuples). |
1900 | | * |
1901 | | * By here we have established that the scan's required arrays (scan must |
1902 | | * have at least one required array) advanced, without becoming exhausted. |
1903 | | * |
1904 | | * Caller's tuple is now < the newly advanced array keys (or > when this |
1905 | | * is a backwards scan), except in the case where we only got this far due |
1906 | | * to an unsatisfied non-required scan key. Verify that with an assert. |
1907 | | * |
1908 | | * Note: we don't just quit at this point when all required scan keys were |
1909 | | * found to be satisfied because we need to consider edge-cases involving |
1910 | | * scan keys required in the opposite direction only; those aren't tracked |
1911 | | * by all_required_satisfied. |
1912 | | */ |
1913 | 0 | Assert(_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, |
1914 | 0 | false, 0, NULL) == |
1915 | 0 | !all_required_satisfied); |
1916 | | |
1917 | | /* |
1918 | | * We generally permit primitive index scans to continue onto the next |
1919 | | * sibling page when the page's finaltup satisfies all required scan keys |
1920 | | * at the point where we're between pages. |
1921 | | * |
1922 | | * If caller's tuple is also the page's finaltup, and we see that required |
1923 | | * scan keys still aren't satisfied, start a new primitive index scan. |
1924 | | */ |
1925 | 0 | if (!all_required_satisfied && pstate->finaltup == tuple) |
1926 | 0 | goto new_prim_scan; |
1927 | | |
1928 | | /* |
1929 | | * Proactively check finaltup (don't wait until finaltup is reached by the |
1930 | | * scan) when it might well turn out to not be satisfied later on. |
1931 | | * |
1932 | | * Note: if so->scanBehind hasn't already been set for finaltup by us, |
1933 | | * it'll be set during this call to _bt_tuple_before_array_skeys. Either |
1934 | | * way, it'll be set correctly (for the whole page) after this point. |
1935 | | */ |
1936 | 0 | if (!all_required_satisfied && pstate->finaltup && |
1937 | 0 | _bt_tuple_before_array_skeys(scan, dir, pstate->finaltup, tupdesc, |
1938 | 0 | BTreeTupleGetNAtts(pstate->finaltup, rel), |
1939 | 0 | false, 0, &so->scanBehind)) |
1940 | 0 | goto new_prim_scan; |
1941 | | |
1942 | | /* |
1943 | | * When we encounter a truncated finaltup high key attribute, we're |
1944 | | * optimistic about the chances of its corresponding required scan key |
1945 | | * being satisfied when we go on to recheck it against tuples from this |
1946 | | * page's right sibling leaf page. We consider truncated attributes to be |
1947 | | * satisfied by required scan keys, which allows the primitive index scan |
1948 | | * to continue to the next leaf page. We must set so->scanBehind to true |
1949 | | * to remember that the last page's finaltup had "satisfied" required scan |
1950 | | * keys for one or more truncated attribute values (scan keys required in |
1951 | | * _either_ scan direction). |
1952 | | * |
1953 | | * There is a chance that _bt_readpage (which checks so->scanBehind) will |
1954 | | * find that even the sibling leaf page's finaltup is < the new array |
1955 | | * keys. When that happens, our optimistic policy will have incurred a |
1956 | | * single extra leaf page access that could have been avoided. |
1957 | | * |
1958 | | * A pessimistic policy would give backward scans a gratuitous advantage |
1959 | | * over forward scans. We'd punish forward scans for applying more |
1960 | | * accurate information from the high key, rather than just using the |
1961 | | * final non-pivot tuple as finaltup, in the style of backward scans. |
1962 | | * Being pessimistic would also give some scans with non-required arrays a |
1963 | | * perverse advantage over similar scans that use required arrays instead. |
1964 | | * |
1965 | | * This is similar to our scan-level heuristics, below. They also set |
1966 | | * scanBehind to speculatively continue the primscan onto the next page. |
1967 | | */ |
1968 | 0 | if (so->scanBehind) |
1969 | 0 | { |
1970 | | /* Truncated high key -- _bt_scanbehind_checkkeys recheck scheduled */ |
1971 | 0 | } |
1972 | | |
1973 | | /* |
1974 | | * Handle inequalities marked required in the opposite scan direction. |
1975 | | * They can also signal that we should start a new primitive index scan. |
1976 | | * |
1977 | | * It's possible that the scan is now positioned where "matching" tuples |
1978 | | * begin, and that caller's tuple satisfies all scan keys required in the |
1979 | | * current scan direction. But if caller's tuple still doesn't satisfy |
1980 | | * other scan keys that are required in the opposite scan direction only |
1981 | | * (e.g., a required >= strategy scan key when scan direction is forward), |
1982 | | * it's still possible that there are many leaf pages before the page that |
1983 | | * _bt_first could skip straight to. Groveling through all those pages |
1984 | | * will always give correct answers, but it can be very inefficient. We |
1985 | | * must avoid needlessly scanning extra pages. |
1986 | | * |
1987 | | * Separately, it's possible that _bt_check_compare set continuescan=false |
1988 | | * for a scan key that's required in the opposite direction only. This is |
1989 | | * a special case, that happens only when _bt_check_compare sees that the |
1990 | | * inequality encountered a NULL value. This signals the end of non-NULL |
1991 | | * values in the current scan direction, which is reason enough to end the |
1992 | | * (primitive) scan. If this happens at the start of a large group of |
1993 | | * NULL values, then we shouldn't expect to be called again until after |
1994 | | * the scan has already read indefinitely-many leaf pages full of tuples |
1995 | | * with NULL suffix values. (_bt_first is expected to skip over the group |
1996 | | * of NULLs by applying a similar "deduce NOT NULL" rule of its own, which |
1997 | | * involves consing up an explicit SK_SEARCHNOTNULL key.) |
1998 | | * |
1999 | | * Apply a test against finaltup to detect and recover from the problem: |
2000 | | * if even finaltup doesn't satisfy such an inequality, we just skip by |
2001 | | * starting a new primitive index scan. When we skip, we know for sure |
2002 | | * that all of the tuples on the current page following caller's tuple are |
2003 | | * also before the _bt_first-wise start of tuples for our new qual. That |
2004 | | * at least suggests many more skippable pages beyond the current page. |
2005 | | * (when so->scanBehind and so->oppositeDirCheck are set, this'll happen |
2006 | | * when we test the next page's finaltup/high key instead.) |
2007 | | */ |
2008 | 0 | else if (has_required_opposite_direction_only && pstate->finaltup && |
2009 | 0 | unlikely(!_bt_oppodir_checkkeys(scan, dir, pstate->finaltup))) |
2010 | 0 | { |
2011 | | /* |
2012 | | * Make sure that any SAOP arrays that were not marked required by |
2013 | | * preprocessing are reset to their first element for this direction |
2014 | | */ |
2015 | 0 | _bt_rewind_nonrequired_arrays(scan, dir); |
2016 | 0 | goto new_prim_scan; |
2017 | 0 | } |
2018 | | |
2019 | 0 | continue_scan: |
2020 | | |
2021 | | /* |
2022 | | * Stick with the ongoing primitive index scan for now. |
2023 | | * |
2024 | | * It's possible that later tuples will also turn out to have values that |
2025 | | * are still < the now-current array keys (or > the current array keys). |
2026 | | * Our caller will handle this by performing what amounts to a linear |
2027 | | * search of the page, implemented by calling _bt_check_compare and then |
2028 | | * _bt_tuple_before_array_skeys for each tuple. |
2029 | | * |
2030 | | * This approach has various advantages over a binary search of the page. |
2031 | | * Repeated binary searches of the page (one binary search for every array |
2032 | | * advancement) won't outperform a continuous linear search. While there |
2033 | | * are workloads that a naive linear search won't handle well, our caller |
2034 | | * has a "look ahead" fallback mechanism to deal with that problem. |
2035 | | */ |
2036 | 0 | pstate->continuescan = true; /* Override _bt_check_compare */ |
2037 | 0 | so->needPrimScan = false; /* _bt_readpage has more tuples to check */ |
2038 | |
2039 | 0 | if (so->scanBehind) |
2040 | 0 | { |
2041 | | /* |
2042 | | * Remember if recheck needs to call _bt_oppodir_checkkeys for next |
2043 | | * page's finaltup (see above comments about "Handle inequalities |
2044 | | * marked required in the opposite scan direction" for why). |
2045 | | */ |
2046 | 0 | so->oppositeDirCheck = has_required_opposite_direction_only; |
2047 | |
2048 | 0 | _bt_rewind_nonrequired_arrays(scan, dir); |
2049 | | |
2050 | | /* |
2051 | | * skip by setting "look ahead" mechanism's offnum for forwards scans |
2052 | | * (backwards scans check scanBehind flag directly instead) |
2053 | | */ |
2054 | 0 | if (ScanDirectionIsForward(dir)) |
2055 | 0 | pstate->skip = pstate->maxoff + 1; |
2056 | 0 | } |
2057 | | |
2058 | | /* Caller's tuple doesn't match the new qual */ |
2059 | 0 | return false; |
2060 | | |
2061 | 0 | new_prim_scan: |
2062 | |
2063 | 0 | Assert(pstate->finaltup); /* not on rightmost/leftmost page */ |
2064 | | |
2065 | | /* |
2066 | | * Looks like another primitive index scan is required. But consider |
2067 | | * continuing the current primscan based on scan-level heuristics. |
2068 | | * |
2069 | | * Continue the ongoing primitive scan (and schedule a recheck for when |
2070 | | * the scan arrives on the next sibling leaf page) when it has already |
2071 | | * read at least one leaf page before the one we're reading now. This |
2072 | | * makes primscan scheduling more efficient when scanning subsets of an |
2073 | | * index with many distinct attribute values matching many array elements. |
2074 | | * It encourages fewer, larger primitive scans where that makes sense. |
2075 | | * This will in turn encourage _bt_readpage to apply the pstate.startikey |
2076 | | * optimization more often. |
2077 | | * |
2078 | | * Also continue the ongoing primitive index scan when it is still on the |
2079 | | * first page if there have been more than NSKIPADVANCES_THRESHOLD calls |
2080 | | * here that each advanced at least one of the scan's skip arrays |
2081 | | * (deliberately ignore advancements that only affected SAOP arrays here). |
2082 | | * A page that cycles through this many skip array elements is quite |
2083 | | * likely to neighbor similar pages, that we'll also need to read. |
2084 | | * |
2085 | | * Note: These heuristics aren't as aggressive as you might think. We're |
2086 | | * conservative about allowing a primitive scan to step from the first |
2087 | | * leaf page it reads to the page's sibling page (we only allow it on |
2088 | | * first pages whose finaltup strongly suggests that it'll work out, as |
2089 | | * well as first pages that have a large number of skip array advances). |
2090 | | * Clearing this first page finaltup hurdle is a strong signal in itself. |
2091 | | * |
2092 | | * Note: The NSKIPADVANCES_THRESHOLD heuristic exists only to avoid |
2093 | | * pathological cases. Specifically, cases where a skip scan should just |
2094 | | * behave like a traditional full index scan, but ends up "skipping" again |
2095 | | * and again, descending to the prior leaf page's direct sibling leaf page |
2096 | | * each time. This misbehavior would otherwise be possible during scans |
2097 | | * that never quite manage to "clear the first page finaltup hurdle". |
2098 | | */ |
2099 | 0 | if (!pstate->firstpage || pstate->nskipadvances > NSKIPADVANCES_THRESHOLD) |
2100 | 0 | { |
2101 | | /* Schedule a recheck once on the next (or previous) page */ |
2102 | 0 | so->scanBehind = true; |
2103 | | |
2104 | | /* Continue the current primitive scan after all */ |
2105 | 0 | goto continue_scan; |
2106 | 0 | } |
2107 | | |
2108 | | /* |
2109 | | * End this primitive index scan, but schedule another. |
2110 | | * |
2111 | | * Note: We make a soft assumption that the current scan direction will |
2112 | | * also be used within _bt_next, when it is asked to step off this page. |
2113 | | * It is up to _bt_next to cancel this scheduled primitive index scan |
2114 | | * whenever it steps to a page in the direction opposite currPos.dir. |
2115 | | */ |
2116 | 0 | pstate->continuescan = false; /* Tell _bt_readpage we're done... */ |
2117 | 0 | so->needPrimScan = true; /* ...but call _bt_first again */ |
2118 | |
2119 | 0 | if (scan->parallel_scan) |
2120 | 0 | _bt_parallel_primscan_schedule(scan, so->currPos.currPage); |
2121 | | |
2122 | | /* Caller's tuple doesn't match the new qual */ |
2123 | 0 | return false; |
2124 | | |
2125 | 0 | end_toplevel_scan: |
2126 | | |
2127 | | /* |
2128 | | * End the current primitive index scan, but don't schedule another. |
2129 | | * |
2130 | | * This ends the entire top-level scan in the current scan direction. |
2131 | | * |
2132 | | * Note: The scan's arrays (including any non-required arrays) are now in |
2133 | | * their final positions for the current scan direction. If the scan |
2134 | | * direction happens to change, then the arrays will already be in their |
2135 | | * first positions for what will then be the current scan direction. |
2136 | | */ |
2137 | 0 | pstate->continuescan = false; /* Tell _bt_readpage we're done... */ |
2138 | 0 | so->needPrimScan = false; /* ...and don't call _bt_first again */ |
2139 | | |
2140 | | /* Caller's tuple doesn't match any qual */ |
2141 | 0 | return false; |
2142 | 0 | } |
2143 | | |
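The "carry" that _bt_advance_array_keys_increment performs for the beyond-end advancement path above behaves like an odometer over the scan's arrays. The standalone model below is a hedged sketch rather than PostgreSQL code: ExampleArray and example_arrays_increment are invented names, the arrays are ordered most significant first, and a forward scan direction is assumed.

typedef struct ExampleArray
{
	int			num_elems;		/* number of elements in this array */
	int			cur_elem;		/* index of the current element */
} ExampleArray;

/*
 * Ratchet the least significant array forward by one element, carrying
 * into more significant arrays on rollover.  Returns false when every
 * array was already at its final element (the arrays are exhausted,
 * which ends the top-level scan).
 */
static bool
example_arrays_increment(ExampleArray *arrays, int narrays)
{
	for (int i = narrays - 1; i >= 0; i--)
	{
		if (++arrays[i].cur_elem < arrays[i].num_elems)
			return true;		/* no carry required */
		arrays[i].cur_elem = 0; /* roll over, carry into next array */
	}

	return false;				/* every array rolled over */
}

For example, with elements {1, 2} on "a" and {10, 20} on "b", incrementing from (a=1, b=20) carries into (a=2, b=10), while incrementing from (a=2, b=20) reports exhaustion.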
2144 | | #ifdef USE_ASSERT_CHECKING |
2145 | | /* |
2146 | | * Verify that the scan's qual state matches what we expect at the point that |
2147 | | * _bt_start_prim_scan is about to start a just-scheduled new primitive scan. |
2148 | | * |
2149 | | * We enforce a rule against non-required array scan keys: they must start out |
2150 | | * with whatever element is the first for the scan's current scan direction. |
2151 | | * See _bt_rewind_nonrequired_arrays comments for an explanation. |
2152 | | */ |
2153 | | static bool |
2154 | | _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir) |
2155 | | { |
2156 | | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
2157 | | int arrayidx = 0; |
2158 | | |
2159 | | for (int ikey = 0; ikey < so->numberOfKeys; ikey++) |
2160 | | { |
2161 | | ScanKey cur = so->keyData + ikey; |
2162 | | BTArrayKeyInfo *array = NULL; |
2163 | | int first_elem_dir; |
2164 | | |
2165 | | if (!(cur->sk_flags & SK_SEARCHARRAY) || |
2166 | | cur->sk_strategy != BTEqualStrategyNumber) |
2167 | | continue; |
2168 | | |
2169 | | array = &so->arrayKeys[arrayidx++]; |
2170 | | |
2171 | | if (((cur->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || |
2172 | | ((cur->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) |
2173 | | continue; |
2174 | | |
2175 | | if (ScanDirectionIsForward(dir)) |
2176 | | first_elem_dir = 0; |
2177 | | else |
2178 | | first_elem_dir = array->num_elems - 1; |
2179 | | |
2180 | | if (array->cur_elem != first_elem_dir) |
2181 | | return false; |
2182 | | } |
2183 | | |
2184 | | return _bt_verify_keys_with_arraykeys(scan); |
2185 | | } |
2186 | | |
2187 | | /* |
2188 | | * Verify that the scan's "so->keyData[]" scan keys are in agreement with |
2189 | | * its array key state |
2190 | | */ |
2191 | | static bool |
2192 | | _bt_verify_keys_with_arraykeys(IndexScanDesc scan) |
2193 | | { |
2194 | | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
2195 | | int last_sk_attno = InvalidAttrNumber, |
2196 | | arrayidx = 0; |
2197 | | |
2198 | | if (!so->qual_ok) |
2199 | | return false; |
2200 | | |
2201 | | for (int ikey = 0; ikey < so->numberOfKeys; ikey++) |
2202 | | { |
2203 | | ScanKey cur = so->keyData + ikey; |
2204 | | BTArrayKeyInfo *array; |
2205 | | |
2206 | | if (cur->sk_strategy != BTEqualStrategyNumber || |
2207 | | !(cur->sk_flags & SK_SEARCHARRAY)) |
2208 | | continue; |
2209 | | |
2210 | | array = &so->arrayKeys[arrayidx++]; |
2211 | | if (array->scan_key != ikey) |
2212 | | return false; |
2213 | | |
2214 | | if (array->num_elems == 0 || array->num_elems < -1) |
2215 | | return false; |
2216 | | |
2217 | | if (array->num_elems != -1 && |
2218 | | cur->sk_argument != array->elem_values[array->cur_elem]) |
2219 | | return false; |
2220 | | if (last_sk_attno > cur->sk_attno) |
2221 | | return false; |
2222 | | last_sk_attno = cur->sk_attno; |
2223 | | } |
2224 | | |
2225 | | if (arrayidx != so->numArrayKeys) |
2226 | | return false; |
2227 | | |
2228 | | return true; |
2229 | | } |
2230 | | #endif |
2231 | | |
2232 | | /* |
2233 | | * Test whether an indextuple satisfies all the scankey conditions. |
2234 | | * |
2235 | | * Return true if so, false if not. If the tuple fails to pass the qual, |
2236 | | * we also determine whether there's any need to continue the scan beyond |
2237 | | * this tuple, and set pstate.continuescan accordingly. See comments for |
2238 | | * _bt_preprocess_keys() about how this is done. |
2239 | | * |
2240 | | * Forward scan callers can pass a high key tuple in the hopes of having |
2241 | | * us set *continuescan to false, and avoiding an unnecessary visit to |
2242 | | * the page to the right. |
2243 | | * |
2244 | | * Advances the scan's array keys when necessary for arrayKeys=true callers. |
2245 | | * Scans without any array keys must always pass arrayKeys=false. |
2246 | | * |
2247 | | * Also stops and starts primitive index scans for arrayKeys=true callers. |
2248 | | * Scans with array keys are required to set up page state that helps us with |
2249 | | * this. The page's finaltup tuple (the page high key for a forward scan, or |
2250 | | * the page's first non-pivot tuple for a backward scan) must be set in |
2251 | | * pstate.finaltup ahead of the first call here for the page. Set this to |
2252 | | * NULL for rightmost page (or the leftmost page for backwards scans). |
2253 | | * |
2254 | | * scan: index scan descriptor (containing a search-type scankey) |
2255 | | * pstate: page level input and output parameters |
2256 | | * arrayKeys: should we advance the scan's array keys if necessary? |
2257 | | * tuple: index tuple to test |
2258 | | * tupnatts: number of attributes in tuple (high key may be truncated)
2259 | | */ |
2260 | | bool |
2261 | | _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, bool arrayKeys, |
2262 | | IndexTuple tuple, int tupnatts) |
2263 | 0 | { |
2264 | 0 | TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); |
2265 | 0 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
2266 | 0 | ScanDirection dir = so->currPos.dir; |
2267 | 0 | int ikey = pstate->startikey; |
2268 | 0 | bool res; |
2269 | |
2270 | 0 | Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); |
2271 | 0 | Assert(!so->needPrimScan && !so->scanBehind && !so->oppositeDirCheck); |
2272 | 0 | Assert(arrayKeys || so->numArrayKeys == 0); |
2273 | |
2274 | 0 | res = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, arrayKeys, |
2275 | 0 | pstate->forcenonrequired, &pstate->continuescan, |
2276 | 0 | &ikey); |
2277 | | |
2278 | | /* |
2279 | | * If _bt_check_compare relied on the pstate.startikey optimization, call |
2280 | | * again (in assert-enabled builds) to verify it didn't affect our answer. |
2281 | | * |
2282 | | * Note: we can't do this when !pstate.forcenonrequired, since any arrays |
2283 | | * before pstate.startikey won't have advanced on this page at all. |
2284 | | */ |
2285 | 0 | Assert(!pstate->forcenonrequired || arrayKeys); |
2286 | | #ifdef USE_ASSERT_CHECKING |
2287 | | if (pstate->startikey > 0 && !pstate->forcenonrequired) |
2288 | | { |
2289 | | bool dres, |
2290 | | dcontinuescan; |
2291 | | int dikey = 0; |
2292 | | |
2293 | | /* Pass arrayKeys=false to avoid array side-effects */ |
2294 | | dres = _bt_check_compare(scan, dir, tuple, tupnatts, tupdesc, false, |
2295 | | pstate->forcenonrequired, &dcontinuescan, |
2296 | | &dikey); |
2297 | | Assert(res == dres); |
2298 | | Assert(pstate->continuescan == dcontinuescan); |
2299 | | |
2300 | | /* |
2301 | | * Should also get the same ikey result. We need a slightly weaker |
2302 | | * assertion during arrayKeys calls, since they might be using an |
2303 | | * array that couldn't be marked required during preprocessing. |
2304 | | */ |
2305 | | Assert(arrayKeys || ikey == dikey); |
2306 | | Assert(ikey <= dikey); |
2307 | | } |
2308 | | #endif |
2309 | | |
2310 | | /* |
2311 | | * Only one _bt_check_compare call is required in the common case where |
2312 | | * there are no equality strategy array scan keys. Otherwise we can only |
2313 | | * accept _bt_check_compare's answer unreservedly when it didn't set |
2314 | | * pstate.continuescan=false. |
2315 | | */ |
2316 | 0 | if (!arrayKeys || pstate->continuescan) |
2317 | 0 | return res; |
2318 | | |
2319 | | /* |
2320 | | * _bt_check_compare call set continuescan=false in the presence of |
2321 | | * equality type array keys. This could mean that the tuple is just past |
2322 | | * the end of matches for the current array keys. |
2323 | | * |
2324 | | * It's also possible that the scan is still _before_ the _start_ of |
2325 | | * tuples matching the current set of array keys. Check for that first. |
2326 | | */ |
2327 | 0 | Assert(!pstate->forcenonrequired); |
2328 | 0 | if (_bt_tuple_before_array_skeys(scan, dir, tuple, tupdesc, tupnatts, true, |
2329 | 0 | ikey, NULL)) |
2330 | 0 | { |
2331 | | /* Override _bt_check_compare, continue primitive scan */ |
2332 | 0 | pstate->continuescan = true; |
2333 | | |
2334 | | /* |
2335 | | * We will end up here repeatedly given a group of tuples > the |
2336 | | * previous array keys and < the now-current keys (for a backwards |
2337 | | * scan it's just the same, though the operators swap positions). |
2338 | | * |
2339 | | * We must avoid allowing this linear search process to scan very many |
2340 | | * tuples from well before the start of tuples matching the current |
2341 | | * array keys (or from well before the point where we'll once again |
2342 | | * have to advance the scan's array keys). |
2343 | | * |
2344 | | * We keep the overhead under control by speculatively "looking ahead" |
2345 | | * to later still-unscanned items from this same leaf page. We'll |
2346 | | * only attempt this once the number of tuples that the linear search |
2347 | | * process has examined starts to get out of hand. |
2348 | | */ |
2349 | 0 | pstate->rechecks++; |
2350 | 0 | if (pstate->rechecks >= LOOK_AHEAD_REQUIRED_RECHECKS) |
2351 | 0 | { |
2352 | | /* See if we should skip ahead within the current leaf page */ |
2353 | 0 | _bt_checkkeys_look_ahead(scan, pstate, tupnatts, tupdesc); |
2354 | | |
2355 | | /* |
2356 | | * Might have set pstate.skip to a later page offset. When that |
2357 | | * happens then _bt_readpage caller will inexpensively skip ahead |
2358 | | * to a later tuple from the same page (the one just after the |
2359 | | * tuple we successfully "looked ahead" to). |
2360 | | */ |
2361 | 0 | } |
2362 | | |
2363 | | /* This indextuple doesn't match the current qual, in any case */ |
2364 | 0 | return false; |
2365 | 0 | } |
2366 | | |
2367 | | /* |
2368 | | * Caller's tuple is >= the current set of array keys and other equality |
2369 | | * constraint scan keys (or <= if this is a backwards scan). It's now |
2370 | | * clear that we _must_ advance any required array keys in lockstep with |
2371 | | * the scan. |
2372 | | */ |
2373 | 0 | return _bt_advance_array_keys(scan, pstate, tuple, tupnatts, tupdesc, |
2374 | 0 | ikey, true); |
2375 | 0 | } |
2376 | | |
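A minimal sketch of the per-page driving loop implied by the contract above. The real caller is _bt_readpage in nbtsearch.c, which also honors pstate.skip for the look-ahead mechanism and walks the page in reverse for backward scans; example_readpage_forward is an invented name:

static void
example_readpage_forward(IndexScanDesc scan, BTReadPageState *pstate,
						 bool arrayKeys)
{
	/* caller must set pstate->finaltup (the page high key) beforehand */
	for (OffsetNumber offnum = pstate->minoff;
		 offnum <= pstate->maxoff;
		 offnum = OffsetNumberNext(offnum))
	{
		ItemId		iid = PageGetItemId(pstate->page, offnum);
		IndexTuple	tuple = (IndexTuple) PageGetItem(pstate->page, iid);
		int			tupnatts = BTreeTupleGetNAtts(tuple, scan->indexRelation);

		if (_bt_checkkeys(scan, pstate, arrayKeys, tuple, tupnatts))
		{
			/* tuple satisfies the (possibly just-advanced) qual */
		}

		if (!pstate->continuescan)
			break;				/* end of matches for this primitive scan */
	}
}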
2377 | | /* |
2378 | | * Test whether caller's finaltup tuple is still before the start of matches |
2379 | | * for the current array keys. |
2380 | | * |
2381 | | * Called at the start of reading a page during a scan with array keys, though |
2382 | | * only when the so->scanBehind flag was set on the scan's prior page. |
2383 | | * |
2384 | | * Returns false if the tuple is still before the start of matches. When that |
2385 | | * happens, caller should cut its losses and start a new primitive index scan. |
2386 | | * Otherwise returns true. |
2387 | | */ |
2388 | | bool |
2389 | | _bt_scanbehind_checkkeys(IndexScanDesc scan, ScanDirection dir, |
2390 | | IndexTuple finaltup) |
2391 | 0 | { |
2392 | 0 | Relation rel = scan->indexRelation; |
2393 | 0 | TupleDesc tupdesc = RelationGetDescr(rel); |
2394 | 0 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
2395 | 0 | int nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel); |
2396 | 0 | bool scanBehind; |
2397 | |
2398 | 0 | Assert(so->numArrayKeys); |
2399 | |
2400 | 0 | if (_bt_tuple_before_array_skeys(scan, dir, finaltup, tupdesc, |
2401 | 0 | nfinaltupatts, false, 0, &scanBehind)) |
2402 | 0 | return false; |
2403 | | |
2404 | | /* |
2405 | | * If scanBehind was set, all of the untruncated attribute values from |
2406 | | * finaltup that correspond to an array match the array's current element, |
2407 | | * but there are other keys associated with truncated suffix attributes. |
2408 | | * Array advancement must have incremented the scan's arrays on the |
2409 | | * previous page, resulting in a set of array keys that happen to be an |
2410 | | * exact match for the current page high key's untruncated prefix values. |
2411 | | * |
2412 | | * This page definitely doesn't contain tuples that the scan will need to |
2413 | | * return. The next page may or may not contain relevant tuples. Handle |
2414 | | * this by cutting our losses and starting a new primscan. |
2415 | | */ |
2416 | 0 | if (scanBehind) |
2417 | 0 | return false; |
2418 | | |
2419 | 0 | if (!so->oppositeDirCheck) |
2420 | 0 | return true; |
2421 | | |
2422 | 0 | return _bt_oppodir_checkkeys(scan, dir, finaltup); |
2423 | 0 | } |
2424 | | |
2425 | | /* |
2426 | | * Test whether an indextuple fails to satisfy an inequality required in the |
2427 | | * opposite direction only. |
2428 | | * |
2429 | | * Caller's finaltup tuple is the page high key (for forwards scans), or the |
2430 | | * first non-pivot tuple (for backwards scans). Called during scans with |
2431 | | * required array keys and required opposite-direction inequalities. |
2432 | | * |
2433 | | * Returns false if an inequality scan key required in the opposite direction |
2434 | | * only isn't satisfied (and any earlier required scan keys are satisfied). |
2435 | | * Otherwise returns true. |
2436 | | * |
2437 | | * An unsatisfied inequality required in the opposite direction only might |
2438 | | * well enable skipping over many leaf pages, provided another _bt_first call |
2439 | | * takes place. This type of unsatisfied inequality won't usually cause |
2440 | | * _bt_checkkeys to stop the scan to consider array advancement/starting a new |
2441 | | * primitive index scan. |
2442 | | */ |
2443 | | static bool |
2444 | | _bt_oppodir_checkkeys(IndexScanDesc scan, ScanDirection dir, |
2445 | | IndexTuple finaltup) |
2446 | 0 | { |
2447 | 0 | Relation rel = scan->indexRelation; |
2448 | 0 | TupleDesc tupdesc = RelationGetDescr(rel); |
2449 | 0 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
2450 | 0 | int nfinaltupatts = BTreeTupleGetNAtts(finaltup, rel); |
2451 | 0 | bool continuescan; |
2452 | 0 | ScanDirection flipped = -dir; |
2453 | 0 | int ikey = 0; |
2454 | |
2455 | 0 | Assert(so->numArrayKeys); |
2456 | |
2457 | 0 | _bt_check_compare(scan, flipped, finaltup, nfinaltupatts, tupdesc, false, |
2458 | 0 | false, &continuescan, |
2459 | 0 | &ikey); |
2460 | |
2461 | 0 | if (!continuescan && so->keyData[ikey].sk_strategy != BTEqualStrategyNumber) |
2462 | 0 | return false; |
2463 | | |
2464 | 0 | return true; |
2465 | 0 | } |
2466 | | |
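An illustrative aside on the flipped-direction trick above: ScanDirection (access/sdir.h) is an integer enum with BackwardScanDirection = -1 and ForwardScanDirection = 1, so unary negation swaps the two. That is what lets _bt_oppodir_checkkeys reuse _bt_check_compare: with the direction flipped, keys required only in the opposite direction get evaluated as though required in the current one. A minimal sketch (example_flip is an invented name):

static inline ScanDirection
example_flip(ScanDirection dir)
{
	Assert(dir == ForwardScanDirection || dir == BackwardScanDirection);

	/* -1 becomes +1 and vice versa, matching "flipped = -dir" above */
	return (ScanDirection) -dir;
}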
2467 | | /* |
2468 | | * Determines an offset to the first scan key (an so->keyData[]-wise offset) |
2469 | | * that is _not_ guaranteed to be satisfied by every tuple from pstate.page, |
2470 | | * which is set in pstate.startikey for _bt_checkkeys calls for the page. |
2471 | | * This allows caller to save cycles on comparisons of a prefix of keys while |
2472 | | * reading pstate.page. |
2473 | | * |
2474 | | * Also determines if later calls to _bt_checkkeys (for pstate.page) should be |
2475 | | * forced to treat all required scan keys >= pstate.startikey as nonrequired |
2476 | | * (that is, if they're to be treated as if any SK_BT_REQFWD/SK_BT_REQBKWD |
2477 | | * markings that were set by preprocessing were not set at all, for the |
2478 | | * duration of _bt_checkkeys calls prior to the call for pstate.finaltup). |
2479 | | * This is indicated to caller by setting pstate.forcenonrequired. |
2480 | | * |
2481 | | * Call here at the start of reading a leaf page beyond the first one for the |
2482 | | * primitive index scan. We consider all non-pivot tuples, so it doesn't make |
2483 | | * sense to call here when only a subset of those tuples can ever be read. |
2484 | | * This is also a good idea on performance grounds; not calling here when on |
2485 | | * the first page (first for the current primitive scan) avoids wasting cycles |
2486 | | * during selective point queries. They typically don't stand to gain as much |
2487 | | * when we can set pstate.startikey, and are likely to notice the overhead of |
2488 | | * calling here. (Also, allowing pstate.forcenonrequired to be set on a |
2489 | | * primscan's first page would mislead _bt_advance_array_keys, which expects |
2490 | | * pstate.nskipadvances to be representative of every first page's key space.) |
2491 | | * |
2492 | | * Caller must call _bt_start_array_keys and reset startikey/forcenonrequired |
2493 | | * ahead of the finaltup _bt_checkkeys call when we set forcenonrequired=true. |
2494 | | * This will give _bt_checkkeys the opportunity to call _bt_advance_array_keys |
2495 | | * with sktrig_required=true, restoring the invariant that the scan's required |
2496 | | * arrays always track the scan's progress through the index's key space. |
2497 | | * Caller won't need to do this on the rightmost/leftmost page in the index |
2498 | | * (where pstate.finaltup isn't ever set), since forcenonrequired will never |
2499 | | * be set here in the first place. |
2500 | | */ |
2501 | | void |
2502 | | _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) |
2503 | 0 | { |
2504 | 0 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
2505 | 0 | Relation rel = scan->indexRelation; |
2506 | 0 | TupleDesc tupdesc = RelationGetDescr(rel); |
2507 | 0 | ItemId iid; |
2508 | 0 | IndexTuple firsttup, |
2509 | 0 | lasttup; |
2510 | 0 | int startikey = 0, |
2511 | 0 | arrayidx = 0, |
2512 | 0 | firstchangingattnum; |
2513 | 0 | bool start_past_saop_eq = false; |
2514 | |
2515 | 0 | Assert(!so->scanBehind); |
2516 | 0 | Assert(pstate->minoff < pstate->maxoff); |
2517 | 0 | Assert(!pstate->firstpage); |
2518 | 0 | Assert(pstate->startikey == 0); |
2519 | 0 | Assert(!so->numArrayKeys || pstate->finaltup || |
2520 | 0 | P_RIGHTMOST(BTPageGetOpaque(pstate->page)) || |
2521 | 0 | P_LEFTMOST(BTPageGetOpaque(pstate->page))); |
2522 | |
2523 | 0 | if (so->numberOfKeys == 0) |
2524 | 0 | return; |
2525 | | |
2526 | | /* minoff is an offset to the lowest non-pivot tuple on the page */ |
2527 | 0 | iid = PageGetItemId(pstate->page, pstate->minoff); |
2528 | 0 | firsttup = (IndexTuple) PageGetItem(pstate->page, iid); |
2529 | | |
2530 | | /* maxoff is an offset to the highest non-pivot tuple on the page */ |
2531 | 0 | iid = PageGetItemId(pstate->page, pstate->maxoff); |
2532 | 0 | lasttup = (IndexTuple) PageGetItem(pstate->page, iid); |
2533 | | |
2534 | | /* Determine the first attribute whose values change on caller's page */ |
2535 | 0 | firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup); |
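
	/*
	 * Editorial note (not part of nbtutils.c): _bt_keep_natts_fast returns
	 * the attribute number of the first column whose value can differ
	 * between the page's extreme tuples.  For example, if firsttup is
	 * (5, 'a') and lasttup is (5, 'z'), firstchangingattnum is 2: attribute
	 * 1 is constant across the whole page, while attribute 2 is the first
	 * that can vary.
	 */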
2536 | |
2537 | 0 | for (; startikey < so->numberOfKeys; startikey++) |
2538 | 0 | { |
2539 | 0 | ScanKey key = so->keyData + startikey; |
2540 | 0 | BTArrayKeyInfo *array; |
2541 | 0 | Datum firstdatum, |
2542 | 0 | lastdatum; |
2543 | 0 | bool firstnull, |
2544 | 0 | lastnull; |
2545 | 0 | int32 result; |
2546 | | |
2547 | | /* |
2548 | | * Determine if it's safe to set pstate.startikey to an offset to a |
2549 | | * key that comes after this key, by examining this key |
2550 | | */ |
2551 | 0 | if (!(key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))) |
2552 | 0 | { |
2553 | | /* Scan key isn't marked required (corner case) */ |
2554 | 0 | Assert(!(key->sk_flags & SK_ROW_HEADER)); |
2555 | 0 | break; /* unsafe */ |
2556 | 0 | } |
2557 | 0 | if (key->sk_flags & SK_ROW_HEADER) |
2558 | 0 | { |
2559 | | /* |
2560 | | * RowCompare inequality. |
2561 | | * |
2562 | | * Only the first subkey from a RowCompare can ever be marked |
2563 | | * required (that happens when the row header is marked required). |
2564 | | * There is no simple, general way for us to transitively deduce |
2565 | | * whether or not every tuple on the page satisfies a RowCompare |
2566 | | * key based only on firsttup and lasttup -- so we just give up. |
2567 | | */ |
2568 | 0 | if (!start_past_saop_eq && !so->skipScan) |
2569 | 0 | break; /* unsafe to go further */ |
2570 | | |
2571 | | /* |
2572 | | * We have to be even more careful with RowCompares that come |
2573 | | * after an array: we assume it's unsafe to even bypass the array. |
2574 | | * Calling _bt_start_array_keys to recover the scan's arrays |
2575 | | * following use of forcenonrequired mode isn't compatible with |
2576 | | * _bt_check_rowcompare's continuescan=false behavior with NULL |
2577 | | * row compare members. _bt_advance_array_keys must not make a |
2578 | | * decision on the basis of a key not being satisfied in the |
2579 | | * opposite-to-scan direction until the scan reaches a leaf page |
2580 | | * where the same key begins to be satisfied in scan direction. |
2581 | | * The _bt_first !used_all_subkeys behavior makes this limitation |
2582 | | * hard to work around some other way. |
2583 | | */ |
2584 | 0 | return; /* completely unsafe to set pstate.startikey */ |
2585 | 0 | } |
2586 | 0 | if (key->sk_strategy != BTEqualStrategyNumber) |
2587 | 0 | { |
2588 | | /* |
2589 | | * Scalar inequality key. |
2590 | | * |
2591 | | * It's definitely safe for _bt_checkkeys to avoid assessing this |
2592 | | * inequality when the page's first and last non-pivot tuples both |
2593 | | * satisfy the inequality (since the same must also be true of all |
2594 | | * the tuples in between these two). |
2595 | | * |
2596 | | * Unlike the "=" case, it doesn't matter if this attribute has |
2597 | | * more than one distinct value (though it _is_ necessary for any |
2598 | | * and all _prior_ attributes to contain no more than one distinct |
2599 | | * value amongst all of the tuples from pstate.page). |
2600 | | */ |
2601 | 0 | if (key->sk_attno > firstchangingattnum) /* >, not >= */ |
2602 | 0 | break; /* unsafe, preceding attr has multiple |
2603 | | * distinct values */ |
2604 | | |
2605 | 0 | firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, &firstnull); |
2606 | 0 | lastdatum = index_getattr(lasttup, key->sk_attno, tupdesc, &lastnull); |
2607 | |
2608 | 0 | if (key->sk_flags & SK_ISNULL) |
2609 | 0 | { |
2610 | | /* IS NOT NULL key */ |
2611 | 0 | Assert(key->sk_flags & SK_SEARCHNOTNULL); |
2612 | |
2613 | 0 | if (firstnull || lastnull) |
2614 | 0 | break; /* unsafe */ |
2615 | | |
2616 | | /* Safe, IS NOT NULL key satisfied by every tuple */ |
2617 | 0 | continue; |
2618 | 0 | } |
2619 | | |
2620 | | /* Test firsttup */ |
2621 | 0 | if (firstnull || |
2622 | 0 | !DatumGetBool(FunctionCall2Coll(&key->sk_func, |
2623 | 0 | key->sk_collation, firstdatum, |
2624 | 0 | key->sk_argument))) |
2625 | 0 | break; /* unsafe */ |
2626 | | |
2627 | | /* Test lasttup */ |
2628 | 0 | if (lastnull || |
2629 | 0 | !DatumGetBool(FunctionCall2Coll(&key->sk_func, |
2630 | 0 | key->sk_collation, lastdatum, |
2631 | 0 | key->sk_argument))) |
2632 | 0 | break; /* unsafe */ |
2633 | | |
2634 | | /* Safe, scalar inequality satisfied by every tuple */ |
2635 | 0 | continue; |
2636 | 0 | } |
2637 | | |
2638 | | /* Some = key (could be a scalar = key, could be an array = key) */ |
2639 | 0 | Assert(key->sk_strategy == BTEqualStrategyNumber); |
2640 | |
2641 | 0 | if (!(key->sk_flags & SK_SEARCHARRAY)) |
2642 | 0 | { |
2643 | | /* |
2644 | | * Scalar = key (possibly an IS NULL key). |
2645 | | * |
2646 | | * It is unsafe to set pstate.startikey to an ikey beyond this |
2647 | | * key, unless the = key is satisfied by every possible tuple on |
2648 | | * the page (possible only when attribute has just one distinct |
2649 | | * value among all tuples on the page). |
2650 | | */ |
2651 | 0 | if (key->sk_attno >= firstchangingattnum) |
2652 | 0 | break; /* unsafe, multiple distinct attr values */ |
2653 | | |
2654 | 0 | firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, |
2655 | 0 | &firstnull); |
2656 | 0 | if (key->sk_flags & SK_ISNULL) |
2657 | 0 | { |
2658 | | /* IS NULL key */ |
2659 | 0 | Assert(key->sk_flags & SK_SEARCHNULL); |
2660 | |
2661 | 0 | if (!firstnull) |
2662 | 0 | break; /* unsafe */ |
2663 | | |
2664 | | /* Safe, IS NULL key satisfied by every tuple */ |
2665 | 0 | continue; |
2666 | 0 | } |
2667 | 0 | if (firstnull || |
2668 | 0 | !DatumGetBool(FunctionCall2Coll(&key->sk_func, |
2669 | 0 | key->sk_collation, firstdatum, |
2670 | 0 | key->sk_argument))) |
2671 | 0 | break; /* unsafe */ |
2672 | | |
2673 | | /* Safe, scalar = key satisfied by every tuple */ |
2674 | 0 | continue; |
2675 | 0 | } |
2676 | | |
2677 | | /* = array key (could be a SAOP array, could be a skip array) */ |
2678 | 0 | array = &so->arrayKeys[arrayidx++]; |
2679 | 0 | Assert(array->scan_key == startikey); |
2680 | 0 | if (array->num_elems != -1) |
2681 | 0 | { |
2682 | | /* |
2683 | | * SAOP array = key. |
2684 | | * |
2685 | | * Handle this like we handle scalar = keys (though we binary search
2686 | | * for a matching element, to avoid relying on the key's sk_argument).
2687 | | */ |
2688 | 0 | if (key->sk_attno >= firstchangingattnum) |
2689 | 0 | break; /* unsafe, multiple distinct attr values */ |
2690 | | |
2691 | 0 | firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, |
2692 | 0 | &firstnull); |
2693 | 0 | _bt_binsrch_array_skey(&so->orderProcs[startikey], |
2694 | 0 | false, NoMovementScanDirection, |
2695 | 0 | firstdatum, firstnull, array, key, |
2696 | 0 | &result); |
2697 | 0 | if (result != 0) |
2698 | 0 | break; /* unsafe */ |
2699 | | |
2700 | | /* Safe, SAOP = key satisfied by every tuple */ |
2701 | 0 | start_past_saop_eq = true; |
2702 | 0 | continue; |
2703 | 0 | } |
2704 | | |
2705 | | /* |
2706 | | * Skip array = key |
2707 | | */ |
2708 | 0 | Assert(key->sk_flags & SK_BT_SKIP); |
2709 | 0 | if (array->null_elem) |
2710 | 0 | { |
2711 | | /* |
2712 | | * Non-range skip array = key. |
2713 | | * |
2714 | | * Safe, non-range skip array "satisfied" by every tuple on page |
2715 | | * (safe even when "key->sk_attno > firstchangingattnum"). |
2716 | | */ |
2717 | 0 | continue; |
2718 | 0 | } |
2719 | | |
2720 | | /* |
2721 | | * Range skip array = key. |
2722 | | * |
2723 | | * Handle this like we handle scalar inequality keys (but avoid using |
2724 | | * key's sk_argument directly, as in the SAOP array case). |
2725 | | */ |
2726 | 0 | if (key->sk_attno > firstchangingattnum) /* >, not >= */ |
2727 | 0 | break; /* unsafe, preceding attr has multiple |
2728 | | * distinct values */ |
2729 | | |
2730 | 0 | firstdatum = index_getattr(firsttup, key->sk_attno, tupdesc, &firstnull); |
2731 | 0 | lastdatum = index_getattr(lasttup, key->sk_attno, tupdesc, &lastnull); |
2732 | | |
2733 | | /* Test firsttup */ |
2734 | 0 | _bt_binsrch_skiparray_skey(false, ForwardScanDirection, |
2735 | 0 | firstdatum, firstnull, array, key, |
2736 | 0 | &result); |
2737 | 0 | if (result != 0) |
2738 | 0 | break; /* unsafe */ |
2739 | | |
2740 | | /* Test lasttup */ |
2741 | 0 | _bt_binsrch_skiparray_skey(false, ForwardScanDirection, |
2742 | 0 | lastdatum, lastnull, array, key, |
2743 | 0 | &result); |
2744 | 0 | if (result != 0) |
2745 | 0 | break; /* unsafe */ |
2746 | | |
2747 | | /* Safe, range skip array satisfied by every tuple on page */ |
2748 | 0 | } |
2749 | | |
2750 | | /* |
2751 | | * Use of forcenonrequired is typically undesirable, since it'll force |
2752 | | * _bt_readpage caller to read every tuple on the page -- even though, in |
2753 | | * general, it might well be possible to end the scan on an earlier tuple. |
2754 | | * However, caller must use forcenonrequired when start_past_saop_eq=true, |
2755 | | * since the usual required array behavior might fail to roll over to the |
2756 | | * SAOP array. |
2757 | | * |
2758 | | * We always prefer forcenonrequired=true during scans with skip arrays |
2759 | | * (except on the first page of each primitive index scan), though -- even |
2760 | | * when "startikey == 0". That way, _bt_advance_array_keys's low-order |
2761 | | * key precheck optimization can always be used (unless on the first page |
2762 | | * of the scan). It seems slightly preferable to check more tuples when |
2763 | | * that allows us to do significantly less skip array maintenance. |
2764 | | */ |
2765 | 0 | pstate->forcenonrequired = (start_past_saop_eq || so->skipScan); |
2766 | 0 | pstate->startikey = startikey; |
2767 | | |
2768 | | /* |
2769 | | * _bt_readpage caller is required to call _bt_checkkeys against page's |
2770 | | * finaltup with forcenonrequired=false whenever we initially set |
2771 | | * forcenonrequired=true. That way the scan's arrays will reliably track |
2772 | | * its progress through the index's key space. |
2773 | | * |
2774 | | * We don't expect this when _bt_readpage caller has no finaltup due to |
2775 | | * its page being the rightmost (or the leftmost, during backwards scans). |
2776 | | * When we see that _bt_readpage has no finaltup, back out of everything. |
2777 | | */ |
2778 | 0 | Assert(!pstate->forcenonrequired || so->numArrayKeys); |
2779 | 0 | if (pstate->forcenonrequired && !pstate->finaltup) |
2780 | 0 | { |
2781 | 0 | pstate->forcenonrequired = false; |
2782 | 0 | pstate->startikey = 0; |
2783 | 0 | } |
2784 | 0 | } |
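
/*
 * Editorial sketch (not part of nbtutils.c): the endpoint argument that
 * _bt_set_startikey relies on, restated over a plain sorted array.  Tuples
 * on a leaf page are ordered, so a monotone predicate (an inequality like
 * "v <= bound" or "v >= bound") that holds for both the first and the last
 * element must hold for every element in between; per-tuple rechecks of such
 * a key can then be skipped.  All names below are illustrative only.
 */
static inline bool
example_all_satisfy_inequality(const int *sorted, int nelems,
							   bool (*pred) (int))
{
	/* mirrors testing only firsttup and lasttup against the key */
	return nelems > 0 && pred(sorted[0]) && pred(sorted[nelems - 1]);
}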
2785 | | |
2786 | | /* |
2787 | | * Test whether an indextuple satisfies current scan condition. |
2788 | | * |
2789 | | * Return true if so, false if not.  If not, we also set *continuescan to
2790 | | * false when it's not possible for any later tuples to pass the current qual
2791 | | * (with the scan's current set of array keys, in the current scan direction),
2792 | | * and set *ikey to the so->keyData[] subscript/offset for the unsatisfied
2793 | | * scan key (needed when caller must consider advancing the scan's array
2794 | | * keys).
2795 | | * |
2796 | | * This is a subroutine for _bt_checkkeys. We provisionally assume that |
2797 | | * reaching the end of the current set of required keys (in particular the |
2798 | | * current required array keys) ends the ongoing (primitive) index scan. |
2799 | | * Callers without array keys should just end the scan right away when they |
2800 | | * find that continuescan has been set to false here by us. Things are more |
2801 | | * complicated for callers with array keys. |
2802 | | * |
2803 | | * Callers with array keys must first consider advancing the arrays when |
2804 | | * continuescan has been set to false here by us. They must then consider if |
2805 | | * it really does make sense to end the current (primitive) index scan, in |
2806 | | * light of everything that is known at that point. (In general when we set |
2807 | | * continuescan=false for these callers it must be treated as provisional.) |
2808 | | * |
2809 | | * We deal with advancing unsatisfied non-required arrays directly, though. |
2810 | | * This is safe, since by definition non-required keys can't end the scan. |
2811 | | * This is simply how we determine whether non-required arrays are merely
2812 | | * unsatisfied by the current array key, or truly unsatisfied (that is, if
2813 | | * they're unsatisfied by every possible array key). |
2814 | | * |
2815 | | * Pass advancenonrequired=false to avoid all array related side effects. |
2816 | | * This allows _bt_advance_array_keys caller to avoid infinite recursion. |
2817 | | * |
2818 | | * Pass forcenonrequired=true to instruct us to treat all keys as nonrequired. |
2819 | | * This is used to make it safe to temporarily stop properly maintaining the |
2820 | | * scan's required arrays. _bt_checkkeys caller (_bt_readpage, actually) |
2821 | | * determines a prefix of keys that must satisfy every possible corresponding |
2822 | | * index attribute value from its page, which is passed to us via *ikey arg |
2823 | | * (this is the first key that might be unsatisfied by tuples on the page). |
2824 | | * Obviously, we won't maintain any array keys from before *ikey, so it's |
2825 | | * quite possible for such arrays to "fall behind" the index's keyspace. |
2826 | | * Caller will need to "catch up" by passing forcenonrequired=true (alongside |
2827 | | * an *ikey=0) once the page's finaltup is reached. |
2828 | | * |
2829 | | * Note: it's safe to pass an *ikey > 0 with forcenonrequired=false, but only |
2830 | | * when caller determines that it won't affect array maintenance. |
2831 | | */ |
2832 | | static bool |
2833 | | _bt_check_compare(IndexScanDesc scan, ScanDirection dir, |
2834 | | IndexTuple tuple, int tupnatts, TupleDesc tupdesc, |
2835 | | bool advancenonrequired, bool forcenonrequired, |
2836 | | bool *continuescan, int *ikey) |
2837 | 0 | { |
2838 | 0 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
2839 | |
2840 | 0 | *continuescan = true; /* default assumption */ |
2841 | |
2842 | 0 | for (; *ikey < so->numberOfKeys; (*ikey)++) |
2843 | 0 | { |
2844 | 0 | ScanKey key = so->keyData + *ikey; |
2845 | 0 | Datum datum; |
2846 | 0 | bool isNull; |
2847 | 0 | bool requiredSameDir = false, |
2848 | 0 | requiredOppositeDirOnly = false; |
2849 | | |
2850 | | /* |
2851 | | * Check if the key is required in the current scan direction, in the |
2852 | | * opposite scan direction _only_, or in neither direction (except |
2853 | | * when we're forced to treat all scan keys as nonrequired) |
2854 | | */ |
2855 | 0 | if (forcenonrequired) |
2856 | 0 | { |
2857 | | /* treating scan's keys as non-required */ |
2858 | 0 | } |
2859 | 0 | else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) || |
2860 | 0 | ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir))) |
2861 | 0 | requiredSameDir = true; |
2862 | 0 | else if (((key->sk_flags & SK_BT_REQFWD) && ScanDirectionIsBackward(dir)) || |
2863 | 0 | ((key->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsForward(dir))) |
2864 | 0 | requiredOppositeDirOnly = true; |
2865 | |
2866 | 0 | if (key->sk_attno > tupnatts) |
2867 | 0 | { |
2868 | | /* |
2869 | | * This attribute is truncated (must be high key). The value for |
2870 | | * this attribute in the first non-pivot tuple on the page to the |
2871 | | * right could be any possible value. Assume that truncated |
2872 | | * attribute passes the qual. |
2873 | | */ |
2874 | 0 | Assert(BTreeTupleIsPivot(tuple)); |
2875 | 0 | continue; |
2876 | 0 | } |
2877 | | |
2878 | | /* |
2879 | | * A skip array scan key uses one of several sentinel values. We just |
2880 | | * fall back on _bt_tuple_before_array_skeys when we see such a value. |
2881 | | */ |
2882 | 0 | if (key->sk_flags & (SK_BT_MINVAL | SK_BT_MAXVAL | |
2883 | 0 | SK_BT_NEXT | SK_BT_PRIOR)) |
2884 | 0 | { |
2885 | 0 | Assert(key->sk_flags & SK_SEARCHARRAY); |
2886 | 0 | Assert(key->sk_flags & SK_BT_SKIP); |
2887 | 0 | Assert(requiredSameDir || forcenonrequired); |
2888 | | |
2889 | | /* |
2890 | | * Cannot fall back on _bt_tuple_before_array_skeys when we're |
2891 | | * treating the scan's keys as nonrequired, though. Just handle |
2892 | | * this like any other non-required equality-type array key. |
2893 | | */ |
2894 | 0 | if (forcenonrequired) |
2895 | 0 | return _bt_advance_array_keys(scan, NULL, tuple, tupnatts, |
2896 | 0 | tupdesc, *ikey, false); |
2897 | | |
2898 | 0 | *continuescan = false; |
2899 | 0 | return false; |
2900 | 0 | } |
2901 | | |
2902 | | /* row-comparison keys need special processing */ |
2903 | 0 | if (key->sk_flags & SK_ROW_HEADER) |
2904 | 0 | { |
2905 | 0 | if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir, |
2906 | 0 | forcenonrequired, continuescan)) |
2907 | 0 | continue; |
2908 | 0 | return false; |
2909 | 0 | } |
2910 | | |
2911 | 0 | datum = index_getattr(tuple, |
2912 | 0 | key->sk_attno, |
2913 | 0 | tupdesc, |
2914 | 0 | &isNull); |
2915 | |
2916 | 0 | if (key->sk_flags & SK_ISNULL) |
2917 | 0 | { |
2918 | | /* Handle IS NULL/NOT NULL tests */ |
2919 | 0 | if (key->sk_flags & SK_SEARCHNULL) |
2920 | 0 | { |
2921 | 0 | if (isNull) |
2922 | 0 | continue; /* tuple satisfies this qual */ |
2923 | 0 | } |
2924 | 0 | else |
2925 | 0 | { |
2926 | 0 | Assert(key->sk_flags & SK_SEARCHNOTNULL); |
2927 | 0 | Assert(!(key->sk_flags & SK_BT_SKIP)); |
2928 | 0 | if (!isNull) |
2929 | 0 | continue; /* tuple satisfies this qual */ |
2930 | 0 | } |
2931 | | |
2932 | | /* |
2933 | | * Tuple fails this qual. If it's a required qual for the current |
2934 | | * scan direction, then we can conclude no further tuples will |
2935 | | * pass, either. |
2936 | | */ |
2937 | 0 | if (requiredSameDir) |
2938 | 0 | *continuescan = false; |
2939 | 0 | else if (unlikely(key->sk_flags & SK_BT_SKIP)) |
2940 | 0 | { |
2941 | | /* |
2942 | | * If we're treating scan keys as nonrequired, and encounter a |
2943 | | * skip array scan key whose current element is NULL, then it |
2944 | | * must be a non-range skip array. It must be satisfied, so |
2945 | | * there's no need to call _bt_advance_array_keys to check. |
2946 | | */ |
2947 | 0 | Assert(forcenonrequired && *ikey > 0); |
2948 | 0 | continue; |
2949 | 0 | } |
2950 | | |
2951 | | /* |
2952 | | * This indextuple doesn't match the qual. |
2953 | | */ |
2954 | 0 | return false; |
2955 | 0 | } |
2956 | | |
2957 | 0 | if (isNull) |
2958 | 0 | { |
2959 | | /* |
2960 | | * Scalar scan key isn't satisfied by NULL tuple value. |
2961 | | * |
2962 | | * If we're treating scan keys as nonrequired, and key is for a |
2963 | | * skip array, then we must attempt to advance the array to NULL |
2964 | | * (if we're successful then the tuple might match the qual). |
2965 | | */ |
2966 | 0 | if (unlikely(forcenonrequired && key->sk_flags & SK_BT_SKIP)) |
2967 | 0 | return _bt_advance_array_keys(scan, NULL, tuple, tupnatts, |
2968 | 0 | tupdesc, *ikey, false); |
2969 | | |
2970 | 0 | if (key->sk_flags & SK_BT_NULLS_FIRST) |
2971 | 0 | { |
2972 | | /* |
2973 | | * Since NULLs are sorted before non-NULLs, we know we have |
2974 | | * reached the lower limit of the range of values for this |
2975 | | * index attr. On a backward scan, we can stop if this qual |
2976 | | * is one of the "must match" subset. We can stop regardless |
2977 | | * of whether the qual is > or <, so long as it's required, |
2978 | | * because it's not possible for any future tuples to pass. On |
2979 | | * a forward scan, however, we must keep going, because we may |
2980 | | * have initially positioned to the start of the index. |
2981 | | * (_bt_advance_array_keys also relies on this behavior during |
2982 | | * forward scans.) |
2983 | | */ |
2984 | 0 | if ((requiredSameDir || requiredOppositeDirOnly) && |
2985 | 0 | ScanDirectionIsBackward(dir)) |
2986 | 0 | *continuescan = false; |
2987 | 0 | } |
2988 | 0 | else |
2989 | 0 | { |
2990 | | /* |
2991 | | * Since NULLs are sorted after non-NULLs, we know we have |
2992 | | * reached the upper limit of the range of values for this |
2993 | | * index attr. On a forward scan, we can stop if this qual is |
2994 | | * one of the "must match" subset. We can stop regardless of |
2995 | | * whether the qual is > or <, so long as it's required, |
2996 | | * because it's not possible for any future tuples to pass. On |
2997 | | * a backward scan, however, we must keep going, because we |
2998 | | * may have initially positioned to the end of the index. |
2999 | | * (_bt_advance_array_keys also relies on this behavior during |
3000 | | * backward scans.) |
3001 | | */ |
3002 | 0 | if ((requiredSameDir || requiredOppositeDirOnly) && |
3003 | 0 | ScanDirectionIsForward(dir)) |
3004 | 0 | *continuescan = false; |
3005 | 0 | } |
3006 | | |
3007 | | /* |
3008 | | * This indextuple doesn't match the qual. |
3009 | | */ |
3010 | 0 | return false; |
3011 | 0 | } |
3012 | | |
3013 | 0 | if (!DatumGetBool(FunctionCall2Coll(&key->sk_func, key->sk_collation, |
3014 | 0 | datum, key->sk_argument))) |
3015 | 0 | { |
3016 | | /* |
3017 | | * Tuple fails this qual. If it's a required qual for the current |
3018 | | * scan direction, then we can conclude no further tuples will |
3019 | | * pass, either. |
3020 | | * |
3021 | | * Note: because we stop the scan as soon as any required equality |
3022 | | * qual fails, it is critical that equality quals be used for the |
3023 | | * initial positioning in _bt_first() when they are available. See |
3024 | | * comments in _bt_first(). |
3025 | | */ |
3026 | 0 | if (requiredSameDir) |
3027 | 0 | *continuescan = false; |
3028 | | |
3029 | | /* |
3030 | | * If this is a non-required equality-type array key, the tuple |
3031 | | * needs to be checked against every possible array key. Handle |
3032 | | * this by "advancing" the scan key's array to a matching value |
3033 | | * (if we're successful then the tuple might match the qual). |
3034 | | */ |
3035 | 0 | else if (advancenonrequired && |
3036 | 0 | key->sk_strategy == BTEqualStrategyNumber && |
3037 | 0 | (key->sk_flags & SK_SEARCHARRAY)) |
3038 | 0 | return _bt_advance_array_keys(scan, NULL, tuple, tupnatts, |
3039 | 0 | tupdesc, *ikey, false); |
3040 | | |
3041 | | /* |
3042 | | * This indextuple doesn't match the qual. |
3043 | | */ |
3044 | 0 | return false; |
3045 | 0 | } |
3046 | 0 | } |
3047 | | |
3048 | | /* If we get here, the tuple passes all index quals. */ |
3049 | 0 | return true; |
3050 | 0 | } |
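
/*
 * Editorial sketch (not part of nbtutils.c): the required-direction
 * classification from the top of _bt_check_compare's loop, isolated as a
 * pure function.  A required key either bounds the scan in the current
 * direction, bounds it only in the opposite direction, or (when unmarked)
 * bounds it in neither.  The function name is illustrative only.
 */
static inline void
example_classify_required(int sk_flags, ScanDirection dir,
						  bool *requiredSameDir,
						  bool *requiredOppositeDirOnly)
{
	*requiredSameDir = *requiredOppositeDirOnly = false;

	if (((sk_flags & SK_BT_REQFWD) && ScanDirectionIsForward(dir)) ||
		((sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)))
		*requiredSameDir = true;
	else if (sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD))
		*requiredOppositeDirOnly = true;
}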
3051 | | |
3052 | | /* |
3053 | | * Test whether an indextuple satisfies a row-comparison scan condition. |
3054 | | * |
3055 | | * Return true if so, false if not. If not, also clear *continuescan if |
3056 | | * it's not possible for any future tuples in the current scan direction |
3057 | | * to pass the qual. |
3058 | | * |
3059 | | * This is a subroutine for _bt_checkkeys/_bt_check_compare. |
3060 | | */ |
3061 | | static bool |
3062 | | _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, |
3063 | | TupleDesc tupdesc, ScanDirection dir, |
3064 | | bool forcenonrequired, bool *continuescan) |
3065 | 0 | { |
3066 | 0 | ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); |
3067 | 0 | int32 cmpresult = 0; |
3068 | 0 | bool result; |
3069 | | |
3070 | | /* First subkey should be same as the header says */ |
3071 | 0 | Assert(subkey->sk_attno == skey->sk_attno); |
3072 | | |
3073 | | /* Loop over columns of the row condition */ |
3074 | 0 | for (;;) |
3075 | 0 | { |
3076 | 0 | Datum datum; |
3077 | 0 | bool isNull; |
3078 | |
3079 | 0 | Assert(subkey->sk_flags & SK_ROW_MEMBER); |
3080 | |
3081 | 0 | if (subkey->sk_attno > tupnatts) |
3082 | 0 | { |
3083 | | /* |
3084 | | * This attribute is truncated (must be high key). The value for |
3085 | | * this attribute in the first non-pivot tuple on the page to the |
3086 | | * right could be any possible value. Assume that truncated |
3087 | | * attribute passes the qual. |
3088 | | */ |
3089 | 0 | Assert(BTreeTupleIsPivot(tuple)); |
3090 | 0 | cmpresult = 0; |
3091 | 0 | if (subkey->sk_flags & SK_ROW_END) |
3092 | 0 | break; |
3093 | 0 | subkey++; |
3094 | 0 | continue; |
3095 | 0 | } |
3096 | | |
3097 | 0 | datum = index_getattr(tuple, |
3098 | 0 | subkey->sk_attno, |
3099 | 0 | tupdesc, |
3100 | 0 | &isNull); |
3101 | |
3102 | 0 | if (isNull) |
3103 | 0 | { |
3104 | 0 | if (forcenonrequired) |
3105 | 0 | { |
3106 | | /* treating scan's keys as non-required */ |
3107 | 0 | } |
3108 | 0 | else if (subkey->sk_flags & SK_BT_NULLS_FIRST) |
3109 | 0 | { |
3110 | | /* |
3111 | | * Since NULLs are sorted before non-NULLs, we know we have |
3112 | | * reached the lower limit of the range of values for this |
3113 | | * index attr. On a backward scan, we can stop if this qual |
3114 | | * is one of the "must match" subset. We can stop regardless |
3115 | | * of whether the qual is > or <, so long as it's required, |
3116 | | * because it's not possible for any future tuples to pass. On |
3117 | | * a forward scan, however, we must keep going, because we may |
3118 | | * have initially positioned to the start of the index. |
3119 | | * (_bt_advance_array_keys also relies on this behavior during |
3120 | | * forward scans.) |
3121 | | */ |
3122 | 0 | if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && |
3123 | 0 | ScanDirectionIsBackward(dir)) |
3124 | 0 | *continuescan = false; |
3125 | 0 | } |
3126 | 0 | else |
3127 | 0 | { |
3128 | | /* |
3129 | | * Since NULLs are sorted after non-NULLs, we know we have |
3130 | | * reached the upper limit of the range of values for this |
3131 | | * index attr. On a forward scan, we can stop if this qual is |
3132 | | * one of the "must match" subset. We can stop regardless of |
3133 | | * whether the qual is > or <, so long as it's required, |
3134 | | * because it's not possible for any future tuples to pass. On |
3135 | | * a backward scan, however, we must keep going, because we |
3136 | | * may have initially positioned to the end of the index. |
3137 | | * (_bt_advance_array_keys also relies on this behavior during |
3138 | | * backward scans.) |
3139 | | */ |
3140 | 0 | if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && |
3141 | 0 | ScanDirectionIsForward(dir)) |
3142 | 0 | *continuescan = false; |
3143 | 0 | } |
3144 | | |
3145 | | /* |
3146 | | * In any case, this indextuple doesn't match the qual. |
3147 | | */ |
3148 | 0 | return false; |
3149 | 0 | } |
3150 | | |
3151 | 0 | if (subkey->sk_flags & SK_ISNULL) |
3152 | 0 | { |
3153 | | /* |
3154 | | * Unlike the simple-scankey case, this isn't a disallowed case |
3155 | | * (except when it's the first row element that has the NULL arg). |
3156 | | * But it can never match. If all the earlier row comparison |
3157 | | * columns are required for the scan direction, we can stop the |
3158 | | * scan, because there can't be another tuple that will succeed. |
3159 | | */ |
3160 | 0 | Assert(subkey != (ScanKey) DatumGetPointer(skey->sk_argument)); |
3161 | 0 | subkey--; |
3162 | 0 | if (forcenonrequired) |
3163 | 0 | { |
3164 | | /* treating scan's keys as non-required */ |
3165 | 0 | } |
3166 | 0 | else if ((subkey->sk_flags & SK_BT_REQFWD) && |
3167 | 0 | ScanDirectionIsForward(dir)) |
3168 | 0 | *continuescan = false; |
3169 | 0 | else if ((subkey->sk_flags & SK_BT_REQBKWD) && |
3170 | 0 | ScanDirectionIsBackward(dir)) |
3171 | 0 | *continuescan = false; |
3172 | 0 | return false; |
3173 | 0 | } |
3174 | | |
3175 | | /* Perform the test --- three-way comparison not bool operator */ |
3176 | 0 | cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func, |
3177 | 0 | subkey->sk_collation, |
3178 | 0 | datum, |
3179 | 0 | subkey->sk_argument)); |
3180 | |
3181 | 0 | if (subkey->sk_flags & SK_BT_DESC) |
3182 | 0 | INVERT_COMPARE_RESULT(cmpresult); |
3183 | | |
3184 | | /* Done comparing if unequal, else advance to next column */ |
3185 | 0 | if (cmpresult != 0) |
3186 | 0 | break; |
3187 | | |
3188 | 0 | if (subkey->sk_flags & SK_ROW_END) |
3189 | 0 | break; |
3190 | 0 | subkey++; |
3191 | 0 | } |
3192 | | |
3193 | | /* |
3194 | | * At this point cmpresult indicates the overall result of the row |
3195 | | * comparison, and subkey points to the deciding column (or the last |
3196 | | * column if the result is "="). |
3197 | | */ |
3198 | 0 | switch (subkey->sk_strategy) |
3199 | 0 | { |
3200 | | /* EQ and NE cases aren't allowed here */ |
3201 | 0 | case BTLessStrategyNumber: |
3202 | 0 | result = (cmpresult < 0); |
3203 | 0 | break; |
3204 | 0 | case BTLessEqualStrategyNumber: |
3205 | 0 | result = (cmpresult <= 0); |
3206 | 0 | break; |
3207 | 0 | case BTGreaterEqualStrategyNumber: |
3208 | 0 | result = (cmpresult >= 0); |
3209 | 0 | break; |
3210 | 0 | case BTGreaterStrategyNumber: |
3211 | 0 | result = (cmpresult > 0); |
3212 | 0 | break; |
3213 | 0 | default: |
3214 | 0 | elog(ERROR, "unexpected strategy number %d", subkey->sk_strategy); |
3215 | 0 | result = 0; /* keep compiler quiet */ |
3216 | 0 | break; |
3217 | 0 | } |
3218 | | |
3219 | 0 | if (!result && !forcenonrequired) |
3220 | 0 | { |
3221 | | /* |
3222 | | * Tuple fails this qual. If it's a required qual for the current |
3223 | | * scan direction, then we can conclude no further tuples will pass, |
3224 | | * either. Note we have to look at the deciding column, not |
3225 | | * necessarily the first or last column of the row condition. |
3226 | | */ |
3227 | 0 | if ((subkey->sk_flags & SK_BT_REQFWD) && |
3228 | 0 | ScanDirectionIsForward(dir)) |
3229 | 0 | *continuescan = false; |
3230 | 0 | else if ((subkey->sk_flags & SK_BT_REQBKWD) && |
3231 | 0 | ScanDirectionIsBackward(dir)) |
3232 | 0 | *continuescan = false; |
3233 | 0 | } |
3234 | |
3235 | 0 | return result; |
3236 | 0 | } |
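
/*
 * Editorial sketch (not part of nbtutils.c): the final strategy switch of
 * _bt_check_rowcompare, isolated.  A row comparison reduces to a single
 * three-way comparison at the deciding column; that column's strategy then
 * maps the int32 result onto a boolean.  The function name is illustrative.
 */
static inline bool
example_rowcompare_result(int32 cmpresult, StrategyNumber strategy)
{
	switch (strategy)
	{
		case BTLessStrategyNumber:
			return cmpresult < 0;
		case BTLessEqualStrategyNumber:
			return cmpresult <= 0;
		case BTGreaterEqualStrategyNumber:
			return cmpresult >= 0;
		case BTGreaterStrategyNumber:
			return cmpresult > 0;
		default:				/* EQ and NE cases aren't allowed here */
			return false;
	}
}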
3237 | | |
3238 | | /* |
3239 | | * Determine if a scan with array keys should skip over uninteresting tuples. |
3240 | | * |
3241 | | * This is a subroutine for _bt_checkkeys. Called when _bt_readpage's linear |
3242 | | * search process (started after it finishes reading an initial group of |
3243 | | * matching tuples, used to locate the start of the next group of tuples |
3244 | | * matching the next set of required array keys) has already scanned an |
3245 | | * excessive number of tuples whose key space is "between arrays". |
3246 | | * |
3247 | | * When we perform look ahead successfully, we'll set pstate.skip, which
3248 | | * instructs _bt_readpage to skip ahead to that tuple next (could be past the
3249 | | * end of the scan's leaf page).  Even on pages where the optimization is
3250 | | * effective, the scan will generally still need to skip several times.  Each
3251 | | * call here performs only a single "look ahead" comparison of a later tuple,
3252 | | * whose distance from the current tuple's offset number is chosen heuristically.
3253 | | */ |
3254 | | static void |
3255 | | _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, |
3256 | | int tupnatts, TupleDesc tupdesc) |
3257 | 0 | { |
3258 | 0 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
3259 | 0 | ScanDirection dir = so->currPos.dir; |
3260 | 0 | OffsetNumber aheadoffnum; |
3261 | 0 | IndexTuple ahead; |
3262 | |
3263 | 0 | Assert(!pstate->forcenonrequired); |
3264 | | |
3265 | | /* Avoid looking ahead when comparing the page high key */ |
3266 | 0 | if (pstate->offnum < pstate->minoff) |
3267 | 0 | return; |
3268 | | |
3269 | | /* |
3270 | | * Don't look ahead when there aren't enough tuples remaining on the page |
3271 | | * (in the current scan direction) for it to be worth our while |
3272 | | */ |
3273 | 0 | if (ScanDirectionIsForward(dir) && |
3274 | 0 | pstate->offnum >= pstate->maxoff - LOOK_AHEAD_DEFAULT_DISTANCE) |
3275 | 0 | return; |
3276 | 0 | else if (ScanDirectionIsBackward(dir) && |
3277 | 0 | pstate->offnum <= pstate->minoff + LOOK_AHEAD_DEFAULT_DISTANCE) |
3278 | 0 | return; |
3279 | | |
3280 | | /* |
3281 | | * The look ahead distance starts small, and ramps up as each call here |
3282 | | * allows _bt_readpage to skip over more tuples |
3283 | | */ |
3284 | 0 | if (!pstate->targetdistance) |
3285 | 0 | pstate->targetdistance = LOOK_AHEAD_DEFAULT_DISTANCE; |
3286 | 0 | else if (pstate->targetdistance < MaxIndexTuplesPerPage / 2) |
3287 | 0 | pstate->targetdistance *= 2; |
3288 | | |
3289 | | /* Don't read past the end (or before the start) of the page, though */ |
3290 | 0 | if (ScanDirectionIsForward(dir)) |
3291 | 0 | aheadoffnum = Min((int) pstate->maxoff, |
3292 | 0 | (int) pstate->offnum + pstate->targetdistance); |
3293 | 0 | else |
3294 | 0 | aheadoffnum = Max((int) pstate->minoff, |
3295 | 0 | (int) pstate->offnum - pstate->targetdistance); |
3296 | |
3297 | 0 | ahead = (IndexTuple) PageGetItem(pstate->page, |
3298 | 0 | PageGetItemId(pstate->page, aheadoffnum)); |
3299 | 0 | if (_bt_tuple_before_array_skeys(scan, dir, ahead, tupdesc, tupnatts, |
3300 | 0 | false, 0, NULL)) |
3301 | 0 | { |
3302 | | /* |
3303 | | * Success -- instruct _bt_readpage to skip ahead to very next tuple |
3304 | | * after the one we determined was still before the current array keys |
3305 | | */ |
3306 | 0 | if (ScanDirectionIsForward(dir)) |
3307 | 0 | pstate->skip = aheadoffnum + 1; |
3308 | 0 | else |
3309 | 0 | pstate->skip = aheadoffnum - 1; |
3310 | 0 | } |
3311 | 0 | else |
3312 | 0 | { |
3313 | | /* |
3314 | | * Failure -- "ahead" tuple is too far ahead (we were too aggressive). |
3315 | | * |
3316 | | * Reset the number of rechecks, and aggressively reduce the target |
3317 | | * distance (we're much more aggressive here than we were when the |
3318 | | * distance was initially ramped up). |
3319 | | */ |
3320 | 0 | pstate->rechecks = 0; |
3321 | 0 | pstate->targetdistance = Max(pstate->targetdistance / 8, 1); |
3322 | 0 | } |
3323 | 0 | } |
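
/*
 * Editorial sketch (not part of nbtutils.c): the look-ahead distance
 * heuristic above, isolated.  The distance starts at
 * LOOK_AHEAD_DEFAULT_DISTANCE, doubles after each successful skip (capped at
 * half a page's worth of index tuples), and collapses to an eighth (never
 * below 1) when a probe overshoots.  The asymmetric backoff keeps overshoot
 * recovery cheap relative to the tuples saved while skipping.  The function
 * name is illustrative only.
 */
static inline int
example_next_look_ahead_distance(int targetdistance, bool probe_succeeded)
{
	if (!probe_succeeded)
		return Max(targetdistance / 8, 1);	/* aggressive backoff */
	if (targetdistance == 0)
		return LOOK_AHEAD_DEFAULT_DISTANCE; /* first probe on this page */
	if (targetdistance < MaxIndexTuplesPerPage / 2)
		return targetdistance * 2;	/* ramp up */
	return targetdistance;		/* already at the cap */
}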
3324 | | |
3325 | | /* |
3326 | | * _bt_killitems - set LP_DEAD state for items an indexscan caller has |
3327 | | * told us were killed |
3328 | | * |
3329 | | * scan->opaque, referenced locally through so, contains information about the |
3330 | | * current page and killed tuples thereon (generally, this should only be |
3331 | | * called if so->numKilled > 0). |
3332 | | * |
3333 | | * Caller should not have a lock on the so->currPos page, but must hold a |
3334 | | * buffer pin when !so->dropPin. When we return, it still won't be locked. |
3335 | | * It'll continue to hold whatever pins were held before calling here. |
3336 | | * |
3337 | | * We match items by heap TID before assuming they are the right ones to set |
3338 | | * LP_DEAD. If the scan is one that holds a buffer pin on the target page |
3339 | | * continuously from initially reading the items until applying this function |
3340 | | * (if it is a !so->dropPin scan), VACUUM cannot have deleted any items on the |
3341 | | * page, so the page's TIDs can't have been recycled by now. There's no risk |
3342 | | * that we'll confuse a new index tuple that happens to use a recycled TID |
3343 | | * with a now-removed tuple with the same TID (that used to be on this same |
3344 | | * page). We can't rely on that during scans that drop buffer pins eagerly |
3345 | | * (so->dropPin scans), though, so we must condition setting LP_DEAD bits on |
3346 | | * the page LSN having not changed since back when _bt_readpage saw the page. |
3347 | | * We totally give up on setting LP_DEAD bits when the page LSN changed. |
3348 | | * |
3349 | | * We give up much less often during !so->dropPin scans, but it still happens. |
3350 | | * We cope with cases where items have moved right due to insertions. If an |
3351 | | * item has moved off the current page due to a split, we'll fail to find it |
3352 | | * and just give up on it. |
3353 | | */ |
3354 | | void |
3355 | | _bt_killitems(IndexScanDesc scan) |
3356 | 0 | { |
3357 | 0 | Relation rel = scan->indexRelation; |
3358 | 0 | BTScanOpaque so = (BTScanOpaque) scan->opaque; |
3359 | 0 | Page page; |
3360 | 0 | BTPageOpaque opaque; |
3361 | 0 | OffsetNumber minoff; |
3362 | 0 | OffsetNumber maxoff; |
3363 | 0 | int numKilled = so->numKilled; |
3364 | 0 | bool killedsomething = false; |
3365 | 0 | Buffer buf; |
3366 | |
3367 | 0 | Assert(numKilled > 0); |
3368 | 0 | Assert(BTScanPosIsValid(so->currPos)); |
3369 | 0 | Assert(scan->heapRelation != NULL); /* can't be a bitmap index scan */ |
3370 | | |
3371 | | /* Always invalidate so->killedItems[] before leaving so->currPos */ |
3372 | 0 | so->numKilled = 0; |
3373 | |
3374 | 0 | if (!so->dropPin) |
3375 | 0 | { |
3376 | | /* |
3377 | | * We have held the pin on this page since we read the index tuples, |
3378 | | * so all we need to do is lock it. The pin will have prevented |
3379 | | * concurrent VACUUMs from recycling any of the TIDs on the page. |
3380 | | */ |
3381 | 0 | Assert(BTScanPosIsPinned(so->currPos)); |
3382 | 0 | buf = so->currPos.buf; |
3383 | 0 | _bt_lockbuf(rel, buf, BT_READ); |
3384 | 0 | } |
3385 | 0 | else |
3386 | 0 | { |
3387 | 0 | XLogRecPtr latestlsn; |
3388 | |
3389 | 0 | Assert(!BTScanPosIsPinned(so->currPos)); |
3390 | 0 | Assert(RelationNeedsWAL(rel)); |
3391 | 0 | buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); |
3392 | |
3393 | 0 | latestlsn = BufferGetLSNAtomic(buf); |
3394 | 0 | Assert(!XLogRecPtrIsInvalid(so->currPos.lsn)); |
3395 | 0 | Assert(so->currPos.lsn <= latestlsn); |
3396 | 0 | if (so->currPos.lsn != latestlsn) |
3397 | 0 | { |
3398 | | /* Modified, give up on hinting */ |
3399 | 0 | _bt_relbuf(rel, buf); |
3400 | 0 | return; |
3401 | 0 | } |
3402 | | |
3403 | | /* Unmodified, hinting is safe */ |
3404 | 0 | } |
3405 | | |
3406 | 0 | page = BufferGetPage(buf); |
3407 | 0 | opaque = BTPageGetOpaque(page); |
3408 | 0 | minoff = P_FIRSTDATAKEY(opaque); |
3409 | 0 | maxoff = PageGetMaxOffsetNumber(page); |
3410 | |
3411 | 0 | for (int i = 0; i < numKilled; i++) |
3412 | 0 | { |
3413 | 0 | int itemIndex = so->killedItems[i]; |
3414 | 0 | BTScanPosItem *kitem = &so->currPos.items[itemIndex]; |
3415 | 0 | OffsetNumber offnum = kitem->indexOffset; |
3416 | |
3417 | 0 | Assert(itemIndex >= so->currPos.firstItem && |
3418 | 0 | itemIndex <= so->currPos.lastItem); |
3419 | 0 | if (offnum < minoff) |
3420 | 0 | continue; /* pure paranoia */ |
3421 | 0 | while (offnum <= maxoff) |
3422 | 0 | { |
3423 | 0 | ItemId iid = PageGetItemId(page, offnum); |
3424 | 0 | IndexTuple ituple = (IndexTuple) PageGetItem(page, iid); |
3425 | 0 | bool killtuple = false; |
3426 | |
3427 | 0 | if (BTreeTupleIsPosting(ituple)) |
3428 | 0 | { |
3429 | 0 | int pi = i + 1; |
3430 | 0 | int nposting = BTreeTupleGetNPosting(ituple); |
3431 | 0 | int j; |
3432 | | |
3433 | | /* |
3434 | | * We rely on the convention that heap TIDs in the scanpos |
3435 | | * items array are stored in ascending heap TID order for a |
3436 | | * group of TIDs that originally came from a posting list |
3437 | | * tuple. This convention even applies during backwards |
3438 | | * scans, where returning the TIDs in descending order might |
3439 | | * seem more natural. This is about effectiveness, not |
3440 | | * correctness. |
3441 | | * |
3442 | | * Note that the page may have been modified in almost any way |
3443 | | * since we first read it (in the !so->dropPin case), so it's |
3444 | | * possible that this posting list tuple wasn't a posting list |
3445 | | * tuple when we first encountered its heap TIDs. |
3446 | | */ |
3447 | 0 | for (j = 0; j < nposting; j++) |
3448 | 0 | { |
3449 | 0 | ItemPointer item = BTreeTupleGetPostingN(ituple, j); |
3450 | |
3451 | 0 | if (!ItemPointerEquals(item, &kitem->heapTid)) |
3452 | 0 | break; /* out of posting list loop */ |
3453 | | |
3454 | | /* |
3455 | | * kitem must have matching offnum when heap TIDs match, |
3456 | | * though only in the common case where the page can't |
3457 | | * have been concurrently modified |
3458 | | */ |
3459 | 0 | Assert(kitem->indexOffset == offnum || !so->dropPin); |
3460 | | |
3461 | | /* |
3462 | | * Read-ahead to later kitems here. |
3463 | | * |
3464 | | * We rely on the assumption that not advancing kitem here |
3465 | | * will prevent us from considering the posting list tuple |
3466 | | * fully dead by not matching its next heap TID in next |
3467 | | * loop iteration. |
3468 | | * |
3469 | | * If, on the other hand, this is the final heap TID in |
3470 | | * the posting list tuple, then the tuple gets killed
3471 | | * regardless (i.e. we handle the case where the last |
3472 | | * kitem is also the last heap TID in the last index tuple |
3473 | | * correctly -- posting tuple still gets killed). |
3474 | | */ |
3475 | 0 | if (pi < numKilled) |
3476 | 0 | kitem = &so->currPos.items[so->killedItems[pi++]]; |
3477 | 0 | } |
3478 | | |
3479 | | /* |
3480 | | * Don't bother advancing the outermost loop's int iterator to |
3481 | | * avoid processing killed items that relate to the same |
3482 | | * offnum/posting list tuple. This micro-optimization hardly |
3483 | | * seems worth it. (Further iterations of the outermost loop |
3484 | | * will fail to match on this same posting list's first heap |
3485 | | * TID instead, so we'll advance to the next offnum/index |
3486 | | * tuple pretty quickly.) |
3487 | | */ |
3488 | 0 | if (j == nposting) |
3489 | 0 | killtuple = true; |
3490 | 0 | } |
3491 | 0 | else if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)) |
3492 | 0 | killtuple = true; |
3493 | | |
3494 | | /* |
3495 | | * Mark index item as dead, if it isn't already. Since this |
3496 | | * happens while holding a buffer lock possibly in shared mode, |
3497 | | * it's possible that multiple processes attempt to do this |
3498 | | * simultaneously, leading to multiple full-page images being sent |
3499 | | * to WAL (if wal_log_hints or data checksums are enabled), which |
3500 | | * is undesirable. |
3501 | | */ |
3502 | 0 | if (killtuple && !ItemIdIsDead(iid)) |
3503 | 0 | { |
3504 | | /* found the item/all posting list items */ |
3505 | 0 | ItemIdMarkDead(iid); |
3506 | 0 | killedsomething = true; |
3507 | 0 | break; /* out of inner search loop */ |
3508 | 0 | } |
3509 | 0 | offnum = OffsetNumberNext(offnum); |
3510 | 0 | } |
3511 | 0 | } |
3512 | | |
3513 | | /* |
3514 | | * Since this can be redone later if needed, mark as dirty hint. |
3515 | | * |
3516 | | * Whenever we mark anything LP_DEAD, we also set the page's |
3517 | | * BTP_HAS_GARBAGE flag, which is likewise just a hint. (Note that we |
3518 | | * only rely on the page-level flag in !heapkeyspace indexes.) |
3519 | | */ |
3520 | 0 | if (killedsomething) |
3521 | 0 | { |
3522 | 0 | opaque->btpo_flags |= BTP_HAS_GARBAGE; |
3523 | 0 | MarkBufferDirtyHint(buf, true); |
3524 | 0 | } |
3525 | |
3526 | 0 | if (!so->dropPin) |
3527 | 0 | _bt_unlockbuf(rel, buf); |
3528 | 0 | else |
3529 | 0 | _bt_relbuf(rel, buf); |
3530 | 0 | } |
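
/*
 * Editorial sketch (not part of nbtutils.c): the safety condition behind the
 * so->dropPin path above, as a predicate.  Without a continuously held pin,
 * VACUUM may have recycled TIDs on the page, so LP_DEAD hints are only safe
 * if the page provably hasn't changed -- which its LSN tells us.  The
 * function name is illustrative only.
 */
static inline bool
example_droppin_hints_are_safe(XLogRecPtr lsn_when_read, XLogRecPtr lsn_now)
{
	Assert(lsn_when_read <= lsn_now);	/* page LSNs only move forward */
	return lsn_when_read == lsn_now;	/* unmodified => TIDs still valid */
}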
3531 | | |
3532 | | |
3533 | | /* |
3534 | | * The following routines manage a shared-memory area in which we track |
3535 | | * assignment of "vacuum cycle IDs" to currently-active btree vacuuming |
3536 | | * operations. There is a single counter which increments each time we |
3537 | | * start a vacuum to assign it a cycle ID. Since multiple vacuums could |
3538 | | * be active concurrently, we have to track the cycle ID for each active |
3539 | | * vacuum; this requires at most MaxBackends entries (usually far fewer). |
3540 | | * We assume at most one vacuum can be active for a given index. |
3541 | | * |
3542 | | * Access to the shared memory area is controlled by BtreeVacuumLock. |
3543 | | * In principle we could use a separate lmgr locktag for each index, |
3544 | | * but a single LWLock is much cheaper, and given the short time that |
3545 | | * the lock is ever held, the concurrency hit should be minimal. |
3546 | | */ |
3547 | | |
3548 | | typedef struct BTOneVacInfo |
3549 | | { |
3550 | | LockRelId relid; /* global identifier of an index */ |
3551 | | BTCycleId cycleid; /* cycle ID for its active VACUUM */ |
3552 | | } BTOneVacInfo; |
3553 | | |
3554 | | typedef struct BTVacInfo |
3555 | | { |
3556 | | BTCycleId cycle_ctr; /* cycle ID most recently assigned */ |
3557 | | int num_vacuums; /* number of currently active VACUUMs */ |
3558 | | int max_vacuums; /* allocated length of vacuums[] array */ |
3559 | | BTOneVacInfo vacuums[FLEXIBLE_ARRAY_MEMBER]; |
3560 | | } BTVacInfo; |
3561 | | |
3562 | | static BTVacInfo *btvacinfo; |
3563 | | |
3564 | | |
3565 | | /* |
3566 | | * _bt_vacuum_cycleid --- get the active vacuum cycle ID for an index, |
3567 | | * or zero if there is no active VACUUM |
3568 | | * |
3569 | | * Note: for correct interlocking, the caller must already hold pin and |
3570 | | * exclusive lock on each buffer it will store the cycle ID into. This |
3571 | | * ensures that even if a VACUUM starts immediately afterwards, it cannot |
3572 | | * process those pages until the page split is complete. |
3573 | | */ |
3574 | | BTCycleId |
3575 | | _bt_vacuum_cycleid(Relation rel) |
3576 | 0 | { |
3577 | 0 | BTCycleId result = 0; |
3578 | 0 | int i; |
3579 | | |
3580 | | /* Share lock is enough since this is a read-only operation */ |
3581 | 0 | LWLockAcquire(BtreeVacuumLock, LW_SHARED); |
3582 | |
3583 | 0 | for (i = 0; i < btvacinfo->num_vacuums; i++) |
3584 | 0 | { |
3585 | 0 | BTOneVacInfo *vac = &btvacinfo->vacuums[i]; |
3586 | |
3587 | 0 | if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId && |
3588 | 0 | vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId) |
3589 | 0 | { |
3590 | 0 | result = vac->cycleid; |
3591 | 0 | break; |
3592 | 0 | } |
3593 | 0 | } |
3594 | |
3595 | 0 | LWLockRelease(BtreeVacuumLock); |
3596 | 0 | return result; |
3597 | 0 | } |
3598 | | |
3599 | | /* |
3600 | | * _bt_start_vacuum --- assign a cycle ID to a just-starting VACUUM operation |
3601 | | * |
3602 | | * Note: the caller must guarantee that it will eventually call |
3603 | | * _bt_end_vacuum, else we'll permanently leak an array slot. To ensure |
3604 | | * that this happens even in elog(FATAL) scenarios, the appropriate coding |
3605 | | * is not just a PG_TRY, but |
3606 | | * PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel)) |
3607 | | */ |
3608 | | BTCycleId |
3609 | | _bt_start_vacuum(Relation rel) |
3610 | 0 | { |
3611 | 0 | BTCycleId result; |
3612 | 0 | int i; |
3613 | 0 | BTOneVacInfo *vac; |
3614 | |
3615 | 0 | LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE); |
3616 | | |
3617 | | /* |
3618 | | * Assign the next cycle ID, being careful to avoid zero as well as the |
3619 | | * reserved high values. |
3620 | | */ |
3621 | 0 | result = ++(btvacinfo->cycle_ctr); |
3622 | 0 | if (result == 0 || result > MAX_BT_CYCLE_ID) |
3623 | 0 | result = btvacinfo->cycle_ctr = 1; |
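
	/*
	 * Editorial note (not part of nbtutils.c): e.g. with MAX_BT_CYCLE_ID
	 * defined as 0x3FFF (its current value), the counter cycles through
	 * 1, 2, ..., 0x3FFF, 1, ...; zero is skipped because it means "no
	 * active VACUUM" wherever a cycle ID is stored in a page.
	 */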
3624 | | |
3625 | | /* Let's just make sure there's no entry already for this index */ |
3626 | 0 | for (i = 0; i < btvacinfo->num_vacuums; i++) |
3627 | 0 | { |
3628 | 0 | vac = &btvacinfo->vacuums[i]; |
3629 | 0 | if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId && |
3630 | 0 | vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId) |
3631 | 0 | { |
3632 | | /* |
3633 | | * Unlike most places in the backend, we have to explicitly |
3634 | | * release our LWLock before throwing an error. This is because |
3635 | | * we expect _bt_end_vacuum() to be called before transaction |
3636 | | * abort cleanup can run to release LWLocks. |
3637 | | */ |
3638 | 0 | LWLockRelease(BtreeVacuumLock); |
3639 | 0 | elog(ERROR, "multiple active vacuums for index \"%s\"", |
3640 | 0 | RelationGetRelationName(rel)); |
3641 | 0 | } |
3642 | 0 | } |
3643 | | |
3644 | | /* OK, add an entry */ |
3645 | 0 | if (btvacinfo->num_vacuums >= btvacinfo->max_vacuums) |
3646 | 0 | { |
3647 | 0 | LWLockRelease(BtreeVacuumLock); |
3648 | 0 | elog(ERROR, "out of btvacinfo slots"); |
3649 | 0 | } |
3650 | 0 | vac = &btvacinfo->vacuums[btvacinfo->num_vacuums]; |
3651 | 0 | vac->relid = rel->rd_lockInfo.lockRelId; |
3652 | 0 | vac->cycleid = result; |
3653 | 0 | btvacinfo->num_vacuums++; |
3654 | |
3655 | 0 | LWLockRelease(BtreeVacuumLock); |
3656 | 0 | return result; |
3657 | 0 | } |
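
/*
 * Editorial sketch (not part of nbtutils.c): the caller-side pattern that
 * _bt_start_vacuum's header comment prescribes, written out as a function.
 * This is a rough illustration of how nbtree's bulk-delete path is expected
 * to pair the calls (PG_ENSURE_ERROR_CLEANUP comes from storage/ipc.h); the
 * function name and the scan placeholder are illustrative only.
 */
static void
example_vacuum_with_cleanup(Relation rel)
{
	BTCycleId	cycleid;

	PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	{
		cycleid = _bt_start_vacuum(rel);
		(void) cycleid;			/* ... scan the index, stamping cycleid ... */
	}
	PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
	_bt_end_vacuum(rel);
}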
3658 | | |
3659 | | /* |
3660 | | * _bt_end_vacuum --- mark a btree VACUUM operation as done |
3661 | | * |
3662 | | * Note: this is deliberately coded not to complain if no entry is found; |
3663 | | * this allows the caller to put PG_TRY around the start_vacuum operation. |
3664 | | */ |
3665 | | void |
3666 | | _bt_end_vacuum(Relation rel) |
3667 | 0 | { |
3668 | 0 | int i; |
3669 | |
3670 | 0 | LWLockAcquire(BtreeVacuumLock, LW_EXCLUSIVE); |
3671 | | |
3672 | | /* Find the array entry */ |
3673 | 0 | for (i = 0; i < btvacinfo->num_vacuums; i++) |
3674 | 0 | { |
3675 | 0 | BTOneVacInfo *vac = &btvacinfo->vacuums[i]; |
3676 | |
3677 | 0 | if (vac->relid.relId == rel->rd_lockInfo.lockRelId.relId && |
3678 | 0 | vac->relid.dbId == rel->rd_lockInfo.lockRelId.dbId) |
3679 | 0 | { |
3680 | | /* Remove it by shifting down the last entry */ |
3681 | 0 | *vac = btvacinfo->vacuums[btvacinfo->num_vacuums - 1]; |
3682 | 0 | btvacinfo->num_vacuums--; |
3683 | 0 | break; |
3684 | 0 | } |
3685 | 0 | } |
3686 | |
3687 | 0 | LWLockRelease(BtreeVacuumLock); |
3688 | 0 | } |
3689 | | |
3690 | | /* |
3691 | | * _bt_end_vacuum wrapped as an on_shmem_exit callback function |
3692 | | */ |
3693 | | void |
3694 | | _bt_end_vacuum_callback(int code, Datum arg) |
3695 | 0 | { |
3696 | 0 | _bt_end_vacuum((Relation) DatumGetPointer(arg)); |
3697 | 0 | } |
3698 | | |
3699 | | /* |
3700 | | * BTreeShmemSize --- report amount of shared memory space needed |
3701 | | */ |
3702 | | Size |
3703 | | BTreeShmemSize(void) |
3704 | 0 | { |
3705 | 0 | Size size; |
3706 | |
3707 | 0 | size = offsetof(BTVacInfo, vacuums); |
3708 | 0 | size = add_size(size, mul_size(MaxBackends, sizeof(BTOneVacInfo))); |
3709 | 0 | return size; |
3710 | 0 | } |
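
/*
 * Editorial sketch (not part of nbtutils.c): the flexible-array-member
 * sizing used above, as a pure function over an arbitrary backend count.
 * offsetof(BTVacInfo, vacuums) is the header size; one BTOneVacInfo slot is
 * reserved per backend, since each backend runs at most one btree VACUUM at
 * a time.  add_size/mul_size are PostgreSQL's overflow-checked Size
 * arithmetic.  The function name is illustrative only.
 */
static inline Size
example_btvacinfo_size(int nbackends)
{
	return add_size(offsetof(BTVacInfo, vacuums),
					mul_size(nbackends, sizeof(BTOneVacInfo)));
}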
3711 | | |
3712 | | /* |
3713 | | * BTreeShmemInit --- initialize this module's shared memory |
3714 | | */ |
3715 | | void |
3716 | | BTreeShmemInit(void) |
3717 | 0 | { |
3718 | 0 | bool found; |
3719 | |
3720 | 0 | btvacinfo = (BTVacInfo *) ShmemInitStruct("BTree Vacuum State", |
3721 | 0 | BTreeShmemSize(), |
3722 | 0 | &found); |
3723 | |
3724 | 0 | if (!IsUnderPostmaster) |
3725 | 0 | { |
3726 | | /* Initialize shared memory area */ |
3727 | 0 | Assert(!found); |
3728 | | |
3729 | | /* |
3730 | | * It doesn't really matter what the cycle counter starts at, but |
3731 | | * having it always start the same doesn't seem good. Seed with |
3732 | | * low-order bits of time() instead. |
3733 | | */ |
3734 | 0 | btvacinfo->cycle_ctr = (BTCycleId) time(NULL); |
3735 | |
3736 | 0 | btvacinfo->num_vacuums = 0; |
3737 | 0 | btvacinfo->max_vacuums = MaxBackends; |
3738 | 0 | } |
3739 | 0 | else |
3740 | 0 | Assert(found); |
3741 | 0 | } |
3742 | | |
3743 | | bytea * |
3744 | | btoptions(Datum reloptions, bool validate) |
3745 | 0 | { |
3746 | 0 | static const relopt_parse_elt tab[] = { |
3747 | 0 | {"fillfactor", RELOPT_TYPE_INT, offsetof(BTOptions, fillfactor)}, |
3748 | 0 | {"vacuum_cleanup_index_scale_factor", RELOPT_TYPE_REAL, |
3749 | 0 | offsetof(BTOptions, vacuum_cleanup_index_scale_factor)}, |
3750 | 0 | {"deduplicate_items", RELOPT_TYPE_BOOL, |
3751 | 0 | offsetof(BTOptions, deduplicate_items)} |
3752 | 0 | }; |
3753 | |
3754 | 0 | return (bytea *) build_reloptions(reloptions, validate, |
3755 | 0 | RELOPT_KIND_BTREE, |
3756 | 0 | sizeof(BTOptions), |
3757 | 0 | tab, lengthof(tab)); |
3758 | 0 | } |
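
/*
 * Editorial note (not part of nbtutils.c): these are the btree storage
 * parameters accepted by CREATE INDEX/ALTER INDEX, e.g. (SQL shown purely
 * for illustration):
 *
 *		CREATE INDEX idx ON tab (col)
 *			WITH (fillfactor = 90, deduplicate_items = off);
 */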
3759 | | |
3760 | | /* |
3761 | | * btproperty() -- Check boolean properties of indexes. |
3762 | | * |
3763 | | * This is optional, but handling AMPROP_RETURNABLE here saves opening the rel |
3764 | | * to call btcanreturn. |
3765 | | */ |
3766 | | bool |
3767 | | btproperty(Oid index_oid, int attno, |
3768 | | IndexAMProperty prop, const char *propname, |
3769 | | bool *res, bool *isnull) |
3770 | 0 | { |
3771 | 0 | switch (prop) |
3772 | 0 | { |
3773 | 0 | case AMPROP_RETURNABLE: |
3774 | | /* answer only for columns, not AM or whole index */ |
3775 | 0 | if (attno == 0) |
3776 | 0 | return false; |
3777 | | /* otherwise, btree can always return data */ |
3778 | 0 | *res = true; |
3779 | 0 | return true; |
3780 | | |
3781 | 0 | default: |
3782 | 0 | return false; /* punt to generic code */ |
3783 | 0 | } |
3784 | 0 | } |
3785 | | |
3786 | | /* |
3787 | | * btbuildphasename() -- Return name of index build phase. |
3788 | | */ |
3789 | | char * |
3790 | | btbuildphasename(int64 phasenum) |
3791 | 0 | { |
3792 | 0 | switch (phasenum) |
3793 | 0 | { |
3794 | 0 | case PROGRESS_CREATEIDX_SUBPHASE_INITIALIZE: |
3795 | 0 | return "initializing"; |
3796 | 0 | case PROGRESS_BTREE_PHASE_INDEXBUILD_TABLESCAN: |
3797 | 0 | return "scanning table"; |
3798 | 0 | case PROGRESS_BTREE_PHASE_PERFORMSORT_1: |
3799 | 0 | return "sorting live tuples"; |
3800 | 0 | case PROGRESS_BTREE_PHASE_PERFORMSORT_2: |
3801 | 0 | return "sorting dead tuples"; |
3802 | 0 | case PROGRESS_BTREE_PHASE_LEAF_LOAD: |
3803 | 0 | return "loading tuples in tree"; |
3804 | 0 | default: |
3805 | 0 | return NULL; |
3806 | 0 | } |
3807 | 0 | } |
3808 | | |
3809 | | /* |
3810 | | * _bt_truncate() -- create tuple without unneeded suffix attributes. |
3811 | | * |
3812 | | * Returns truncated pivot index tuple allocated in caller's memory context, |
3813 | | * with key attributes copied from caller's firstright argument. If rel is |
3814 | | * an INCLUDE index, non-key attributes will definitely be truncated away, |
3815 | | * since they're not part of the key space. More aggressive suffix |
3816 | | * truncation can take place when it's clear that the returned tuple does not |
3817 | | * need one or more suffix key attributes. We only need to keep firstright |
3818 | | * attributes up to and including the first non-lastleft-equal attribute. |
3819 | | * Caller's insertion scankey is used to compare the tuples; the scankey's |
3820 | | * argument values are not considered here. |
3821 | | * |
3822 | | * Note that returned tuple's t_tid offset will hold the number of attributes |
3823 | | * present, so the original item pointer offset is not represented. Caller |
3824 | | * should only change truncated tuple's downlink. Note also that truncated |
3825 | | * key attributes are treated as containing "minus infinity" values by |
3826 | | * _bt_compare(). |
3827 | | * |
3828 | | * In the worst case (when a heap TID must be appended to distinguish lastleft |
3829 | | * from firstright), the size of the returned tuple is the size of firstright |
3830 | | * plus the size of an additional MAXALIGN()'d item pointer. This guarantee |
3831 | | * is important, since callers need to stay under the 1/3 of a page |
3832 | | * restriction on tuple size. If this routine is ever taught to truncate |
3833 | | * within an attribute/datum, it will need to avoid returning an enlarged |
3834 | | * tuple to caller when truncation + TOAST compression ends up enlarging the |
3835 | | * final datum. |
3836 | | */ |
3837 | | IndexTuple |
3838 | | _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, |
3839 | | BTScanInsert itup_key) |
3840 | 0 | { |
3841 | 0 | TupleDesc itupdesc = RelationGetDescr(rel); |
3842 | 0 | int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); |
3843 | 0 | int keepnatts; |
3844 | 0 | IndexTuple pivot; |
3845 | 0 | IndexTuple tidpivot; |
3846 | 0 | ItemPointer pivotheaptid; |
3847 | 0 | Size newsize; |
3848 | | |
3849 | | /* |
3850 | | * We should only ever truncate non-pivot tuples from leaf pages. It's |
3851 | | * never okay to truncate when splitting an internal page. |
3852 | | */ |
3853 | 0 | Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); |
3854 | | |
3855 | | /* Determine how many attributes must be kept in truncated tuple */ |
3856 | 0 | keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); |
3857 | |
3858 | | #ifdef DEBUG_NO_TRUNCATE |
3859 | | /* Force truncation to be ineffective for testing purposes */ |
3860 | | keepnatts = nkeyatts + 1; |
3861 | | #endif |
3862 | |
3863 | 0 | pivot = index_truncate_tuple(itupdesc, firstright, |
3864 | 0 | Min(keepnatts, nkeyatts)); |
3865 | |
3866 | 0 | if (BTreeTupleIsPosting(pivot)) |
3867 | 0 | { |
3868 | | /* |
3869 | | * index_truncate_tuple() just returns a straight copy of firstright |
3870 | | * when it has no attributes to truncate. When that happens, we may |
3871 | | * need to truncate away a posting list here instead. |
3872 | | */ |
3873 | 0 | Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1); |
3874 | 0 | Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts); |
3875 | 0 | pivot->t_info &= ~INDEX_SIZE_MASK; |
3876 | 0 | pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright)); |
3877 | 0 | } |
3878 | | |
3879 | | /* |
3880 | | * If there is a distinguishing key attribute within pivot tuple, we're |
3881 | | * done |
3882 | | */ |
3883 | 0 | if (keepnatts <= nkeyatts) |
3884 | 0 | { |
3885 | 0 | BTreeTupleSetNAtts(pivot, keepnatts, false); |
3886 | 0 | return pivot; |
3887 | 0 | } |
3888 | | |
3889 | | /* |
3890 | | * We have to store a heap TID in the new pivot tuple, since no non-TID |
3891 | | * key attribute value in firstright distinguishes the right side of the |
3892 | | * split from the left side. nbtree conceptualizes this case as an |
3893 | | * inability to truncate away any key attributes, since heap TID is |
3894 | | * treated as just another key attribute (despite lacking a pg_attribute |
3895 | | * entry). |
3896 | | * |
3897 | | * Use enlarged space that holds a copy of pivot. We need the extra space |
3898 | | * to store a heap TID at the end (using the special pivot tuple |
3899 | | * representation). Note that the original pivot already has firstright's |
3900 | | * possible posting list/non-key attribute values removed at this point. |
3901 | | */ |
3902 | 0 | newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData)); |
3903 | 0 | tidpivot = palloc0(newsize); |
3904 | 0 | memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot))); |
3905 | | /* Cannot leak memory here */ |
3906 | 0 | pfree(pivot); |
3907 | | |
3908 | | /* |
3909 | | * Store all of firstright's key attribute values plus a tiebreaker heap |
3910 | | * TID value in enlarged pivot tuple |
3911 | | */ |
3912 | 0 | tidpivot->t_info &= ~INDEX_SIZE_MASK; |
3913 | 0 | tidpivot->t_info |= newsize; |
3914 | 0 | BTreeTupleSetNAtts(tidpivot, nkeyatts, true); |
3915 | 0 | pivotheaptid = BTreeTupleGetHeapTID(tidpivot); |
3916 | | |
3917 | | /* |
3918 | | * Lehman & Yao use lastleft as the leaf high key in all cases, but don't |
3919 | | * consider suffix truncation. It seems like a good idea to follow that |
3920 | | * example in cases where no truncation takes place -- use lastleft's heap |
3921 | | * TID. (This is also the closest value to negative infinity that's |
3922 | | * legally usable.) |
3923 | | */ |
3924 | 0 | ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid); |
3925 | | |
3926 | | /* |
3927 | | * We're done. Assert() that heap TID invariants hold before returning. |
3928 | | * |
3929 | | * Lehman and Yao require that the downlink to the right page, which is to |
3930 | | * be inserted into the parent page in the second phase of a page split, be |
3931 | | * a strict lower bound on items on the right page, and a non-strict upper |
3932 | | * bound for items on the left page. Assert that heap TIDs follow these |
3933 | | * invariants, since a heap TID value is apparently needed as a |
3934 | | * tiebreaker. |
3935 | | */ |
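 | | /*
 | |  * Concretely, with P = the pivot's heap TID (just copied from lastleft's
 | |  * maximum heap TID) and R = firstright's heap TID, the assertions below
 | |  * check that lastleft's maximum heap TID < R, that P >= lastleft's first
 | |  * heap TID, and that P < R.
 | |  */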
3936 | 0 | #ifndef DEBUG_NO_TRUNCATE |
3937 | 0 | Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft), |
3938 | 0 | BTreeTupleGetHeapTID(firstright)) < 0); |
3939 | 0 | Assert(ItemPointerCompare(pivotheaptid, |
3940 | 0 | BTreeTupleGetHeapTID(lastleft)) >= 0); |
3941 | 0 | Assert(ItemPointerCompare(pivotheaptid, |
3942 | 0 | BTreeTupleGetHeapTID(firstright)) < 0); |
3943 | | #else |
3944 | | |
3945 | | /* |
3946 | | * Those invariants aren't guaranteed to hold for lastleft + firstright |
3947 | | * heap TID attribute values when they're considered here only because |
3948 | | * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually |
3949 | | * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap |
3950 | | * TID value that always works as a strict lower bound for items to the |
3951 | | * right. In particular, it must avoid using firstright's leading key |
3952 | | * attribute values along with lastleft's heap TID value when lastleft's |
3953 | | * TID happens to be greater than firstright's TID. |
3954 | | */ |
3955 | | ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid); |
3956 | | |
3957 | | /* |
3958 | | * Pivot heap TID should never be fully equal to firstright. Note that |
3959 | | * the pivot heap TID will still end up equal to lastleft's heap TID when |
3960 | | * that's the only usable value. |
3961 | | */ |
3962 | | ItemPointerSetOffsetNumber(pivotheaptid, |
3963 | | OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid))); |
3964 | | Assert(ItemPointerCompare(pivotheaptid, |
3965 | | BTreeTupleGetHeapTID(firstright)) < 0); |
3966 | | #endif |
3967 | | 
3968 | 0 | return tidpivot; |
3969 | 0 | } |
3970 | | |
3971 | | /* |
3972 | | * _bt_keep_natts - how many key attributes to keep when truncating. |
3973 | | * |
3974 | | * Caller provides two tuples that enclose a split point. Caller's insertion |
3975 | | * scankey is used to compare the tuples; the scankey's argument values are |
3976 | | * not considered here. |
3977 | | * |
3978 | | * This can return a number of attributes that is one greater than the |
3979 | | * number of key attributes for the index relation. This indicates that the |
3980 | | * caller must use a heap TID as a unique-ifier in the new pivot tuple. |
3981 | | */ |
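 | | /*
 | |  * For example (hypothetical two-column index): comparing (7, 'a') against
 | |  * (7, 'b') returns 2, since attribute 2 is the first attribute that
 | |  * distinguishes the tuples; comparing (7, 'a') against (8, 'x') returns 1;
 | |  * comparing two fully equal tuples returns nkeyatts + 1 = 3, which tells
 | |  * caller that a heap TID tiebreaker is required.
 | |  */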
3982 | | static int |
3983 | | _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, |
3984 | | BTScanInsert itup_key) |
3985 | 0 | { |
3986 | 0 | int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); |
3987 | 0 | TupleDesc itupdesc = RelationGetDescr(rel); |
3988 | 0 | int keepnatts; |
3989 | 0 | ScanKey scankey; |
3990 | | |
3991 | | /* |
3992 | | * _bt_compare() treats truncated key attributes as having the value minus |
3993 | | * infinity, which would break searches within !heapkeyspace indexes. We |
3994 | | * must still truncate away non-key attribute values, though. |
3995 | | */ |
3996 | 0 | if (!itup_key->heapkeyspace) |
3997 | 0 | return nkeyatts; |
3998 | | |
3999 | 0 | scankey = itup_key->scankeys; |
4000 | 0 | keepnatts = 1; |
4001 | 0 | for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++) |
4002 | 0 | { |
4003 | 0 | Datum datum1, |
4004 | 0 | datum2; |
4005 | 0 | bool isNull1, |
4006 | 0 | isNull2; |
4007 | | 
4008 | 0 | datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); |
4009 | 0 | datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); |
4010 | | 
4011 | 0 | if (isNull1 != isNull2) |
4012 | 0 | break; |
4013 | | |
4014 | 0 | if (!isNull1 && |
4015 | 0 | DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, |
4016 | 0 | scankey->sk_collation, |
4017 | 0 | datum1, |
4018 | 0 | datum2)) != 0) |
4019 | 0 | break; |
4020 | | |
4021 | 0 | keepnatts++; |
4022 | 0 | } |
4023 | | |
4024 | | /* |
4025 | | * Assert that _bt_keep_natts_fast() agrees with us in passing. This is |
4026 | | * expected in an allequalimage index. |
4027 | | */ |
4028 | 0 | Assert(!itup_key->allequalimage || |
4029 | 0 | keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); |
4030 | | 
4031 | 0 | return keepnatts; |
4032 | 0 | } |
4033 | | |
4034 | | /* |
4035 | | * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts. |
4036 | | * |
4037 | | * This is exported so that a candidate split point can have its effect on |
4038 | | * suffix truncation inexpensively evaluated ahead of time when finding a |
4039 | | * split location. A naive bitwise approach to datum comparisons is used to |
4040 | | * save cycles. |
4041 | | * |
4042 | | * The approach taken here usually provides the same answer as _bt_keep_natts |
4043 | | * will (for the same pair of tuples from a heapkeyspace index), since the |
4044 | | * majority of btree opclasses can never indicate that two datums are equal |
4045 | | * unless they're bitwise equal after detoasting. When an index only has |
4046 | | * "equal image" columns, the routine is guaranteed to give the same result |
4047 | | * as _bt_keep_natts would. |
4048 | | * |
4049 | | * Callers can rely on the fact that attributes considered equal here are |
4050 | | * definitely also equal according to _bt_keep_natts, even when the index uses |
4051 | | * an opclass or collation that is not "allequalimage"/deduplication-safe. |
4052 | | * This weaker guarantee is good enough for the nbtsplitloc.c caller, since |
4053 | | * false negatives generally only have the effect of making leaf page splits |
4054 | | * use a more balanced split point. |
4055 | | */ |
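 | | /*
 | |  * numeric is the classic source of false negatives here: '5.0' and '5.00'
 | |  * compare as equal in the numeric btree opclass, yet their datums differ
 | |  * bitwise because display scale is stored. This routine therefore treats
 | |  * such attributes as distinguishing ones where _bt_keep_natts would not,
 | |  * which merely costs some truncation; it never yields an invalid pivot.
 | |  */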
4056 | | int |
4057 | | _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) |
4058 | 0 | { |
4059 | 0 | TupleDesc itupdesc = RelationGetDescr(rel); |
4060 | 0 | int keysz = IndexRelationGetNumberOfKeyAttributes(rel); |
4061 | 0 | int keepnatts; |
4062 | | 
4063 | 0 | keepnatts = 1; |
4064 | 0 | for (int attnum = 1; attnum <= keysz; attnum++) |
4065 | 0 | { |
4066 | 0 | Datum datum1, |
4067 | 0 | datum2; |
4068 | 0 | bool isNull1, |
4069 | 0 | isNull2; |
4070 | 0 | CompactAttribute *att; |
4071 | | 
4072 | 0 | datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); |
4073 | 0 | datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); |
4074 | 0 | att = TupleDescCompactAttr(itupdesc, attnum - 1); |
4075 | | 
4076 | 0 | if (isNull1 != isNull2) |
4077 | 0 | break; |
4078 | | |
4079 | 0 | if (!isNull1 && |
4080 | 0 | !datum_image_eq(datum1, datum2, att->attbyval, att->attlen)) |
4081 | 0 | break; |
4082 | | |
4083 | 0 | keepnatts++; |
4084 | 0 | } |
4085 | | 
4086 | 0 | return keepnatts; |
4087 | 0 | } |
4088 | | |
4089 | | /* |
4090 | | * _bt_check_natts() -- Verify tuple has expected number of attributes. |
4091 | | * |
4092 | | * Returns a value indicating whether the expected number of attributes was |
4093 | | * found for a particular offset on the page. This can be used as a |
4094 | | * general-purpose sanity check. |
4095 | | * |
4096 | | * Testing a tuple directly with BTreeTupleGetNAtts() should generally be |
4097 | | * preferred to calling here. That's usually more convenient, and is always |
4098 | | * more explicit. Call here instead when offnum's tuple may be a negative |
4099 | | * infinity tuple that uses the pre-v11 on-disk representation, or when a |
4100 | | * low-context check is appropriate. This routine is as strict as possible about |
4101 | | * what is expected on each version of btree. |
4102 | | */ |
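 | | /*
 | |  * Caller-side sketch (assuming heapkeyspace was already determined from
 | |  * the metapage) that sanity checks every tuple on a page:
 | |  *
 | |  *     for (OffsetNumber off = FirstOffsetNumber;
 | |  *          off <= PageGetMaxOffsetNumber(page);
 | |  *          off = OffsetNumberNext(off))
 | |  *     {
 | |  *         if (!_bt_check_natts(rel, heapkeyspace, page, off))
 | |  *             elog(ERROR, "tuple has wrong number of attributes");
 | |  *     }
 | |  */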
4103 | | bool |
4104 | | _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum) |
4105 | 0 | { |
4106 | 0 | int16 natts = IndexRelationGetNumberOfAttributes(rel); |
4107 | 0 | int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); |
4108 | 0 | BTPageOpaque opaque = BTPageGetOpaque(page); |
4109 | 0 | IndexTuple itup; |
4110 | 0 | int tupnatts; |
4111 | | |
4112 | | /* |
4113 | | * We cannot reliably test a deleted or half-dead page, since they have |
4114 | | * dummy high keys |
4115 | | */ |
4116 | 0 | if (P_IGNORE(opaque)) |
4117 | 0 | return true; |
4118 | | |
4119 | 0 | Assert(offnum >= FirstOffsetNumber && |
4120 | 0 | offnum <= PageGetMaxOffsetNumber(page)); |
4121 | | 
4122 | 0 | itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); |
4123 | 0 | tupnatts = BTreeTupleGetNAtts(itup, rel); |
4124 | | |
4125 | | /* !heapkeyspace indexes do not support deduplication */ |
4126 | 0 | if (!heapkeyspace && BTreeTupleIsPosting(itup)) |
4127 | 0 | return false; |
4128 | | |
4129 | | /* Posting list tuples should never have "pivot heap TID" bit set */ |
4130 | 0 | if (BTreeTupleIsPosting(itup) && |
4131 | 0 | (ItemPointerGetOffsetNumberNoCheck(&itup->t_tid) & |
4132 | 0 | BT_PIVOT_HEAP_TID_ATTR) != 0) |
4133 | 0 | return false; |
4134 | | |
4135 | | /* INCLUDE indexes do not support deduplication */ |
4136 | 0 | if (natts != nkeyatts && BTreeTupleIsPosting(itup)) |
4137 | 0 | return false; |
4138 | | |
4139 | 0 | if (P_ISLEAF(opaque)) |
4140 | 0 | { |
4141 | 0 | if (offnum >= P_FIRSTDATAKEY(opaque)) |
4142 | 0 | { |
4143 | | /* |
4144 | | * Non-pivot tuple should never be explicitly marked as a pivot |
4145 | | * tuple |
4146 | | */ |
4147 | 0 | if (BTreeTupleIsPivot(itup)) |
4148 | 0 | return false; |
4149 | | |
4150 | | /* |
4151 | | * Leaf tuples that are not the page high key (non-pivot tuples) |
4152 | | * should never be truncated. (Note that tupnatts must have been |
4153 | | * inferred, even with a posting list tuple, because only pivot |
4154 | | * tuples store tupnatts directly.) |
4155 | | */ |
4156 | 0 | return tupnatts == natts; |
4157 | 0 | } |
4158 | 0 | else |
4159 | 0 | { |
4160 | | /* |
4161 | | * Rightmost page doesn't contain a page high key, so tuple was |
4162 | | * checked above as ordinary leaf tuple |
4163 | | */ |
4164 | 0 | Assert(!P_RIGHTMOST(opaque)); |
4165 | | |
4166 | | /* |
4167 | | * !heapkeyspace high key tuple contains only key attributes. Note |
4168 | | * that tupnatts will only have been explicitly represented in |
4169 | | * !heapkeyspace indexes that happen to have non-key attributes. |
4170 | | */ |
4171 | 0 | if (!heapkeyspace) |
4172 | 0 | return tupnatts == nkeyatts; |
4173 | | |
4174 | | /* Use generic heapkeyspace pivot tuple handling */ |
4175 | 0 | } |
4176 | 0 | } |
4177 | 0 | else /* !P_ISLEAF(opaque) */ |
4178 | 0 | { |
4179 | 0 | if (offnum == P_FIRSTDATAKEY(opaque)) |
4180 | 0 | { |
4181 | | /* |
4182 | | * The first tuple on any internal page (possibly the first after |
4183 | | * its high key) is its negative infinity tuple. Negative |
4184 | | * infinity tuples are always truncated to zero attributes. They |
4185 | | * are a particular kind of pivot tuple. |
4186 | | */ |
4187 | 0 | if (heapkeyspace) |
4188 | 0 | return tupnatts == 0; |
4189 | | |
4190 | | /* |
4191 | | * The number of attributes won't be explicitly represented if the |
4192 | | * negative infinity tuple was generated during a page split that |
4193 | | * occurred with a version of Postgres before v11. There must be |
4194 | | * a problem when there is an explicit representation that is |
4195 | | * non-zero, or when there is no explicit representation and the |
4196 | | * tuple is evidently not a pre-pg_upgrade tuple. |
4197 | | * |
4198 | | * Prior to v11, downlinks always had P_HIKEY as their offset. |
4199 | | * Accept that as an alternative indication of a valid |
4200 | | * !heapkeyspace negative infinity tuple. |
4201 | | */ |
4202 | 0 | return tupnatts == 0 || |
4203 | 0 | ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY; |
4204 | 0 | } |
4205 | 0 | else |
4206 | 0 | { |
4207 | | /* |
4208 | | * !heapkeyspace downlink tuple with separator key contains only |
4209 | | * key attributes. Note that tupnatts will only have been |
4210 | | * explicitly represented in !heapkeyspace indexes that happen to |
4211 | | * have non-key attributes. |
4212 | | */ |
4213 | 0 | if (!heapkeyspace) |
4214 | 0 | return tupnatts == nkeyatts; |
4215 | | |
4216 | | /* Use generic heapkeyspace pivot tuple handling */ |
4217 | 0 | } |
4218 | 0 | } |
4219 | | |
4220 | | /* Handle heapkeyspace pivot tuples (excluding minus infinity items) */ |
4221 | 0 | Assert(heapkeyspace); |
4222 | | |
4223 | | /* |
4224 | | * Explicit representation of the number of attributes is mandatory with |
4225 | | * heapkeyspace index pivot tuples, regardless of whether or not there are |
4226 | | * non-key attributes. |
4227 | | */ |
4228 | 0 | if (!BTreeTupleIsPivot(itup)) |
4229 | 0 | return false; |
4230 | | |
4231 | | /* Pivot tuple should not use posting list representation (redundant) */ |
4232 | 0 | if (BTreeTupleIsPosting(itup)) |
4233 | 0 | return false; |
4234 | | |
4235 | | /* |
4236 | | * Heap TID is a tiebreaker key attribute, so it cannot be untruncated |
4237 | | * when any other key attribute is truncated |
4238 | | */ |
4239 | 0 | if (BTreeTupleGetHeapTID(itup) != NULL && tupnatts != nkeyatts) |
4240 | 0 | return false; |
4241 | | |
4242 | | /* |
4243 | | * Pivot tuple must have at least one untruncated key attribute (minus |
4244 | | * infinity pivot tuples are the only exception). Pivot tuples can never |
4245 | | * represent that there is a value present for a key attribute that |
4246 | | * exceeds pg_index.indnkeyatts for the index. |
4247 | | */ |
4248 | 0 | return tupnatts > 0 && tupnatts <= nkeyatts; |
4249 | 0 | } |
4250 | | |
4251 | | /* |
4252 | | * |
4253 | | * _bt_check_third_page() -- check whether tuple fits on a btree page at all. |
4254 | | * |
4255 | | * We actually need to be able to fit three items on every page, so restrict |
4256 | | * any one item to 1/3 the per-page available space. Note that itemsz should |
4257 | | * not include the ItemId overhead. |
4258 | | * |
4259 | | * It might be useful to apply TOAST methods rather than throw an error here. |
4260 | | * Using out of line storage would break assumptions made by suffix truncation |
4261 | | * and by contrib/amcheck, though. |
4262 | | */ |
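 | | /*
 | |  * With the default 8kB block size, these limits work out to 2704 bytes
 | |  * for version 4 indexes (BTMaxItemSize reserves room for the heap TID
 | |  * tiebreaker) and 2712 bytes for version 2 and 3 indexes
 | |  * (BTMaxItemSizeNoHeapTid).
 | |  */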
4263 | | void |
4264 | | _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, |
4265 | | Page page, IndexTuple newtup) |
4266 | 0 | { |
4267 | 0 | Size itemsz; |
4268 | 0 | BTPageOpaque opaque; |
4269 | |
|
4270 | 0 | itemsz = MAXALIGN(IndexTupleSize(newtup)); |
4271 | | |
4272 | | /* Double check item size against limit */ |
4273 | 0 | if (itemsz <= BTMaxItemSize) |
4274 | 0 | return; |
4275 | | |
4276 | | /* |
4277 | | * Tuple is probably too large to fit on page, but it's possible that the |
4278 | | * index uses version 2 or version 3, or that page is an internal page, in |
4279 | | * which case a slightly higher limit applies. |
4280 | | */ |
4281 | 0 | if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid) |
4282 | 0 | return; |
4283 | | |
4284 | | /* |
4285 | | * Internal page insertions cannot fail here, because that would mean that |
4286 | | * an earlier leaf level insertion that should have failed didn't |
4287 | | */ |
4288 | 0 | opaque = BTPageGetOpaque(page); |
4289 | 0 | if (!P_ISLEAF(opaque)) |
4290 | 0 | elog(ERROR, "cannot insert oversized tuple of size %zu on internal page of index \"%s\"", |
4291 | 0 | itemsz, RelationGetRelationName(rel)); |
4292 | | |
4293 | 0 | ereport(ERROR, |
4294 | 0 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
4295 | 0 | errmsg("index row size %zu exceeds btree version %u maximum %zu for index \"%s\"", |
4296 | 0 | itemsz, |
4297 | 0 | needheaptidspace ? BTREE_VERSION : BTREE_NOVAC_VERSION, |
4298 | 0 | needheaptidspace ? BTMaxItemSize : BTMaxItemSizeNoHeapTid, |
4299 | 0 | RelationGetRelationName(rel)), |
4300 | 0 | errdetail("Index row references tuple (%u,%u) in relation \"%s\".", |
4301 | 0 | ItemPointerGetBlockNumber(BTreeTupleGetHeapTID(newtup)), |
4302 | 0 | ItemPointerGetOffsetNumber(BTreeTupleGetHeapTID(newtup)), |
4303 | 0 | RelationGetRelationName(heap)), |
4304 | 0 | errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n" |
4305 | 0 | "Consider a function index of an MD5 hash of the value, " |
4306 | 0 | "or use full text indexing."), |
4307 | 0 | errtableconstraint(heap, RelationGetRelationName(rel)))); |
4308 | 0 | } |
4309 | | |
4310 | | /* |
4311 | | * Are all attributes in rel "equality is image equality" attributes? |
4312 | | * |
4313 | | * We use each attribute's BTEQUALIMAGE_PROC opclass procedure. If any |
4314 | | * opclass either lacks a BTEQUALIMAGE_PROC procedure or returns false, we |
4315 | | * return false; otherwise we return true. |
4316 | | * |
4317 | | * Returned boolean value is stored in index metapage during index builds. |
4318 | | * Deduplication can only be used when we return true. |
4319 | | */ |
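 | | /*
 | |  * For a type whose equality really is bitwise equality, the stock support
 | |  * function btequalimage() suffices as BTEQUALIMAGE_PROC; it returns true
 | |  * regardless of its opcintype argument (sketch of the real function):
 | |  *
 | |  *     Datum
 | |  *     btequalimage(PG_FUNCTION_ARGS)
 | |  *     {
 | |  *         PG_RETURN_BOOL(true);
 | |  *     }
 | |  *
 | |  * Collatable types need more care: text's btvarstrequalimage() returns
 | |  * true only when the collation is deterministic.
 | |  */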
4320 | | bool |
4321 | | _bt_allequalimage(Relation rel, bool debugmessage) |
4322 | 0 | { |
4323 | 0 | bool allequalimage = true; |
4324 | | |
4325 | | /* INCLUDE indexes can never support deduplication */ |
4326 | 0 | if (IndexRelationGetNumberOfAttributes(rel) != |
4327 | 0 | IndexRelationGetNumberOfKeyAttributes(rel)) |
4328 | 0 | return false; |
4329 | | |
4330 | 0 | for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(rel); i++) |
4331 | 0 | { |
4332 | 0 | Oid opfamily = rel->rd_opfamily[i]; |
4333 | 0 | Oid opcintype = rel->rd_opcintype[i]; |
4334 | 0 | Oid collation = rel->rd_indcollation[i]; |
4335 | 0 | Oid equalimageproc; |
4336 | | 
4337 | 0 | equalimageproc = get_opfamily_proc(opfamily, opcintype, opcintype, |
4338 | 0 | BTEQUALIMAGE_PROC); |
4339 | | |
4340 | | /* |
4341 | | * If there is no BTEQUALIMAGE_PROC then deduplication is assumed to |
4342 | | * be unsafe. Otherwise, actually call proc and see what it says. |
4343 | | */ |
4344 | 0 | if (!OidIsValid(equalimageproc) || |
4345 | 0 | !DatumGetBool(OidFunctionCall1Coll(equalimageproc, collation, |
4346 | 0 | ObjectIdGetDatum(opcintype)))) |
4347 | 0 | { |
4348 | 0 | allequalimage = false; |
4349 | 0 | break; |
4350 | 0 | } |
4351 | 0 | } |
4352 | | 
4353 | 0 | if (debugmessage) |
4354 | 0 | { |
4355 | 0 | if (allequalimage) |
4356 | 0 | elog(DEBUG1, "index \"%s\" can safely use deduplication", |
4357 | 0 | RelationGetRelationName(rel)); |
4358 | 0 | else |
4359 | 0 | elog(DEBUG1, "index \"%s\" cannot use deduplication", |
4360 | 0 | RelationGetRelationName(rel)); |
4361 | 0 | } |
4362 | | |
4363 | 0 | return allequalimage; |
4364 | 0 | } |