/src/postgres/src/backend/access/hash/hash.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * hash.c |
4 | | * Implementation of Margo Seltzer's Hashing package for postgres. |
5 | | * |
6 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/backend/access/hash/hash.c |
12 | | * |
13 | | * NOTES |
14 | | * This file contains only the public interface routines. |
15 | | * |
16 | | *------------------------------------------------------------------------- |
17 | | */ |
18 | | |
19 | | #include "postgres.h" |
20 | | |
21 | | #include "access/hash.h" |
22 | | #include "access/hash_xlog.h" |
23 | | #include "access/relscan.h" |
24 | | #include "access/stratnum.h" |
25 | | #include "access/tableam.h" |
26 | | #include "access/xloginsert.h" |
27 | | #include "commands/progress.h" |
28 | | #include "commands/vacuum.h" |
29 | | #include "miscadmin.h" |
30 | | #include "nodes/execnodes.h" |
31 | | #include "optimizer/plancat.h" |
32 | | #include "pgstat.h" |
33 | | #include "utils/fmgrprotos.h" |
34 | | #include "utils/index_selfuncs.h" |
35 | | #include "utils/rel.h" |
36 | | |
37 | | /* Working state for hashbuild and its callback */ |
38 | | typedef struct |
39 | | { |
40 | | HSpool *spool; /* NULL if not using spooling */ |
41 | | double indtuples; /* # tuples accepted into index */ |
42 | | Relation heapRel; /* heap relation descriptor */ |
43 | | } HashBuildState; |
44 | | |
45 | | static void hashbuildCallback(Relation index, |
46 | | ItemPointer tid, |
47 | | Datum *values, |
48 | | bool *isnull, |
49 | | bool tupleIsAlive, |
50 | | void *state); |
51 | | |
52 | | |
53 | | /* |
54 | | * Hash handler function: return IndexAmRoutine with access method parameters |
55 | | * and callbacks. |
56 | | */ |
57 | | Datum |
58 | | hashhandler(PG_FUNCTION_ARGS) |
59 | 0 | { |
60 | 0 | IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); |
61 | |
|
62 | 0 | amroutine->amstrategies = HTMaxStrategyNumber; |
63 | 0 | amroutine->amsupport = HASHNProcs; |
64 | 0 | amroutine->amoptsprocnum = HASHOPTIONS_PROC; |
65 | 0 | amroutine->amcanorder = false; |
66 | 0 | amroutine->amcanorderbyop = false; |
67 | 0 | amroutine->amcanhash = true; |
68 | 0 | amroutine->amconsistentequality = true; |
69 | 0 | amroutine->amconsistentordering = false; |
70 | 0 | amroutine->amcanbackward = true; |
71 | 0 | amroutine->amcanunique = false; |
72 | 0 | amroutine->amcanmulticol = false; |
73 | 0 | amroutine->amoptionalkey = false; |
74 | 0 | amroutine->amsearcharray = false; |
75 | 0 | amroutine->amsearchnulls = false; |
76 | 0 | amroutine->amstorage = false; |
77 | 0 | amroutine->amclusterable = false; |
78 | 0 | amroutine->ampredlocks = true; |
79 | 0 | amroutine->amcanparallel = false; |
80 | 0 | amroutine->amcanbuildparallel = false; |
81 | 0 | amroutine->amcaninclude = false; |
82 | 0 | amroutine->amusemaintenanceworkmem = false; |
83 | 0 | amroutine->amsummarizing = false; |
84 | 0 | amroutine->amparallelvacuumoptions = |
85 | 0 | VACUUM_OPTION_PARALLEL_BULKDEL; |
86 | 0 | amroutine->amkeytype = INT4OID; |
87 | |
|
88 | 0 | amroutine->ambuild = hashbuild; |
89 | 0 | amroutine->ambuildempty = hashbuildempty; |
90 | 0 | amroutine->aminsert = hashinsert; |
91 | 0 | amroutine->aminsertcleanup = NULL; |
92 | 0 | amroutine->ambulkdelete = hashbulkdelete; |
93 | 0 | amroutine->amvacuumcleanup = hashvacuumcleanup; |
94 | 0 | amroutine->amcanreturn = NULL; |
95 | 0 | amroutine->amcostestimate = hashcostestimate; |
96 | 0 | amroutine->amgettreeheight = NULL; |
97 | 0 | amroutine->amoptions = hashoptions; |
98 | 0 | amroutine->amproperty = NULL; |
99 | 0 | amroutine->ambuildphasename = NULL; |
100 | 0 | amroutine->amvalidate = hashvalidate; |
101 | 0 | amroutine->amadjustmembers = hashadjustmembers; |
102 | 0 | amroutine->ambeginscan = hashbeginscan; |
103 | 0 | amroutine->amrescan = hashrescan; |
104 | 0 | amroutine->amgettuple = hashgettuple; |
105 | 0 | amroutine->amgetbitmap = hashgetbitmap; |
106 | 0 | amroutine->amendscan = hashendscan; |
107 | 0 | amroutine->ammarkpos = NULL; |
108 | 0 | amroutine->amrestrpos = NULL; |
109 | 0 | amroutine->amestimateparallelscan = NULL; |
110 | 0 | amroutine->aminitparallelscan = NULL; |
111 | 0 | amroutine->amparallelrescan = NULL; |
112 | 0 | amroutine->amtranslatestrategy = hashtranslatestrategy; |
113 | 0 | amroutine->amtranslatecmptype = hashtranslatecmptype; |
114 | |
|
115 | 0 | PG_RETURN_POINTER(amroutine); |
116 | 0 | } |
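/*
 * (Editorial aside, not part of hash.c.)  Core code reaches the handler
 * above through the amhandler column of pg_am.  The sketch below is modeled
 * loosely on GetIndexAmRoutine() in src/backend/access/index/amapi.c and is
 * simplified; the function name and error text here are illustrative only.
 */
static IndexAmRoutine *
get_am_routine_sketch(Oid amhandler)
{
	Datum		datum = OidFunctionCall0(amhandler);	/* e.g. hashhandler() */
	IndexAmRoutine *routine = (IndexAmRoutine *) DatumGetPointer(datum);

	if (routine == NULL || !IsA(routine, IndexAmRoutine))
		elog(ERROR, "index AM handler %u did not return an IndexAmRoutine struct",
			 amhandler);

	return routine;
}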
117 | | |
118 | | /* |
119 | | * hashbuild() -- build a new hash index. |
120 | | */ |
121 | | IndexBuildResult * |
122 | | hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) |
123 | 0 | { |
124 | 0 | IndexBuildResult *result; |
125 | 0 | BlockNumber relpages; |
126 | 0 | double reltuples; |
127 | 0 | double allvisfrac; |
128 | 0 | uint32 num_buckets; |
129 | 0 | Size sort_threshold; |
130 | 0 | HashBuildState buildstate; |
131 | | |
132 | | /* |
133 | | * We expect to be called exactly once for any index relation. If that's |
134 | | * not the case, big trouble's what we have. |
135 | | */ |
136 | 0 | if (RelationGetNumberOfBlocks(index) != 0) |
137 | 0 | elog(ERROR, "index \"%s\" already contains data", |
138 | 0 | RelationGetRelationName(index)); |
139 | | |
140 | | /* Estimate the number of rows currently present in the table */ |
141 | 0 | estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac); |
142 | | |
143 | | /* Initialize the hash index metadata page and initial buckets */ |
144 | 0 | num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM); |
145 | | |
146 | | /* |
147 | | * If we just insert the tuples into the index in scan order, then |
148 | | * (assuming their hash codes are pretty random) there will be no locality |
149 | | * of access to the index, and if the index is bigger than available RAM |
150 | | * then we'll thrash horribly. To prevent that scenario, we can sort the |
151 | | * tuples by (expected) bucket number. However, such a sort is useless |
152 | | * overhead when the index does fit in RAM. We choose to sort if the |
153 | | * initial index size exceeds maintenance_work_mem, or the number of |
154 | | * buffers usable for the index, whichever is less. (Limiting by the |
155 | | * number of buffers should reduce thrashing between PG buffers and kernel |
156 | | * buffers, which seems useful even if no physical I/O results. Limiting |
157 | | * by maintenance_work_mem is useful to allow easy testing of the sort |
158 | | * code path, and may be useful to DBAs as an additional control knob.) |
159 | | * |
160 | | * NOTE: this test will need adjustment if a bucket is ever different from |
161 | | * one page. Also, "initial index size" accounting does not include the |
162 | | * metapage, nor the first bitmap page. |
163 | | */ |
164 | 0 | sort_threshold = (maintenance_work_mem * (Size) 1024) / BLCKSZ; |
165 | 0 | if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP) |
166 | 0 | sort_threshold = Min(sort_threshold, NBuffers); |
167 | 0 | else |
168 | 0 | sort_threshold = Min(sort_threshold, NLocBuffer); |
169 | |
|
170 | 0 | if (num_buckets >= sort_threshold) |
171 | 0 | buildstate.spool = _h_spoolinit(heap, index, num_buckets); |
172 | 0 | else |
173 | 0 | buildstate.spool = NULL; |
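	/*
	 * (Editorial note, not in the original source.)  Worked example with
	 * stock defaults: maintenance_work_mem = 64MB (65536 kB) and
	 * BLCKSZ = 8192 give 65536 * 1024 / 8192 = 8192 pages, then clamped to
	 * Min(8192, NBuffers = 16384) for a permanent index with the default
	 * 128MB of shared_buffers.  So under these settings the tuplesort
	 * spooling path is taken once _hash_init creates 8192 or more initial
	 * buckets.
	 */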
174 | | |
175 | | /* prepare to build the index */ |
176 | 0 | buildstate.indtuples = 0; |
177 | 0 | buildstate.heapRel = heap; |
178 | | |
179 | | /* do the heap scan */ |
180 | 0 | reltuples = table_index_build_scan(heap, index, indexInfo, true, true, |
181 | 0 | hashbuildCallback, |
182 | 0 | &buildstate, NULL); |
183 | 0 | pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL, |
184 | 0 | buildstate.indtuples); |
185 | |
|
186 | 0 | if (buildstate.spool) |
187 | 0 | { |
188 | | /* sort the tuples and insert them into the index */ |
189 | 0 | _h_indexbuild(buildstate.spool, buildstate.heapRel); |
190 | 0 | _h_spooldestroy(buildstate.spool); |
191 | 0 | } |
192 | | |
193 | | /* |
194 | | * Return statistics |
195 | | */ |
196 | 0 | result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); |
197 | |
|
198 | 0 | result->heap_tuples = reltuples; |
199 | 0 | result->index_tuples = buildstate.indtuples; |
200 | |
|
201 | 0 | return result; |
202 | 0 | } |
203 | | |
204 | | /* |
205 | | * hashbuildempty() -- build an empty hash index in the initialization fork |
206 | | */ |
207 | | void |
208 | | hashbuildempty(Relation index) |
209 | 0 | { |
210 | 0 | _hash_init(index, 0, INIT_FORKNUM); |
211 | 0 | } |
212 | | |
213 | | /* |
214 | | * Per-tuple callback for table_index_build_scan |
215 | | */ |
216 | | static void |
217 | | hashbuildCallback(Relation index, |
218 | | ItemPointer tid, |
219 | | Datum *values, |
220 | | bool *isnull, |
221 | | bool tupleIsAlive, |
222 | | void *state) |
223 | 0 | { |
224 | 0 | HashBuildState *buildstate = (HashBuildState *) state; |
225 | 0 | Datum index_values[1]; |
226 | 0 | bool index_isnull[1]; |
227 | 0 | IndexTuple itup; |
228 | | |
229 | | /* convert data to a hash key; on failure, do not insert anything */ |
230 | 0 | if (!_hash_convert_tuple(index, |
231 | 0 | values, isnull, |
232 | 0 | index_values, index_isnull)) |
233 | 0 | return; |
234 | | |
235 | | /* Either spool the tuple for sorting, or just put it into the index */ |
236 | 0 | if (buildstate->spool) |
237 | 0 | _h_spool(buildstate->spool, tid, index_values, index_isnull); |
238 | 0 | else |
239 | 0 | { |
240 | | /* form an index tuple and point it at the heap tuple */ |
241 | 0 | itup = index_form_tuple(RelationGetDescr(index), |
242 | 0 | index_values, index_isnull); |
243 | 0 | itup->t_tid = *tid; |
244 | 0 | _hash_doinsert(index, itup, buildstate->heapRel, false); |
245 | 0 | pfree(itup); |
246 | 0 | } |
247 | |
|
248 | 0 | buildstate->indtuples += 1; |
249 | 0 | } |
250 | | |
251 | | /* |
252 | | * hashinsert() -- insert an index tuple into a hash table. |
253 | | * |
254 | | * Hash on the heap tuple's key, form an index tuple with hash code. |
255 | | * Find the appropriate location for the new tuple, and put it there. |
256 | | */ |
257 | | bool |
258 | | hashinsert(Relation rel, Datum *values, bool *isnull, |
259 | | ItemPointer ht_ctid, Relation heapRel, |
260 | | IndexUniqueCheck checkUnique, |
261 | | bool indexUnchanged, |
262 | | IndexInfo *indexInfo) |
263 | 0 | { |
264 | 0 | Datum index_values[1]; |
265 | 0 | bool index_isnull[1]; |
266 | 0 | IndexTuple itup; |
267 | | |
268 | | /* convert data to a hash key; on failure, do not insert anything */ |
269 | 0 | if (!_hash_convert_tuple(rel, |
270 | 0 | values, isnull, |
271 | 0 | index_values, index_isnull)) |
272 | 0 | return false; |
273 | | |
274 | | /* form an index tuple and point it at the heap tuple */ |
275 | 0 | itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull); |
276 | 0 | itup->t_tid = *ht_ctid; |
277 | |
|
278 | 0 | _hash_doinsert(rel, itup, heapRel, false); |
279 | |
|
280 | 0 | pfree(itup); |
281 | |
|
282 | 0 | return false; |
283 | 0 | } |
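/*
 * (Editorial sketch, not part of this file.)  The essence of
 * _hash_convert_tuple, which lives in hashutil.c: replace the single user
 * datum with its 32-bit hash code, refusing NULLs since hash indexes do not
 * store them.  Simplified; _hash_datum2hashkey is the real helper, but the
 * function name below is hypothetical.
 */
static bool
hash_convert_tuple_sketch(Relation index,
						  Datum *user_values, bool *user_isnull,
						  Datum *index_values, bool *index_isnull)
{
	uint32		hashkey;

	/* hash indexes are single-column and never index NULLs */
	if (user_isnull[0])
		return false;

	hashkey = _hash_datum2hashkey(index, user_values[0]);
	index_values[0] = UInt32GetDatum(hashkey);
	index_isnull[0] = false;

	return true;
}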
284 | | |
285 | | |
286 | | /* |
287 | | * hashgettuple() -- Get the next tuple in the scan. |
288 | | */ |
289 | | bool |
290 | | hashgettuple(IndexScanDesc scan, ScanDirection dir) |
291 | 0 | { |
292 | 0 | HashScanOpaque so = (HashScanOpaque) scan->opaque; |
293 | 0 | bool res; |
294 | | |
295 | | /* Hash indexes are always lossy since we store only the hash code */ |
296 | 0 | scan->xs_recheck = true; |
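	/*
	 * (Editorial note, not in the original source.)  Example of the
	 * lossiness: two distinct keys can hash to the same 32-bit code, so a
	 * probe for one key may also return TIDs whose keys merely collide;
	 * because xs_recheck is set, the executor re-tests the scan quals
	 * against each heap tuple and discards those false matches.
	 */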
297 | | |
298 | | /* |
299 | | * If we've already initialized this scan, we can just advance it in the |
300 | | * appropriate direction. If we haven't done so yet, we call a routine to |
301 | | * get the first item in the scan. |
302 | | */ |
303 | 0 | if (!HashScanPosIsValid(so->currPos)) |
304 | 0 | res = _hash_first(scan, dir); |
305 | 0 | else |
306 | 0 | { |
307 | | /* |
308 | | * Check to see if we should kill the previously-fetched tuple. |
309 | | */ |
310 | 0 | if (scan->kill_prior_tuple) |
311 | 0 | { |
312 | | /* |
313 | | * Yes, so remember it for later. (We'll deal with all such tuples |
314 | | * at once right after leaving the index page or at end of scan.) |
315 | | * If the caller reverses the indexscan direction, it is quite |
316 | | * possible that the same item will be entered multiple times. |
317 | | * But we don't detect that; instead, we just forget any excess |
318 | | * entries. |
319 | | */ |
320 | 0 | if (so->killedItems == NULL) |
321 | 0 | so->killedItems = (int *) |
322 | 0 | palloc(MaxIndexTuplesPerPage * sizeof(int)); |
323 | |
|
324 | 0 | if (so->numKilled < MaxIndexTuplesPerPage) |
325 | 0 | so->killedItems[so->numKilled++] = so->currPos.itemIndex; |
326 | 0 | } |
327 | | |
328 | | /* |
329 | | * Now continue the scan. |
330 | | */ |
331 | 0 | res = _hash_next(scan, dir); |
332 | 0 | } |
333 | |
|
334 | 0 | return res; |
335 | 0 | } |
336 | | |
337 | | |
338 | | /* |
339 | | * hashgetbitmap() -- get all tuples at once |
340 | | */ |
341 | | int64 |
342 | | hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) |
343 | 0 | { |
344 | 0 | HashScanOpaque so = (HashScanOpaque) scan->opaque; |
345 | 0 | bool res; |
346 | 0 | int64 ntids = 0; |
347 | 0 | HashScanPosItem *currItem; |
348 | |
|
349 | 0 | res = _hash_first(scan, ForwardScanDirection); |
350 | |
|
351 | 0 | while (res) |
352 | 0 | { |
353 | 0 | currItem = &so->currPos.items[so->currPos.itemIndex]; |
354 | | |
355 | | /* |
356 | | * _hash_first and _hash_next take care of eliminating dead index |
357 | | * entries whenever scan->ignore_killed_tuples is true. Therefore, |
358 | | * there's nothing to do here except add the results to the TIDBitmap. |
359 | | */ |
360 | 0 | tbm_add_tuples(tbm, &(currItem->heapTid), 1, true); |
361 | 0 | ntids++; |
362 | |
|
363 | 0 | res = _hash_next(scan, ForwardScanDirection); |
364 | 0 | } |
365 | |
|
366 | 0 | return ntids; |
367 | 0 | } |
368 | | |
369 | | |
370 | | /* |
371 | | * hashbeginscan() -- start a scan on a hash index |
372 | | */ |
373 | | IndexScanDesc |
374 | | hashbeginscan(Relation rel, int nkeys, int norderbys) |
375 | 0 | { |
376 | 0 | IndexScanDesc scan; |
377 | 0 | HashScanOpaque so; |
378 | | |
379 | | /* no order by operators allowed */ |
380 | 0 | Assert(norderbys == 0); |
381 | |
|
382 | 0 | scan = RelationGetIndexScan(rel, nkeys, norderbys); |
383 | |
|
384 | 0 | so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData)); |
385 | 0 | HashScanPosInvalidate(so->currPos); |
386 | 0 | so->hashso_bucket_buf = InvalidBuffer; |
387 | 0 | so->hashso_split_bucket_buf = InvalidBuffer; |
388 | |
|
389 | 0 | so->hashso_buc_populated = false; |
390 | 0 | so->hashso_buc_split = false; |
391 | |
|
392 | 0 | so->killedItems = NULL; |
393 | 0 | so->numKilled = 0; |
394 | |
|
395 | 0 | scan->opaque = so; |
396 | |
|
397 | 0 | return scan; |
398 | 0 | } |
399 | | |
400 | | /* |
401 | | * hashrescan() -- rescan an index relation |
402 | | */ |
403 | | void |
404 | | hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, |
405 | | ScanKey orderbys, int norderbys) |
406 | 0 | { |
407 | 0 | HashScanOpaque so = (HashScanOpaque) scan->opaque; |
408 | 0 | Relation rel = scan->indexRelation; |
409 | |
|
410 | 0 | if (HashScanPosIsValid(so->currPos)) |
411 | 0 | { |
412 | | /* Before leaving current page, deal with any killed items */ |
413 | 0 | if (so->numKilled > 0) |
414 | 0 | _hash_kill_items(scan); |
415 | 0 | } |
416 | |
|
417 | 0 | _hash_dropscanbuf(rel, so); |
418 | | |
419 | | /* set position invalid (this will cause _hash_first call) */ |
420 | 0 | HashScanPosInvalidate(so->currPos); |
421 | | |
422 | | /* Update scan key, if a new one is given */ |
423 | 0 | if (scankey && scan->numberOfKeys > 0) |
424 | 0 | memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData)); |
425 | |
|
426 | 0 | so->hashso_buc_populated = false; |
427 | 0 | so->hashso_buc_split = false; |
428 | 0 | } |
429 | | |
430 | | /* |
431 | | * hashendscan() -- close down a scan |
432 | | */ |
433 | | void |
434 | | hashendscan(IndexScanDesc scan) |
435 | 0 | { |
436 | 0 | HashScanOpaque so = (HashScanOpaque) scan->opaque; |
437 | 0 | Relation rel = scan->indexRelation; |
438 | |
|
439 | 0 | if (HashScanPosIsValid(so->currPos)) |
440 | 0 | { |
441 | | /* Before leaving current page, deal with any killed items */ |
442 | 0 | if (so->numKilled > 0) |
443 | 0 | _hash_kill_items(scan); |
444 | 0 | } |
445 | |
|
446 | 0 | _hash_dropscanbuf(rel, so); |
447 | |
|
448 | 0 | if (so->killedItems != NULL) |
449 | 0 | pfree(so->killedItems); |
450 | 0 | pfree(so); |
451 | 0 | scan->opaque = NULL; |
452 | 0 | } |
453 | | |
454 | | /* |
455 | | * Bulk deletion of all index entries pointing to a set of heap tuples. |
456 | | * The set of target tuples is specified via a callback routine that tells |
457 | | * whether any given heap tuple (identified by ItemPointer) is being deleted. |
458 | | * |
459 | | * This function also deletes the tuples that were moved by a split to |
460 | | * another bucket. |
461 | | * |
462 | | * Result: a palloc'd struct containing statistical info for VACUUM displays. |
463 | | */ |
464 | | IndexBulkDeleteResult * |
465 | | hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, |
466 | | IndexBulkDeleteCallback callback, void *callback_state) |
467 | 0 | { |
468 | 0 | Relation rel = info->index; |
469 | 0 | double tuples_removed; |
470 | 0 | double num_index_tuples; |
471 | 0 | double orig_ntuples; |
472 | 0 | Bucket orig_maxbucket; |
473 | 0 | Bucket cur_maxbucket; |
474 | 0 | Bucket cur_bucket; |
475 | 0 | Buffer metabuf = InvalidBuffer; |
476 | 0 | HashMetaPage metap; |
477 | 0 | HashMetaPage cachedmetap; |
478 | |
|
479 | 0 | tuples_removed = 0; |
480 | 0 | num_index_tuples = 0; |
481 | | |
482 | | /* |
483 | | * We need a copy of the metapage so that we can use its hashm_spares[] |
484 | | * values to compute bucket page addresses, but a cached copy should be |
485 | | * good enough. (If not, we'll detect that further down and refresh the |
486 | | * cache as necessary.) |
487 | | */ |
488 | 0 | cachedmetap = _hash_getcachedmetap(rel, &metabuf, false); |
489 | 0 | Assert(cachedmetap != NULL); |
490 | |
|
491 | 0 | orig_maxbucket = cachedmetap->hashm_maxbucket; |
492 | 0 | orig_ntuples = cachedmetap->hashm_ntuples; |
493 | | |
494 | | /* Scan the buckets that we know exist */ |
495 | 0 | cur_bucket = 0; |
496 | 0 | cur_maxbucket = orig_maxbucket; |
497 | |
|
498 | 0 | loop_top: |
499 | 0 | while (cur_bucket <= cur_maxbucket) |
500 | 0 | { |
501 | 0 | BlockNumber bucket_blkno; |
502 | 0 | BlockNumber blkno; |
503 | 0 | Buffer bucket_buf; |
504 | 0 | Buffer buf; |
505 | 0 | HashPageOpaque bucket_opaque; |
506 | 0 | Page page; |
507 | 0 | bool split_cleanup = false; |
508 | | |
509 | | /* Get address of bucket's start page */ |
510 | 0 | bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); |
511 | |
|
512 | 0 | blkno = bucket_blkno; |
513 | | |
514 | | /* |
515 | | * We need to acquire a cleanup lock on the primary bucket page to wait |
516 | | * out concurrent scans before deleting the dead tuples. |
517 | | */ |
518 | 0 | buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy); |
519 | 0 | LockBufferForCleanup(buf); |
520 | 0 | _hash_checkpage(rel, buf, LH_BUCKET_PAGE); |
521 | |
|
522 | 0 | page = BufferGetPage(buf); |
523 | 0 | bucket_opaque = HashPageGetOpaque(page); |
524 | | |
525 | | /* |
526 | | * If the bucket contains tuples that were moved by a split, we need |
527 | | * to delete them. We can't do that while the split operation on the |
528 | | * bucket is unfinished, since in-progress scans still need those tuples. |
529 | | */ |
530 | 0 | if (!H_BUCKET_BEING_SPLIT(bucket_opaque) && |
531 | 0 | H_NEEDS_SPLIT_CLEANUP(bucket_opaque)) |
532 | 0 | { |
533 | 0 | split_cleanup = true; |
534 | | |
535 | | /* |
536 | | * This bucket might have been split since we last held a lock on |
537 | | * the metapage. If so, hashm_maxbucket, hashm_highmask and |
538 | | * hashm_lowmask might be old enough to cause us to fail to remove |
539 | | * tuples left behind by the most recent split. To prevent that, |
540 | | * now that the primary page of the target bucket has been locked |
541 | | * (and thus can't be further split), check whether we need to |
542 | | * update our cached metapage data. |
543 | | */ |
544 | 0 | Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber); |
545 | 0 | if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket) |
546 | 0 | { |
547 | 0 | cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); |
548 | 0 | Assert(cachedmetap != NULL); |
549 | 0 | } |
550 | 0 | } |
551 | |
|
552 | 0 | bucket_buf = buf; |
553 | |
|
554 | 0 | hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy, |
555 | 0 | cachedmetap->hashm_maxbucket, |
556 | 0 | cachedmetap->hashm_highmask, |
557 | 0 | cachedmetap->hashm_lowmask, &tuples_removed, |
558 | 0 | &num_index_tuples, split_cleanup, |
559 | 0 | callback, callback_state); |
560 | |
|
561 | 0 | _hash_dropbuf(rel, bucket_buf); |
562 | | |
563 | | /* Advance to next bucket */ |
564 | 0 | cur_bucket++; |
565 | 0 | } |
566 | |
|
567 | 0 | if (BufferIsInvalid(metabuf)) |
568 | 0 | metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE); |
569 | | |
570 | | /* Write-lock metapage and check for split since we started */ |
571 | 0 | LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); |
572 | 0 | metap = HashPageGetMeta(BufferGetPage(metabuf)); |
573 | |
|
574 | 0 | if (cur_maxbucket != metap->hashm_maxbucket) |
575 | 0 | { |
576 | | /* There's been a split, so process the additional bucket(s) */ |
577 | 0 | LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); |
578 | 0 | cachedmetap = _hash_getcachedmetap(rel, &metabuf, true); |
579 | 0 | Assert(cachedmetap != NULL); |
580 | 0 | cur_maxbucket = cachedmetap->hashm_maxbucket; |
581 | 0 | goto loop_top; |
582 | 0 | } |
583 | | |
584 | | /* Okay, we're really done. Update tuple count in metapage. */ |
585 | 0 | START_CRIT_SECTION(); |
586 | |
|
587 | 0 | if (orig_maxbucket == metap->hashm_maxbucket && |
588 | 0 | orig_ntuples == metap->hashm_ntuples) |
589 | 0 | { |
590 | | /* |
591 | | * No one has split or inserted anything since start of scan, so |
592 | | * believe our count as gospel. |
593 | | */ |
594 | 0 | metap->hashm_ntuples = num_index_tuples; |
595 | 0 | } |
596 | 0 | else |
597 | 0 | { |
598 | | /* |
599 | | * Otherwise, our count is untrustworthy since we may have |
600 | | * double-scanned tuples in split buckets. Proceed by dead-reckoning. |
601 | | * (Note: we still return estimated_count = false, because using this |
602 | | * count is better than not updating reltuples at all.) |
603 | | */ |
604 | 0 | if (metap->hashm_ntuples > tuples_removed) |
605 | 0 | metap->hashm_ntuples -= tuples_removed; |
606 | 0 | else |
607 | 0 | metap->hashm_ntuples = 0; |
608 | 0 | num_index_tuples = metap->hashm_ntuples; |
609 | 0 | } |
610 | |
|
611 | 0 | MarkBufferDirty(metabuf); |
612 | | |
613 | | /* XLOG stuff */ |
614 | 0 | if (RelationNeedsWAL(rel)) |
615 | 0 | { |
616 | 0 | xl_hash_update_meta_page xlrec; |
617 | 0 | XLogRecPtr recptr; |
618 | |
|
619 | 0 | xlrec.ntuples = metap->hashm_ntuples; |
620 | |
|
621 | 0 | XLogBeginInsert(); |
622 | 0 | XLogRegisterData(&xlrec, SizeOfHashUpdateMetaPage); |
623 | |
|
624 | 0 | XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD); |
625 | |
|
626 | 0 | recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE); |
627 | 0 | PageSetLSN(BufferGetPage(metabuf), recptr); |
628 | 0 | } |
629 | |
|
630 | 0 | END_CRIT_SECTION(); |
631 | |
|
632 | 0 | _hash_relbuf(rel, metabuf); |
633 | | |
634 | | /* return statistics */ |
635 | 0 | if (stats == NULL) |
636 | 0 | stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); |
637 | 0 | stats->estimated_count = false; |
638 | 0 | stats->num_index_tuples = num_index_tuples; |
639 | 0 | stats->tuples_removed += tuples_removed; |
640 | | /* hashvacuumcleanup will fill in num_pages */ |
641 | |
|
642 | 0 | return stats; |
643 | 0 | } |
644 | | |
645 | | /* |
646 | | * Post-VACUUM cleanup. |
647 | | * |
648 | | * Result: a palloc'd struct containing statistical info for VACUUM displays. |
649 | | */ |
650 | | IndexBulkDeleteResult * |
651 | | hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) |
652 | 0 | { |
653 | 0 | Relation rel = info->index; |
654 | 0 | BlockNumber num_pages; |
655 | | |
656 | | /* If hashbulkdelete wasn't called, return NULL signifying no change */ |
657 | | /* Note: this covers the analyze_only case too */ |
658 | 0 | if (stats == NULL) |
659 | 0 | return NULL; |
660 | | |
661 | | /* update statistics */ |
662 | 0 | num_pages = RelationGetNumberOfBlocks(rel); |
663 | 0 | stats->num_pages = num_pages; |
664 | |
|
665 | 0 | return stats; |
666 | 0 | } |
667 | | |
668 | | /* |
669 | | * Helper function to perform deletion of index entries from a bucket. |
670 | | * |
671 | | * This function expects that the caller has acquired a cleanup lock on the |
672 | | * primary bucket page, and will return with a write lock again held on the |
673 | | * primary bucket page. The lock won't necessarily be held continuously, |
674 | | * though, because we'll release it when visiting overflow pages. |
675 | | * |
676 | | * There can't be any concurrent scans in progress when we first enter this |
677 | | * function because of the cleanup lock we hold on the primary bucket page, |
678 | | * but as soon as we release that lock, there might be. If those scans got |
679 | | * ahead of our cleanup scan, they might see a tuple before we kill it and |
680 | | * wake up only after VACUUM has completed and the TID has been recycled for |
681 | | * an unrelated tuple. To avoid that calamity, we prevent scans from passing |
682 | | * our cleanup scan by locking the next page in the bucket chain before |
683 | | * releasing the lock on the previous page. (This type of lock chaining is not |
684 | | * ideal, so we might want to look for a better solution at some point.) |
685 | | * |
686 | | * We need to retain a pin on the primary bucket to ensure that no concurrent |
687 | | * split can start. |
688 | | */ |
689 | | void |
690 | | hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, |
691 | | BlockNumber bucket_blkno, BufferAccessStrategy bstrategy, |
692 | | uint32 maxbucket, uint32 highmask, uint32 lowmask, |
693 | | double *tuples_removed, double *num_index_tuples, |
694 | | bool split_cleanup, |
695 | | IndexBulkDeleteCallback callback, void *callback_state) |
696 | 0 | { |
697 | 0 | BlockNumber blkno; |
698 | 0 | Buffer buf; |
699 | 0 | Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket; |
700 | 0 | bool bucket_dirty = false; |
701 | |
|
702 | 0 | blkno = bucket_blkno; |
703 | 0 | buf = bucket_buf; |
704 | |
|
705 | 0 | if (split_cleanup) |
706 | 0 | new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket, |
707 | 0 | lowmask, maxbucket); |
708 | | |
709 | | /* Scan each page in bucket */ |
710 | 0 | for (;;) |
711 | 0 | { |
712 | 0 | HashPageOpaque opaque; |
713 | 0 | OffsetNumber offno; |
714 | 0 | OffsetNumber maxoffno; |
715 | 0 | Buffer next_buf; |
716 | 0 | Page page; |
717 | 0 | OffsetNumber deletable[MaxOffsetNumber]; |
718 | 0 | int ndeletable = 0; |
719 | 0 | bool retain_pin = false; |
720 | 0 | bool clear_dead_marking = false; |
721 | |
|
722 | 0 | vacuum_delay_point(false); |
723 | |
|
724 | 0 | page = BufferGetPage(buf); |
725 | 0 | opaque = HashPageGetOpaque(page); |
726 | | |
727 | | /* Scan each tuple in page */ |
728 | 0 | maxoffno = PageGetMaxOffsetNumber(page); |
729 | 0 | for (offno = FirstOffsetNumber; |
730 | 0 | offno <= maxoffno; |
731 | 0 | offno = OffsetNumberNext(offno)) |
732 | 0 | { |
733 | 0 | ItemPointer htup; |
734 | 0 | IndexTuple itup; |
735 | 0 | Bucket bucket; |
736 | 0 | bool kill_tuple = false; |
737 | |
|
738 | 0 | itup = (IndexTuple) PageGetItem(page, |
739 | 0 | PageGetItemId(page, offno)); |
740 | 0 | htup = &(itup->t_tid); |
741 | | |
742 | | /* |
743 | | * To remove the dead tuples, we rely strictly on the results of the |
744 | | * callback function; see btvacuumpage for the detailed reasoning. |
745 | | */ |
746 | 0 | if (callback && callback(htup, callback_state)) |
747 | 0 | { |
748 | 0 | kill_tuple = true; |
749 | 0 | if (tuples_removed) |
750 | 0 | *tuples_removed += 1; |
751 | 0 | } |
752 | 0 | else if (split_cleanup) |
753 | 0 | { |
754 | | /* delete the tuples that were moved by a split */ |
755 | 0 | bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), |
756 | 0 | maxbucket, |
757 | 0 | highmask, |
758 | 0 | lowmask); |
759 | | /* mark the item for deletion */ |
760 | 0 | if (bucket != cur_bucket) |
761 | 0 | { |
762 | | /* |
763 | | * We expect tuples to belong either to the current bucket or |
764 | | * to new_bucket. This is ensured because we don't allow |
765 | | * further splits of a bucket that contains garbage. See the |
766 | | * comments in _hash_expandtable. |
767 | | */ |
768 | 0 | Assert(bucket == new_bucket); |
769 | 0 | kill_tuple = true; |
770 | 0 | } |
771 | 0 | } |
772 | |
|
773 | 0 | if (kill_tuple) |
774 | 0 | { |
775 | | /* mark the item for deletion */ |
776 | 0 | deletable[ndeletable++] = offno; |
777 | 0 | } |
778 | 0 | else |
779 | 0 | { |
780 | | /* we're keeping it, so count it */ |
781 | 0 | if (num_index_tuples) |
782 | 0 | *num_index_tuples += 1; |
783 | 0 | } |
784 | 0 | } |
785 | | |
786 | | /* retain the pin on primary bucket page till end of bucket scan */ |
787 | 0 | if (blkno == bucket_blkno) |
788 | 0 | retain_pin = true; |
789 | 0 | else |
790 | 0 | retain_pin = false; |
791 | |
|
792 | 0 | blkno = opaque->hasho_nextblkno; |
793 | | |
794 | | /* |
795 | | * Apply deletions, advance to next page and write page if needed. |
796 | | */ |
797 | 0 | if (ndeletable > 0) |
798 | 0 | { |
799 | | /* No ereport(ERROR) until changes are logged */ |
800 | 0 | START_CRIT_SECTION(); |
801 | |
|
802 | 0 | PageIndexMultiDelete(page, deletable, ndeletable); |
803 | 0 | bucket_dirty = true; |
804 | | |
805 | | /* |
806 | | * Mark the page as clean if vacuum removed DEAD tuples from |
807 | | * the index page. We do this by clearing the |
808 | | * LH_PAGE_HAS_DEAD_TUPLES flag. |
809 | | */ |
810 | 0 | if (tuples_removed && *tuples_removed > 0 && |
811 | 0 | H_HAS_DEAD_TUPLES(opaque)) |
812 | 0 | { |
813 | 0 | opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES; |
814 | 0 | clear_dead_marking = true; |
815 | 0 | } |
816 | |
|
817 | 0 | MarkBufferDirty(buf); |
818 | | |
819 | | /* XLOG stuff */ |
820 | 0 | if (RelationNeedsWAL(rel)) |
821 | 0 | { |
822 | 0 | xl_hash_delete xlrec; |
823 | 0 | XLogRecPtr recptr; |
824 | |
|
825 | 0 | xlrec.clear_dead_marking = clear_dead_marking; |
826 | 0 | xlrec.is_primary_bucket_page = (buf == bucket_buf); |
827 | |
|
828 | 0 | XLogBeginInsert(); |
829 | 0 | XLogRegisterData(&xlrec, SizeOfHashDelete); |
830 | | |
831 | | /* |
832 | | * The bucket buffer was not changed, but it still needs to be |
833 | | * registered to ensure that we can acquire a cleanup lock on |
834 | | * it during replay. |
835 | | */ |
836 | 0 | if (!xlrec.is_primary_bucket_page) |
837 | 0 | { |
838 | 0 | uint8 flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE; |
839 | |
|
840 | 0 | XLogRegisterBuffer(0, bucket_buf, flags); |
841 | 0 | } |
842 | |
|
843 | 0 | XLogRegisterBuffer(1, buf, REGBUF_STANDARD); |
844 | 0 | XLogRegisterBufData(1, deletable, |
845 | 0 | ndeletable * sizeof(OffsetNumber)); |
846 | |
|
847 | 0 | recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE); |
848 | 0 | PageSetLSN(BufferGetPage(buf), recptr); |
849 | 0 | } |
850 | |
|
851 | 0 | END_CRIT_SECTION(); |
852 | 0 | } |
853 | | |
854 | | /* bail out if there are no more pages to scan. */ |
855 | 0 | if (!BlockNumberIsValid(blkno)) |
856 | 0 | break; |
857 | | |
858 | 0 | next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE, |
859 | 0 | LH_OVERFLOW_PAGE, |
860 | 0 | bstrategy); |
861 | | |
862 | | /* |
863 | | * Release the lock on the previous page only after acquiring the |
864 | | * lock on the next page. |
865 | | */ |
866 | 0 | if (retain_pin) |
867 | 0 | LockBuffer(buf, BUFFER_LOCK_UNLOCK); |
868 | 0 | else |
869 | 0 | _hash_relbuf(rel, buf); |
870 | |
|
871 | 0 | buf = next_buf; |
872 | 0 | } |
873 | | |
874 | | /* |
875 | | * Lock the bucket page so we can clear the garbage flag and squeeze the |
876 | | * bucket. If the current buffer is the same as the bucket buffer, we |
877 | | * already hold a lock on the bucket page. |
878 | | */ |
879 | 0 | if (buf != bucket_buf) |
880 | 0 | { |
881 | 0 | _hash_relbuf(rel, buf); |
882 | 0 | LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE); |
883 | 0 | } |
884 | | |
885 | | /* |
886 | | * Clear the garbage flag from the bucket after deleting the tuples that |
887 | | * were moved by a split. We purposely clear the flag before squeezing |
888 | | * the bucket, so that after a restart vacuum won't again try to delete |
889 | | * the moved-by-split tuples. |
890 | | */ |
891 | 0 | if (split_cleanup) |
892 | 0 | { |
893 | 0 | HashPageOpaque bucket_opaque; |
894 | 0 | Page page; |
895 | |
|
896 | 0 | page = BufferGetPage(bucket_buf); |
897 | 0 | bucket_opaque = HashPageGetOpaque(page); |
898 | | |
899 | | /* No ereport(ERROR) until changes are logged */ |
900 | 0 | START_CRIT_SECTION(); |
901 | |
|
902 | 0 | bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; |
903 | 0 | MarkBufferDirty(bucket_buf); |
904 | | |
905 | | /* XLOG stuff */ |
906 | 0 | if (RelationNeedsWAL(rel)) |
907 | 0 | { |
908 | 0 | XLogRecPtr recptr; |
909 | |
|
910 | 0 | XLogBeginInsert(); |
911 | 0 | XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); |
912 | |
|
913 | 0 | recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP); |
914 | 0 | PageSetLSN(page, recptr); |
915 | 0 | } |
916 | |
|
917 | 0 | END_CRIT_SECTION(); |
918 | 0 | } |
919 | | |
920 | | /* |
921 | | * If we have deleted anything, try to compact free space. To squeeze |
922 | | * the bucket we must hold a cleanup lock; otherwise the squeeze could |
923 | | * change the ordering of tuples seen by a scan that started before it. |
924 | | */ |
925 | 0 | if (bucket_dirty && IsBufferCleanupOK(bucket_buf)) |
926 | 0 | _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf, |
927 | 0 | bstrategy); |
928 | 0 | else |
929 | 0 | LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK); |
930 | 0 | } |
931 | | |
932 | | CompareType |
933 | | hashtranslatestrategy(StrategyNumber strategy, Oid opfamily) |
934 | 0 | { |
935 | 0 | if (strategy == HTEqualStrategyNumber) |
936 | 0 | return COMPARE_EQ; |
937 | 0 | return COMPARE_INVALID; |
938 | 0 | } |
939 | | |
940 | | StrategyNumber |
941 | | hashtranslatecmptype(CompareType cmptype, Oid opfamily) |
942 | 0 | { |
943 | 0 | if (cmptype == COMPARE_EQ) |
944 | 0 | return HTEqualStrategyNumber; |
945 | 0 | return InvalidStrategy; |
946 | 0 | } |