Coverage Report

Created: 2025-06-15 06:31

/src/postgres/src/backend/access/hash/hash.c
Every instrumented line in this file reports an execution count of 0 (the file is entirely uncovered). The source listing follows.
/*-------------------------------------------------------------------------
 *
 * hash.c
 *    Implementation of Margo Seltzer's Hashing package for postgres.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hash.c
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/hash_xlog.h"
#include "access/relscan.h"
#include "access/stratnum.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "commands/progress.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "optimizer/plancat.h"
#include "pgstat.h"
#include "utils/fmgrprotos.h"
#include "utils/index_selfuncs.h"
#include "utils/rel.h"

/* Working state for hashbuild and its callback */
typedef struct
{
  HSpool     *spool;      /* NULL if not using spooling */
  double    indtuples;    /* # tuples accepted into index */
  Relation  heapRel;    /* heap relation descriptor */
} HashBuildState;

static void hashbuildCallback(Relation index,
                ItemPointer tid,
                Datum *values,
                bool *isnull,
                bool tupleIsAlive,
                void *state);

/*
 * Hash handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
Datum
hashhandler(PG_FUNCTION_ARGS)
{
  IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

  amroutine->amstrategies = HTMaxStrategyNumber;
  amroutine->amsupport = HASHNProcs;
  amroutine->amoptsprocnum = HASHOPTIONS_PROC;
  amroutine->amcanorder = false;
  amroutine->amcanorderbyop = false;
  amroutine->amcanhash = true;
  amroutine->amconsistentequality = true;
  amroutine->amconsistentordering = false;
  amroutine->amcanbackward = true;
  amroutine->amcanunique = false;
  amroutine->amcanmulticol = false;
  amroutine->amoptionalkey = false;
  amroutine->amsearcharray = false;
  amroutine->amsearchnulls = false;
  amroutine->amstorage = false;
  amroutine->amclusterable = false;
  amroutine->ampredlocks = true;
  amroutine->amcanparallel = false;
  amroutine->amcanbuildparallel = false;
  amroutine->amcaninclude = false;
  amroutine->amusemaintenanceworkmem = false;
  amroutine->amsummarizing = false;
  amroutine->amparallelvacuumoptions =
    VACUUM_OPTION_PARALLEL_BULKDEL;
  amroutine->amkeytype = INT4OID;

  amroutine->ambuild = hashbuild;
  amroutine->ambuildempty = hashbuildempty;
  amroutine->aminsert = hashinsert;
  amroutine->aminsertcleanup = NULL;
  amroutine->ambulkdelete = hashbulkdelete;
  amroutine->amvacuumcleanup = hashvacuumcleanup;
  amroutine->amcanreturn = NULL;
  amroutine->amcostestimate = hashcostestimate;
  amroutine->amgettreeheight = NULL;
  amroutine->amoptions = hashoptions;
  amroutine->amproperty = NULL;
  amroutine->ambuildphasename = NULL;
  amroutine->amvalidate = hashvalidate;
  amroutine->amadjustmembers = hashadjustmembers;
  amroutine->ambeginscan = hashbeginscan;
  amroutine->amrescan = hashrescan;
  amroutine->amgettuple = hashgettuple;
  amroutine->amgetbitmap = hashgetbitmap;
  amroutine->amendscan = hashendscan;
  amroutine->ammarkpos = NULL;
  amroutine->amrestrpos = NULL;
  amroutine->amestimateparallelscan = NULL;
  amroutine->aminitparallelscan = NULL;
  amroutine->amparallelrescan = NULL;
  amroutine->amtranslatestrategy = hashtranslatestrategy;
  amroutine->amtranslatecmptype = hashtranslatecmptype;

  PG_RETURN_POINTER(amroutine);
}

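/*
 * A minimal sketch (not part of hash.c) of how the routine returned by
 * hashhandler() gets used: core code keeps the IndexAmRoutine in the
 * relcache entry (rd_indam) and simply dispatches through its callbacks.
 * The helper name below is hypothetical.
 */
static IndexBuildResult *
example_build_via_am(Relation heap, Relation index, IndexInfo *indexInfo)
{
  IndexAmRoutine *amroutine = index->rd_indam; /* filled in from hashhandler() */

  Assert(amroutine->ambuild != NULL);
  return amroutine->ambuild(heap, index, indexInfo);
}
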
/*
 *  hashbuild() -- build a new hash index.
 */
IndexBuildResult *
hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
  IndexBuildResult *result;
  BlockNumber relpages;
  double    reltuples;
  double    allvisfrac;
  uint32    num_buckets;
  Size    sort_threshold;
  HashBuildState buildstate;

  /*
   * We expect to be called exactly once for any index relation. If that's
   * not the case, big trouble's what we have.
   */
  if (RelationGetNumberOfBlocks(index) != 0)
    elog(ERROR, "index \"%s\" already contains data",
       RelationGetRelationName(index));

  /* Estimate the number of rows currently present in the table */
  estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);

  /* Initialize the hash index metadata page and initial buckets */
  num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);

  /*
   * If we just insert the tuples into the index in scan order, then
   * (assuming their hash codes are pretty random) there will be no locality
   * of access to the index, and if the index is bigger than available RAM
   * then we'll thrash horribly.  To prevent that scenario, we can sort the
   * tuples by (expected) bucket number.  However, such a sort is useless
   * overhead when the index does fit in RAM.  We choose to sort if the
   * initial index size exceeds maintenance_work_mem, or the number of
   * buffers usable for the index, whichever is less.  (Limiting by the
   * number of buffers should reduce thrashing between PG buffers and kernel
   * buffers, which seems useful even if no physical I/O results.  Limiting
   * by maintenance_work_mem is useful to allow easy testing of the sort
   * code path, and may be useful to DBAs as an additional control knob.)
   *
   * NOTE: this test will need adjustment if a bucket is ever different from
   * one page.  Also, "initial index size" accounting does not include the
   * metapage, nor the first bitmap page.
   */
  sort_threshold = (maintenance_work_mem * (Size) 1024) / BLCKSZ;
  if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
    sort_threshold = Min(sort_threshold, NBuffers);
  else
    sort_threshold = Min(sort_threshold, NLocBuffer);

  if (num_buckets >= sort_threshold)
    buildstate.spool = _h_spoolinit(heap, index, num_buckets);
  else
    buildstate.spool = NULL;

  /* prepare to build the index */
  buildstate.indtuples = 0;
  buildstate.heapRel = heap;

  /* do the heap scan */
  reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                     hashbuildCallback,
                     &buildstate, NULL);
  pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
                 buildstate.indtuples);

  if (buildstate.spool)
  {
    /* sort the tuples and insert them into the index */
    _h_indexbuild(buildstate.spool, buildstate.heapRel);
    _h_spooldestroy(buildstate.spool);
  }

  /*
   * Return statistics
   */
  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

  result->heap_tuples = reltuples;
  result->index_tuples = buildstate.indtuples;

  return result;
}

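/*
 * A small illustrative sketch of the spooling decision in hashbuild()
 * above, using assumed example values (64 MB maintenance_work_mem, 8 kB
 * blocks, 16384 shared buffers) rather than anything taken from this
 * report.  The function name is hypothetical.
 */
static bool
example_build_would_spool(uint32 num_buckets)
{
  Size    work_mem_kb = 65536;    /* assumed maintenance_work_mem, in kB */
  Size    block_size = 8192;      /* assumed BLCKSZ */
  Size    shared_buffers = 16384; /* assumed NBuffers, non-temp index */
  Size    sort_threshold;

  sort_threshold = (work_mem_kb * (Size) 1024) / block_size;  /* = 8192 pages */
  sort_threshold = Min(sort_threshold, shared_buffers);       /* still 8192 */

  /* builds starting with at least this many buckets go through the spool */
  return num_buckets >= sort_threshold;
}
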
/*
 *  hashbuildempty() -- build an empty hash index in the initialization fork
 */
void
hashbuildempty(Relation index)
{
  _hash_init(index, 0, INIT_FORKNUM);
}

/*
 * Per-tuple callback for table_index_build_scan
 */
static void
hashbuildCallback(Relation index,
          ItemPointer tid,
          Datum *values,
          bool *isnull,
          bool tupleIsAlive,
          void *state)
{
  HashBuildState *buildstate = (HashBuildState *) state;
  Datum   index_values[1];
  bool    index_isnull[1];
  IndexTuple  itup;

  /* convert data to a hash key; on failure, do not insert anything */
  if (!_hash_convert_tuple(index,
               values, isnull,
               index_values, index_isnull))
    return;

  /* Either spool the tuple for sorting, or just put it into the index */
  if (buildstate->spool)
    _h_spool(buildstate->spool, tid, index_values, index_isnull);
  else
  {
    /* form an index tuple and point it at the heap tuple */
    itup = index_form_tuple(RelationGetDescr(index),
                index_values, index_isnull);
    itup->t_tid = *tid;
    _hash_doinsert(index, itup, buildstate->heapRel, false);
    pfree(itup);
  }

  buildstate->indtuples += 1;
}

/*
 *  hashinsert() -- insert an index tuple into a hash table.
 *
 *  Hash on the heap tuple's key, form an index tuple with hash code.
 *  Find the appropriate location for the new tuple, and put it there.
 */
bool
hashinsert(Relation rel, Datum *values, bool *isnull,
       ItemPointer ht_ctid, Relation heapRel,
       IndexUniqueCheck checkUnique,
       bool indexUnchanged,
       IndexInfo *indexInfo)
{
  Datum   index_values[1];
  bool    index_isnull[1];
  IndexTuple  itup;

  /* convert data to a hash key; on failure, do not insert anything */
  if (!_hash_convert_tuple(rel,
               values, isnull,
               index_values, index_isnull))
    return false;

  /* form an index tuple and point it at the heap tuple */
  itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
  itup->t_tid = *ht_ctid;

  _hash_doinsert(rel, itup, heapRel, false);

  pfree(itup);

  return false;
}

/*
 *  hashgettuple() -- Get the next tuple in the scan.
 */
bool
hashgettuple(IndexScanDesc scan, ScanDirection dir)
{
  HashScanOpaque so = (HashScanOpaque) scan->opaque;
  bool    res;

  /* Hash indexes are always lossy since we store only the hash code */
  scan->xs_recheck = true;

  /*
   * If we've already initialized this scan, we can just advance it in the
   * appropriate direction.  If we haven't done so yet, we call a routine to
   * get the first item in the scan.
   */
  if (!HashScanPosIsValid(so->currPos))
    res = _hash_first(scan, dir);
  else
  {
    /*
     * Check to see if we should kill the previously-fetched tuple.
     */
    if (scan->kill_prior_tuple)
    {
      /*
       * Yes, so remember it for later. (We'll deal with all such tuples
       * at once right after leaving the index page or at end of scan.)
       * If the caller reverses the indexscan direction, it is quite
       * possible that the same item might get entered multiple times.
       * But we don't detect that; instead, we just forget any excess
       * entries.
       */
      if (so->killedItems == NULL)
        so->killedItems = (int *)
          palloc(MaxIndexTuplesPerPage * sizeof(int));

      if (so->numKilled < MaxIndexTuplesPerPage)
        so->killedItems[so->numKilled++] = so->currPos.itemIndex;
    }

    /*
     * Now continue the scan.
     */
    res = _hash_next(scan, dir);
  }

  return res;
}

/*
 *  hashgetbitmap() -- get all tuples at once
 */
int64
hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
{
  HashScanOpaque so = (HashScanOpaque) scan->opaque;
  bool    res;
  int64   ntids = 0;
  HashScanPosItem *currItem;

  res = _hash_first(scan, ForwardScanDirection);

  while (res)
  {
    currItem = &so->currPos.items[so->currPos.itemIndex];

    /*
     * _hash_first and _hash_next handle eliminating dead index entries
     * whenever scan->ignore_killed_tuples is true.  Therefore, there's
     * nothing to do here except add the results to the TIDBitmap.
     */
    tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
    ntids++;

    res = _hash_next(scan, ForwardScanDirection);
  }

  return ntids;
}

/*
 *  hashbeginscan() -- start a scan on a hash index
 */
IndexScanDesc
hashbeginscan(Relation rel, int nkeys, int norderbys)
{
  IndexScanDesc scan;
  HashScanOpaque so;

  /* no order by operators allowed */
  Assert(norderbys == 0);

  scan = RelationGetIndexScan(rel, nkeys, norderbys);

  so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
  HashScanPosInvalidate(so->currPos);
  so->hashso_bucket_buf = InvalidBuffer;
  so->hashso_split_bucket_buf = InvalidBuffer;

  so->hashso_buc_populated = false;
  so->hashso_buc_split = false;

  so->killedItems = NULL;
  so->numKilled = 0;

  scan->opaque = so;

  return scan;
}

/*
 *  hashrescan() -- rescan an index relation
 */
void
hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
       ScanKey orderbys, int norderbys)
{
  HashScanOpaque so = (HashScanOpaque) scan->opaque;
  Relation  rel = scan->indexRelation;

  if (HashScanPosIsValid(so->currPos))
  {
    /* Before leaving current page, deal with any killed items */
    if (so->numKilled > 0)
      _hash_kill_items(scan);
  }

  _hash_dropscanbuf(rel, so);

  /* set position invalid (this will cause _hash_first call) */
  HashScanPosInvalidate(so->currPos);

  /* Update scan key, if a new one is given */
  if (scankey && scan->numberOfKeys > 0)
    memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));

  so->hashso_buc_populated = false;
  so->hashso_buc_split = false;
}

/*
 *  hashendscan() -- close down a scan
 */
void
hashendscan(IndexScanDesc scan)
{
  HashScanOpaque so = (HashScanOpaque) scan->opaque;
  Relation  rel = scan->indexRelation;

  if (HashScanPosIsValid(so->currPos))
  {
    /* Before leaving current page, deal with any killed items */
    if (so->numKilled > 0)
      _hash_kill_items(scan);
  }

  _hash_dropscanbuf(rel, so);

  if (so->killedItems != NULL)
    pfree(so->killedItems);
  pfree(so);
  scan->opaque = NULL;
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * This function also deletes the tuples that are moved by a split to another
 * bucket.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
         IndexBulkDeleteCallback callback, void *callback_state)
{
  Relation  rel = info->index;
  double    tuples_removed;
  double    num_index_tuples;
  double    orig_ntuples;
  Bucket    orig_maxbucket;
  Bucket    cur_maxbucket;
  Bucket    cur_bucket;
  Buffer    metabuf = InvalidBuffer;
  HashMetaPage metap;
  HashMetaPage cachedmetap;

  tuples_removed = 0;
  num_index_tuples = 0;

  /*
   * We need a copy of the metapage so that we can use its hashm_spares[]
   * values to compute bucket page addresses, but a cached copy should be
   * good enough.  (If not, we'll detect that further down and refresh the
   * cache as necessary.)
   */
  cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
  Assert(cachedmetap != NULL);

  orig_maxbucket = cachedmetap->hashm_maxbucket;
  orig_ntuples = cachedmetap->hashm_ntuples;

  /* Scan the buckets that we know exist */
  cur_bucket = 0;
  cur_maxbucket = orig_maxbucket;

loop_top:
  while (cur_bucket <= cur_maxbucket)
  {
    BlockNumber bucket_blkno;
    BlockNumber blkno;
    Buffer    bucket_buf;
    Buffer    buf;
    HashPageOpaque bucket_opaque;
    Page    page;
    bool    split_cleanup = false;

    /* Get address of bucket's start page */
    bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);

    blkno = bucket_blkno;

    /*
     * We need to acquire a cleanup lock on the primary bucket page to wait
     * out concurrent scans before deleting the dead tuples.
     */
    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
    LockBufferForCleanup(buf);
    _hash_checkpage(rel, buf, LH_BUCKET_PAGE);

    page = BufferGetPage(buf);
    bucket_opaque = HashPageGetOpaque(page);

    /*
     * If the bucket contains tuples that are moved by split, then we need
     * to delete such tuples.  We can't delete such tuples if the split
     * operation on the bucket is not finished, as those are needed by
     * scans.
     */
    if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
      H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
    {
      split_cleanup = true;

      /*
       * This bucket might have been split since we last held a lock on
       * the metapage.  If so, hashm_maxbucket, hashm_highmask and
       * hashm_lowmask might be old enough to cause us to fail to remove
       * tuples left behind by the most recent split.  To prevent that,
       * now that the primary page of the target bucket has been locked
       * (and thus can't be further split), check whether we need to
       * update our cached metapage data.
       */
      Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
      if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
      {
        cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
        Assert(cachedmetap != NULL);
      }
    }

    bucket_buf = buf;

    hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
              cachedmetap->hashm_maxbucket,
              cachedmetap->hashm_highmask,
              cachedmetap->hashm_lowmask, &tuples_removed,
              &num_index_tuples, split_cleanup,
              callback, callback_state);

    _hash_dropbuf(rel, bucket_buf);

    /* Advance to next bucket */
    cur_bucket++;
  }

  if (BufferIsInvalid(metabuf))
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);

  /* Write-lock metapage and check for split since we started */
  LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
  metap = HashPageGetMeta(BufferGetPage(metabuf));

  if (cur_maxbucket != metap->hashm_maxbucket)
  {
    /* There's been a split, so process the additional bucket(s) */
    LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
    cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
    Assert(cachedmetap != NULL);
    cur_maxbucket = cachedmetap->hashm_maxbucket;
    goto loop_top;
  }

  /* Okay, we're really done.  Update tuple count in metapage. */
  START_CRIT_SECTION();

  if (orig_maxbucket == metap->hashm_maxbucket &&
    orig_ntuples == metap->hashm_ntuples)
  {
    /*
     * No one has split or inserted anything since start of scan, so
     * believe our count as gospel.
     */
    metap->hashm_ntuples = num_index_tuples;
  }
  else
  {
    /*
     * Otherwise, our count is untrustworthy since we may have
     * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
     * (Note: we still return estimated_count = false, because using this
     * count is better than not updating reltuples at all.)
     */
    if (metap->hashm_ntuples > tuples_removed)
      metap->hashm_ntuples -= tuples_removed;
    else
      metap->hashm_ntuples = 0;
    num_index_tuples = metap->hashm_ntuples;
  }

  MarkBufferDirty(metabuf);

  /* XLOG stuff */
  if (RelationNeedsWAL(rel))
  {
    xl_hash_update_meta_page xlrec;
    XLogRecPtr  recptr;

    xlrec.ntuples = metap->hashm_ntuples;

    XLogBeginInsert();
    XLogRegisterData(&xlrec, SizeOfHashUpdateMetaPage);

    XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);

    recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
    PageSetLSN(BufferGetPage(metabuf), recptr);
  }

  END_CRIT_SECTION();

  _hash_relbuf(rel, metabuf);

  /* return statistics */
  if (stats == NULL)
    stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
  stats->estimated_count = false;
  stats->num_index_tuples = num_index_tuples;
  stats->tuples_removed += tuples_removed;
  /* hashvacuumcleanup will fill in num_pages */

  return stats;
}

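/*
 * A hypothetical sketch of the callback contract used by hashbulkdelete()
 * above: VACUUM supplies a function that reports, per heap TID, whether
 * that tuple is being removed.  The stand-in below (name and state type
 * assumed, not taken from PostgreSQL) simply consults a caller-provided
 * answer, which is enough to show the shape of the interface.
 */
typedef struct ExampleDeadTidState
{
  bool    everything_is_dead;  /* assumed toy state */
} ExampleDeadTidState;

static bool
example_tid_reaped(ItemPointer itemptr, void *state)
{
  ExampleDeadTidState *dead = (ExampleDeadTidState *) state;

  /* a real callback would look itemptr up in the collected dead-TID set */
  return dead->everything_is_dead;
}
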
/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
  Relation  rel = info->index;
  BlockNumber num_pages;

  /* If hashbulkdelete wasn't called, return NULL signifying no change */
  /* Note: this covers the analyze_only case too */
  if (stats == NULL)
    return NULL;

  /* update statistics */
  num_pages = RelationGetNumberOfBlocks(rel);
  stats->num_pages = num_pages;

  return stats;
}

/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * There can't be any concurrent scans in progress when we first enter this
 * function because of the cleanup lock we hold on the primary bucket page,
 * but as soon as we release that lock, there might be.  If those scans got
 * ahead of our cleanup scan, they might see a tuple before we kill it and
 * wake up only after VACUUM has completed and the TID has been recycled for
 * an unrelated tuple.  To avoid that calamity, we prevent scans from passing
 * our cleanup scan by locking the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is not
 * ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * split can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
          BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
          uint32 maxbucket, uint32 highmask, uint32 lowmask,
          double *tuples_removed, double *num_index_tuples,
          bool split_cleanup,
          IndexBulkDeleteCallback callback, void *callback_state)
{
  BlockNumber blkno;
  Buffer    buf;
  Bucket    new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
  bool    bucket_dirty = false;

  blkno = bucket_blkno;
  buf = bucket_buf;

  if (split_cleanup)
    new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
                            lowmask, maxbucket);

  /* Scan each page in bucket */
  for (;;)
  {
    HashPageOpaque opaque;
    OffsetNumber offno;
    OffsetNumber maxoffno;
    Buffer    next_buf;
    Page    page;
    OffsetNumber deletable[MaxOffsetNumber];
    int     ndeletable = 0;
    bool    retain_pin = false;
    bool    clear_dead_marking = false;

    vacuum_delay_point(false);

    page = BufferGetPage(buf);
    opaque = HashPageGetOpaque(page);

    /* Scan each tuple in page */
    maxoffno = PageGetMaxOffsetNumber(page);
    for (offno = FirstOffsetNumber;
       offno <= maxoffno;
       offno = OffsetNumberNext(offno))
    {
      ItemPointer htup;
      IndexTuple  itup;
      Bucket    bucket;
      bool    kill_tuple = false;

      itup = (IndexTuple) PageGetItem(page,
                      PageGetItemId(page, offno));
      htup = &(itup->t_tid);

      /*
       * To remove the dead tuples, we strictly want to rely on the
       * results of the callback function; refer to btvacuumpage for the
       * detailed reason.
       */
      if (callback && callback(htup, callback_state))
      {
        kill_tuple = true;
        if (tuples_removed)
          *tuples_removed += 1;
      }
      else if (split_cleanup)
      {
        /* delete the tuples that are moved by split. */
        bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                        maxbucket,
                        highmask,
                        lowmask);
        /* mark the item for deletion */
        if (bucket != cur_bucket)
        {
          /*
           * We expect tuples to belong to either the current bucket
           * or new_bucket.  This is ensured because we don't allow
           * further splits from a bucket that contains garbage. See
           * comments in _hash_expandtable.
           */
          Assert(bucket == new_bucket);
          kill_tuple = true;
        }
      }

      if (kill_tuple)
      {
        /* mark the item for deletion */
        deletable[ndeletable++] = offno;
      }
      else
      {
        /* we're keeping it, so count it */
        if (num_index_tuples)
          *num_index_tuples += 1;
      }
    }

    /* retain the pin on primary bucket page till end of bucket scan */
    if (blkno == bucket_blkno)
      retain_pin = true;
    else
      retain_pin = false;

    blkno = opaque->hasho_nextblkno;

    /*
     * Apply deletions, advance to next page and write page if needed.
     */
    if (ndeletable > 0)
    {
      /* No ereport(ERROR) until changes are logged */
      START_CRIT_SECTION();

      PageIndexMultiDelete(page, deletable, ndeletable);
      bucket_dirty = true;

      /*
       * Mark the page as clean if vacuum removes the DEAD tuples from
       * an index page.  We do this by clearing the
       * LH_PAGE_HAS_DEAD_TUPLES flag.
       */
      if (tuples_removed && *tuples_removed > 0 &&
        H_HAS_DEAD_TUPLES(opaque))
      {
        opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
        clear_dead_marking = true;
      }

      MarkBufferDirty(buf);

      /* XLOG stuff */
      if (RelationNeedsWAL(rel))
      {
        xl_hash_delete xlrec;
        XLogRecPtr  recptr;

        xlrec.clear_dead_marking = clear_dead_marking;
        xlrec.is_primary_bucket_page = (buf == bucket_buf);

        XLogBeginInsert();
        XLogRegisterData(&xlrec, SizeOfHashDelete);

        /*
         * The bucket buffer was not changed, but it still needs to be
         * registered to ensure that we can acquire a cleanup lock on
         * it during replay.
         */
        if (!xlrec.is_primary_bucket_page)
        {
          uint8   flags = REGBUF_STANDARD | REGBUF_NO_IMAGE | REGBUF_NO_CHANGE;

          XLogRegisterBuffer(0, bucket_buf, flags);
        }

        XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
        XLogRegisterBufData(1, deletable,
                  ndeletable * sizeof(OffsetNumber));

        recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
        PageSetLSN(BufferGetPage(buf), recptr);
      }

      END_CRIT_SECTION();
    }

    /* bail out if there are no more pages to scan. */
    if (!BlockNumberIsValid(blkno))
      break;

    next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                        LH_OVERFLOW_PAGE,
                        bstrategy);

    /*
     * release the lock on the previous page only after acquiring the lock
     * on the next page
     */
    if (retain_pin)
      LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    else
      _hash_relbuf(rel, buf);

    buf = next_buf;
  }

  /*
   * Lock the bucket page to clear the garbage flag and squeeze the bucket.
   * If the current buffer is the same as the bucket buffer, then we already
   * have a lock on the bucket page.
   */
  if (buf != bucket_buf)
  {
    _hash_relbuf(rel, buf);
    LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
  }

  /*
   * Clear the garbage flag from the bucket after deleting the tuples that
   * were moved by a split.  We purposefully clear the flag before squeezing
   * the bucket, so that after a restart, vacuum doesn't again try to delete
   * the moved-by-split tuples.
   */
  if (split_cleanup)
  {
    HashPageOpaque bucket_opaque;
    Page    page;

    page = BufferGetPage(bucket_buf);
    bucket_opaque = HashPageGetOpaque(page);

    /* No ereport(ERROR) until changes are logged */
    START_CRIT_SECTION();

    bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
    MarkBufferDirty(bucket_buf);

    /* XLOG stuff */
    if (RelationNeedsWAL(rel))
    {
      XLogRecPtr  recptr;

      XLogBeginInsert();
      XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);

      recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
      PageSetLSN(page, recptr);
    }

    END_CRIT_SECTION();
  }

  /*
   * If we have deleted anything, try to compact free space.  For squeezing
   * the bucket, we must have a cleanup lock, else it can impact the
   * ordering of tuples for a scan that has started before it.
   */
  if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
    _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
              bstrategy);
  else
    LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
}

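/*
 * A simplified sketch of the lock-chaining walk used by hashbucketcleanup()
 * above (helper name hypothetical; the real per-page work and pin handling
 * are reduced to comments).  The key point is that the next overflow page is
 * locked before the lock on the previous page is given up, so a concurrent
 * scan can never overtake the cleanup scan within the bucket chain.
 */
static void
example_walk_bucket_chain(Relation rel, Buffer bucket_buf,
              BufferAccessStrategy bstrategy)
{
  Buffer    buf = bucket_buf;

  for (;;)
  {
    Page    page = BufferGetPage(buf);
    HashPageOpaque opaque = HashPageGetOpaque(page);
    BlockNumber nextblkno = opaque->hasho_nextblkno;
    Buffer    next_buf;

    /* ... per-page cleanup would happen here ... */

    if (!BlockNumberIsValid(nextblkno))
      break;

    /* lock the next page in the chain before letting go of this one */
    next_buf = _hash_getbuf_with_strategy(rel, nextblkno, HASH_WRITE,
                        LH_OVERFLOW_PAGE, bstrategy);

    if (buf == bucket_buf)
      LockBuffer(buf, BUFFER_LOCK_UNLOCK);  /* keep the primary page pinned */
    else
      _hash_relbuf(rel, buf);

    buf = next_buf;
  }
}
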
CompareType
hashtranslatestrategy(StrategyNumber strategy, Oid opfamily)
{
  if (strategy == HTEqualStrategyNumber)
    return COMPARE_EQ;
  return COMPARE_INVALID;
}

StrategyNumber
hashtranslatecmptype(CompareType cmptype, Oid opfamily)
{
  if (cmptype == COMPARE_EQ)
    return HTEqualStrategyNumber;
  return InvalidStrategy;
}
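
/*
 * A tiny illustrative check (not part of hash.c) of the two translation
 * callbacks above: equality is the only comparison a hash opclass supports,
 * so everything else maps to COMPARE_INVALID / InvalidStrategy.  The helper
 * name and the use of InvalidOid as a placeholder opfamily are assumptions.
 */
static void
example_translation_roundtrip(void)
{
  Assert(hashtranslatestrategy(HTEqualStrategyNumber, InvalidOid) == COMPARE_EQ);
  Assert(hashtranslatecmptype(COMPARE_EQ, InvalidOid) == HTEqualStrategyNumber);
  Assert(hashtranslatecmptype(COMPARE_LT, InvalidOid) == InvalidStrategy);
}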