Coverage Report

Created: 2025-09-27 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/postgres/src/backend/lib/dshash.c
Line
Count
Source
1
/*-------------------------------------------------------------------------
2
 *
3
 * dshash.c
4
 *    Concurrent hash tables backed by dynamic shared memory areas.
5
 *
6
 * This is an open hashing hash table, with a linked list at each table
7
 * entry.  It supports dynamic resizing, as required to prevent the linked
8
 * lists from growing too long on average.  Currently, only growing is
9
 * supported: the hash table never becomes smaller.
10
 *
11
 * To deal with concurrency, it has a fixed size set of partitions, each of
12
 * which is independently locked.  Each bucket maps to a partition; so insert,
13
 * find and iterate operations normally only acquire one lock.  Therefore,
14
 * good concurrency is achieved whenever such operations don't collide at the
15
 * lock partition level.  However, when a resize operation begins, all
16
 * partition locks must be acquired simultaneously for a brief period.  This
17
 * is only expected to happen a small number of times until a stable size is
18
 * found, since growth is geometric.
19
 *
20
 * Future versions may support iterators and incremental resizing; for now
21
 * the implementation is minimalist.
22
 *
23
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
24
 * Portions Copyright (c) 1994, Regents of the University of California
25
 *
26
 * IDENTIFICATION
27
 *    src/backend/lib/dshash.c
28
 *
29
 *-------------------------------------------------------------------------
30
 */
31
32
#include "postgres.h"
33
34
#include "common/hashfn.h"
35
#include "lib/dshash.h"
36
#include "storage/lwlock.h"
37
#include "utils/dsa.h"
38
39
/*
40
 * An item in the hash table.  This wraps the user's entry object in an
41
 * envelope that holds a pointer back to the bucket and a pointer to the next
42
 * item in the bucket.
43
 */
44
struct dshash_table_item
45
{
46
  /* The next item in the same bucket. */
47
  dsa_pointer next;
48
  /* The hashed key, to avoid having to recompute it. */
49
  dshash_hash hash;
50
  /* The user's entry object follows here.  See ENTRY_FROM_ITEM(item). */
51
};
52
53
/*
54
 * The number of partitions for locking purposes.  This is set to match
55
 * NUM_BUFFER_PARTITIONS for now, on the basis that whatever's good enough for
56
 * the buffer pool must be good enough for any other purpose.  This could
57
 * become a runtime parameter in future.
58
 */
59
0
#define DSHASH_NUM_PARTITIONS_LOG2 7
60
0
#define DSHASH_NUM_PARTITIONS (1 << DSHASH_NUM_PARTITIONS_LOG2)
61
62
/* A magic value used to identify our hash tables. */
63
0
#define DSHASH_MAGIC 0x75ff6a20
64
65
/*
66
 * Tracking information for each lock partition.  Initially, each partition
67
 * corresponds to one bucket, but each time the hash table grows, the buckets
68
 * covered by each partition split so the number of buckets covered doubles.
69
 *
70
 * We might want to add padding here so that each partition is on a different
71
 * cache line, but doing so would bloat this structure considerably.
72
 */
73
typedef struct dshash_partition
74
{
75
  LWLock    lock;     /* Protects all buckets in this partition. */
76
  size_t    count;      /* # of items in this partition's buckets */
77
} dshash_partition;
78
79
/*
80
 * The head object for a hash table.  This will be stored in dynamic shared
81
 * memory.
82
 */
83
typedef struct dshash_table_control
84
{
85
  dshash_table_handle handle;
86
  uint32    magic;
87
  dshash_partition partitions[DSHASH_NUM_PARTITIONS];
88
  int     lwlock_tranche_id;
89
90
  /*
91
   * The following members are written to only when ALL partitions locks are
92
   * held.  They can be read when any one partition lock is held.
93
   */
94
95
  /* Number of buckets expressed as power of 2 (8 = 256 buckets). */
96
  size_t    size_log2;    /* log2(number of buckets) */
97
  dsa_pointer buckets;    /* current bucket array */
98
} dshash_table_control;
99
100
/*
101
 * Per-backend state for a dynamic hash table.
102
 */
103
struct dshash_table
104
{
105
  dsa_area   *area;     /* Backing dynamic shared memory area. */
106
  dshash_parameters params; /* Parameters. */
107
  void     *arg;      /* User-supplied data pointer. */
108
  dshash_table_control *control;  /* Control object in DSM. */
109
  dsa_pointer *buckets;   /* Current bucket pointers in DSM. */
110
  size_t    size_log2;    /* log2(number of buckets) */
111
};
112
113
/* Given a pointer to an item, find the entry (user data) it holds. */
114
#define ENTRY_FROM_ITEM(item) \
115
0
  ((char *)(item) + MAXALIGN(sizeof(dshash_table_item)))
116
117
/* Given a pointer to an entry, find the item that holds it. */
118
#define ITEM_FROM_ENTRY(entry)                      \
119
0
  ((dshash_table_item *)((char *)(entry) -              \
120
0
               MAXALIGN(sizeof(dshash_table_item))))
121
122
/* How many resize operations (bucket splits) have there been? */
123
#define NUM_SPLITS(size_log2)         \
124
0
  (size_log2 - DSHASH_NUM_PARTITIONS_LOG2)
125
126
/* How many buckets are there in a given size? */
127
#define NUM_BUCKETS(size_log2)    \
128
0
  (((size_t) 1) << (size_log2))
129
130
/* How many buckets are there in each partition at a given size? */
131
#define BUCKETS_PER_PARTITION(size_log2)    \
132
0
  (((size_t) 1) << NUM_SPLITS(size_log2))
133
134
/* Max entries before we need to grow.  Half + quarter = 75% load factor. */
135
#define MAX_COUNT_PER_PARTITION(hash_table)       \
136
0
  (BUCKETS_PER_PARTITION(hash_table->size_log2) / 2 + \
137
0
   BUCKETS_PER_PARTITION(hash_table->size_log2) / 4)
138
139
/* Choose partition based on the highest order bits of the hash. */
140
#define PARTITION_FOR_HASH(hash)                    \
141
0
  (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - DSHASH_NUM_PARTITIONS_LOG2))
142
143
/*
144
 * Find the bucket index for a given hash and table size.  Each time the table
145
 * doubles in size, the appropriate bucket for a given hash value doubles and
146
 * possibly adds one, depending on the newly revealed bit, so that all buckets
147
 * are split.
148
 */
149
#define BUCKET_INDEX_FOR_HASH_AND_SIZE(hash, size_log2)   \
150
0
  (hash >> ((sizeof(dshash_hash) * CHAR_BIT) - (size_log2)))
151
152
/* The index of the first bucket in a given partition. */
153
#define BUCKET_INDEX_FOR_PARTITION(partition, size_log2)  \
154
0
  ((partition) << NUM_SPLITS(size_log2))
155
156
/* Choose partition based on bucket index. */
157
#define PARTITION_FOR_BUCKET_INDEX(bucket_idx, size_log2)       \
158
0
  ((bucket_idx) >> NUM_SPLITS(size_log2))
159
160
/* The head of the active bucket for a given hash value (lvalue). */
161
#define BUCKET_FOR_HASH(hash_table, hash)               \
162
0
  (hash_table->buckets[                       \
163
0
    BUCKET_INDEX_FOR_HASH_AND_SIZE(hash,              \
164
0
                     hash_table->size_log2)])
165
166
static void delete_item(dshash_table *hash_table,
167
            dshash_table_item *item);
168
static void resize(dshash_table *hash_table, size_t new_size_log2);
169
static inline void ensure_valid_bucket_pointers(dshash_table *hash_table);
170
static inline dshash_table_item *find_in_bucket(dshash_table *hash_table,
171
                        const void *key,
172
                        dsa_pointer item_pointer);
173
static void insert_item_into_bucket(dshash_table *hash_table,
174
                  dsa_pointer item_pointer,
175
                  dshash_table_item *item,
176
                  dsa_pointer *bucket);
177
static dshash_table_item *insert_into_bucket(dshash_table *hash_table,
178
                       const void *key,
179
                       dsa_pointer *bucket);
180
static bool delete_key_from_bucket(dshash_table *hash_table,
181
                   const void *key,
182
                   dsa_pointer *bucket_head);
183
static bool delete_item_from_bucket(dshash_table *hash_table,
184
                  dshash_table_item *item,
185
                  dsa_pointer *bucket_head);
186
static inline dshash_hash hash_key(dshash_table *hash_table, const void *key);
187
static inline bool equal_keys(dshash_table *hash_table,
188
                const void *a, const void *b);
189
static inline void copy_key(dshash_table *hash_table, void *dest,
190
              const void *src);
191
192
#define PARTITION_LOCK(hash_table, i)     \
193
0
  (&(hash_table)->control->partitions[(i)].lock)
194
195
#define ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table) \
196
0
  Assert(!LWLockAnyHeldByMe(&(hash_table)->control->partitions[0].lock, \
197
0
       DSHASH_NUM_PARTITIONS, sizeof(dshash_partition)))
198
199
/*
200
 * Create a new hash table backed by the given dynamic shared area, with the
201
 * given parameters.  The returned object is allocated in backend-local memory
202
 * using the current MemoryContext.  'arg' will be passed through to the
203
 * compare, hash, and copy functions.
204
 */
205
dshash_table *
206
dshash_create(dsa_area *area, const dshash_parameters *params, void *arg)
207
0
{
208
0
  dshash_table *hash_table;
209
0
  dsa_pointer control;
210
211
  /* Allocate the backend-local object representing the hash table. */
212
0
  hash_table = palloc(sizeof(dshash_table));
213
214
  /* Allocate the control object in shared memory. */
215
0
  control = dsa_allocate(area, sizeof(dshash_table_control));
216
217
  /* Set up the local and shared hash table structs. */
218
0
  hash_table->area = area;
219
0
  hash_table->params = *params;
220
0
  hash_table->arg = arg;
221
0
  hash_table->control = dsa_get_address(area, control);
222
0
  hash_table->control->handle = control;
223
0
  hash_table->control->magic = DSHASH_MAGIC;
224
0
  hash_table->control->lwlock_tranche_id = params->tranche_id;
225
226
  /* Set up the array of lock partitions. */
227
0
  {
228
0
    dshash_partition *partitions = hash_table->control->partitions;
229
0
    int     tranche_id = hash_table->control->lwlock_tranche_id;
230
0
    int     i;
231
232
0
    for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
233
0
    {
234
0
      LWLockInitialize(&partitions[i].lock, tranche_id);
235
0
      partitions[i].count = 0;
236
0
    }
237
0
  }
238
239
  /*
240
   * Set up the initial array of buckets.  Our initial size is the same as
241
   * the number of partitions.
242
   */
243
0
  hash_table->control->size_log2 = DSHASH_NUM_PARTITIONS_LOG2;
244
0
  hash_table->control->buckets =
245
0
    dsa_allocate_extended(area,
246
0
                sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS,
247
0
                DSA_ALLOC_NO_OOM | DSA_ALLOC_ZERO);
248
0
  if (!DsaPointerIsValid(hash_table->control->buckets))
249
0
  {
250
0
    dsa_free(area, control);
251
0
    ereport(ERROR,
252
0
        (errcode(ERRCODE_OUT_OF_MEMORY),
253
0
         errmsg("out of memory"),
254
0
         errdetail("Failed on DSA request of size %zu.",
255
0
               sizeof(dsa_pointer) * DSHASH_NUM_PARTITIONS)));
256
0
  }
257
0
  hash_table->buckets = dsa_get_address(area,
258
0
                      hash_table->control->buckets);
259
0
  hash_table->size_log2 = hash_table->control->size_log2;
260
261
0
  return hash_table;
262
0
}
263
264
/*
265
 * Attach to an existing hash table using a handle.  The returned object is
266
 * allocated in backend-local memory using the current MemoryContext.  'arg'
267
 * will be passed through to the compare and hash functions.
268
 */
269
dshash_table *
270
dshash_attach(dsa_area *area, const dshash_parameters *params,
271
        dshash_table_handle handle, void *arg)
272
0
{
273
0
  dshash_table *hash_table;
274
0
  dsa_pointer control;
275
276
  /* Allocate the backend-local object representing the hash table. */
277
0
  hash_table = palloc(sizeof(dshash_table));
278
279
  /* Find the control object in shared memory. */
280
0
  control = handle;
281
282
  /* Set up the local hash table struct. */
283
0
  hash_table->area = area;
284
0
  hash_table->params = *params;
285
0
  hash_table->arg = arg;
286
0
  hash_table->control = dsa_get_address(area, control);
287
0
  Assert(hash_table->control->magic == DSHASH_MAGIC);
288
289
  /*
290
   * These will later be set to the correct values by
291
   * ensure_valid_bucket_pointers(), at which time we'll be holding a
292
   * partition lock for interlocking against concurrent resizing.
293
   */
294
0
  hash_table->buckets = NULL;
295
0
  hash_table->size_log2 = 0;
296
297
0
  return hash_table;
298
0
}
299
300
/*
301
 * Detach from a hash table.  This frees backend-local resources associated
302
 * with the hash table, but the hash table will continue to exist until it is
303
 * either explicitly destroyed (by a backend that is still attached to it), or
304
 * the area that backs it is returned to the operating system.
305
 */
306
void
307
dshash_detach(dshash_table *hash_table)
308
0
{
309
0
  ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
310
311
  /* The hash table may have been destroyed.  Just free local memory. */
312
0
  pfree(hash_table);
313
0
}
314
315
/*
316
 * Destroy a hash table, returning all memory to the area.  The caller must be
317
 * certain that no other backend will attempt to access the hash table before
318
 * calling this function.  Other backends must explicitly call dshash_detach to
319
 * free up backend-local memory associated with the hash table.  The backend
320
 * that calls dshash_destroy must not call dshash_detach.
321
 */
322
void
323
dshash_destroy(dshash_table *hash_table)
324
0
{
325
0
  size_t    size;
326
0
  size_t    i;
327
328
0
  Assert(hash_table->control->magic == DSHASH_MAGIC);
329
0
  ensure_valid_bucket_pointers(hash_table);
330
331
  /* Free all the entries. */
332
0
  size = NUM_BUCKETS(hash_table->size_log2);
333
0
  for (i = 0; i < size; ++i)
334
0
  {
335
0
    dsa_pointer item_pointer = hash_table->buckets[i];
336
337
0
    while (DsaPointerIsValid(item_pointer))
338
0
    {
339
0
      dshash_table_item *item;
340
0
      dsa_pointer next_item_pointer;
341
342
0
      item = dsa_get_address(hash_table->area, item_pointer);
343
0
      next_item_pointer = item->next;
344
0
      dsa_free(hash_table->area, item_pointer);
345
0
      item_pointer = next_item_pointer;
346
0
    }
347
0
  }
348
349
  /*
350
   * Vandalize the control block to help catch programming errors where
351
   * other backends access the memory formerly occupied by this hash table.
352
   */
353
0
  hash_table->control->magic = 0;
354
355
  /* Free the active table and control object. */
356
0
  dsa_free(hash_table->area, hash_table->control->buckets);
357
0
  dsa_free(hash_table->area, hash_table->control->handle);
358
359
0
  pfree(hash_table);
360
0
}
361
362
/*
363
 * Get a handle that can be used by other processes to attach to this hash
364
 * table.
365
 */
366
dshash_table_handle
367
dshash_get_hash_table_handle(dshash_table *hash_table)
368
0
{
369
0
  Assert(hash_table->control->magic == DSHASH_MAGIC);
370
371
0
  return hash_table->control->handle;
372
0
}
373
374
/*
375
 * Look up an entry, given a key.  Returns a pointer to an entry if one can be
376
 * found with the given key.  Returns NULL if the key is not found.  If a
377
 * non-NULL value is returned, the entry is locked and must be released by
378
 * calling dshash_release_lock.  If an error is raised before
379
 * dshash_release_lock is called, the lock will be released automatically, but
380
 * the caller must take care to ensure that the entry is not left corrupted.
381
 * The lock mode is either shared or exclusive depending on 'exclusive'.
382
 *
383
 * The caller must not hold a lock already.
384
 *
385
 * Note that the lock held is in fact an LWLock, so interrupts will be held on
386
 * return from this function, and not resumed until dshash_release_lock is
387
 * called.  It is a very good idea for the caller to release the lock quickly.
388
 */
389
void *
390
dshash_find(dshash_table *hash_table, const void *key, bool exclusive)
391
0
{
392
0
  dshash_hash hash;
393
0
  size_t    partition;
394
0
  dshash_table_item *item;
395
396
0
  hash = hash_key(hash_table, key);
397
0
  partition = PARTITION_FOR_HASH(hash);
398
399
0
  Assert(hash_table->control->magic == DSHASH_MAGIC);
400
0
  ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
401
402
0
  LWLockAcquire(PARTITION_LOCK(hash_table, partition),
403
0
          exclusive ? LW_EXCLUSIVE : LW_SHARED);
404
0
  ensure_valid_bucket_pointers(hash_table);
405
406
  /* Search the active bucket. */
407
0
  item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));
408
409
0
  if (!item)
410
0
  {
411
    /* Not found. */
412
0
    LWLockRelease(PARTITION_LOCK(hash_table, partition));
413
0
    return NULL;
414
0
  }
415
0
  else
416
0
  {
417
    /* The caller will free the lock by calling dshash_release_lock. */
418
0
    return ENTRY_FROM_ITEM(item);
419
0
  }
420
0
}
421
422
/*
423
 * Returns a pointer to an exclusively locked item which must be released with
424
 * dshash_release_lock.  If the key is found in the hash table, 'found' is set
425
 * to true and a pointer to the existing entry is returned.  If the key is not
426
 * found, 'found' is set to false, and a pointer to a newly created entry is
427
 * returned.
428
 *
429
 * Notes above dshash_find() regarding locking and error handling equally
430
 * apply here.
431
 */
432
void *
433
dshash_find_or_insert(dshash_table *hash_table,
434
            const void *key,
435
            bool *found)
436
0
{
437
0
  dshash_hash hash;
438
0
  size_t    partition_index;
439
0
  dshash_partition *partition;
440
0
  dshash_table_item *item;
441
442
0
  hash = hash_key(hash_table, key);
443
0
  partition_index = PARTITION_FOR_HASH(hash);
444
0
  partition = &hash_table->control->partitions[partition_index];
445
446
0
  Assert(hash_table->control->magic == DSHASH_MAGIC);
447
0
  ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
448
449
0
restart:
450
0
  LWLockAcquire(PARTITION_LOCK(hash_table, partition_index),
451
0
          LW_EXCLUSIVE);
452
0
  ensure_valid_bucket_pointers(hash_table);
453
454
  /* Search the active bucket. */
455
0
  item = find_in_bucket(hash_table, key, BUCKET_FOR_HASH(hash_table, hash));
456
457
0
  if (item)
458
0
    *found = true;
459
0
  else
460
0
  {
461
0
    *found = false;
462
463
    /* Check if we are getting too full. */
464
0
    if (partition->count > MAX_COUNT_PER_PARTITION(hash_table))
465
0
    {
466
      /*
467
       * The load factor (= keys / buckets) for all buckets protected by
468
       * this partition is > 0.75.  Presumably the same applies
469
       * generally across the whole hash table (though we don't attempt
470
       * to track that directly to avoid contention on some kind of
471
       * central counter; we just assume that this partition is
472
       * representative).  This is a good time to resize.
473
       *
474
       * Give up our existing lock first, because resizing needs to
475
       * reacquire all the locks in the right order to avoid deadlocks.
476
       */
477
0
      LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
478
0
      resize(hash_table, hash_table->size_log2 + 1);
479
480
0
      goto restart;
481
0
    }
482
483
    /* Finally we can try to insert the new item. */
484
0
    item = insert_into_bucket(hash_table, key,
485
0
                  &BUCKET_FOR_HASH(hash_table, hash));
486
0
    item->hash = hash;
487
    /* Adjust per-lock-partition counter for load factor knowledge. */
488
0
    ++partition->count;
489
0
  }
490
491
  /* The caller must release the lock with dshash_release_lock. */
492
0
  return ENTRY_FROM_ITEM(item);
493
0
}
494
495
/*
496
 * Remove an entry by key.  Returns true if the key was found and the
497
 * corresponding entry was removed.
498
 *
499
 * To delete an entry that you already have a pointer to, see
500
 * dshash_delete_entry.
501
 */
502
bool
503
dshash_delete_key(dshash_table *hash_table, const void *key)
504
0
{
505
0
  dshash_hash hash;
506
0
  size_t    partition;
507
0
  bool    found;
508
509
0
  Assert(hash_table->control->magic == DSHASH_MAGIC);
510
0
  ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
511
512
0
  hash = hash_key(hash_table, key);
513
0
  partition = PARTITION_FOR_HASH(hash);
514
515
0
  LWLockAcquire(PARTITION_LOCK(hash_table, partition), LW_EXCLUSIVE);
516
0
  ensure_valid_bucket_pointers(hash_table);
517
518
0
  if (delete_key_from_bucket(hash_table, key,
519
0
                 &BUCKET_FOR_HASH(hash_table, hash)))
520
0
  {
521
0
    Assert(hash_table->control->partitions[partition].count > 0);
522
0
    found = true;
523
0
    --hash_table->control->partitions[partition].count;
524
0
  }
525
0
  else
526
0
    found = false;
527
528
0
  LWLockRelease(PARTITION_LOCK(hash_table, partition));
529
530
0
  return found;
531
0
}
532
533
/*
534
 * Remove an entry.  The entry must already be exclusively locked, and must
535
 * have been obtained by dshash_find or dshash_find_or_insert.  Note that this
536
 * function releases the lock just like dshash_release_lock.
537
 *
538
 * To delete an entry by key, see dshash_delete_key.
539
 */
540
void
541
dshash_delete_entry(dshash_table *hash_table, void *entry)
542
0
{
543
0
  dshash_table_item *item = ITEM_FROM_ENTRY(entry);
544
0
  size_t    partition = PARTITION_FOR_HASH(item->hash);
545
546
0
  Assert(hash_table->control->magic == DSHASH_MAGIC);
547
0
  Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
548
0
                LW_EXCLUSIVE));
549
550
0
  delete_item(hash_table, item);
551
0
  LWLockRelease(PARTITION_LOCK(hash_table, partition));
552
0
}
553
554
/*
555
 * Unlock an entry which was locked by dshash_find or dshash_find_or_insert.
556
 */
557
void
558
dshash_release_lock(dshash_table *hash_table, void *entry)
559
0
{
560
0
  dshash_table_item *item = ITEM_FROM_ENTRY(entry);
561
0
  size_t    partition_index = PARTITION_FOR_HASH(item->hash);
562
563
0
  Assert(hash_table->control->magic == DSHASH_MAGIC);
564
565
0
  LWLockRelease(PARTITION_LOCK(hash_table, partition_index));
566
0
}
567
568
/*
569
 * A compare function that forwards to memcmp.
570
 */
571
int
572
dshash_memcmp(const void *a, const void *b, size_t size, void *arg)
573
0
{
574
0
  return memcmp(a, b, size);
575
0
}
576
577
/*
578
 * A hash function that forwards to tag_hash.
579
 */
580
dshash_hash
581
dshash_memhash(const void *v, size_t size, void *arg)
582
0
{
583
0
  return tag_hash(v, size);
584
0
}
585
586
/*
587
 * A copy function that forwards to memcpy.
588
 */
589
void
590
dshash_memcpy(void *dest, const void *src, size_t size, void *arg)
591
0
{
592
0
  (void) memcpy(dest, src, size);
593
0
}
594
595
/*
596
 * A compare function that forwards to strcmp.
597
 */
598
int
599
dshash_strcmp(const void *a, const void *b, size_t size, void *arg)
600
0
{
601
0
  Assert(strlen((const char *) a) < size);
602
0
  Assert(strlen((const char *) b) < size);
603
604
0
  return strcmp((const char *) a, (const char *) b);
605
0
}
606
607
/*
608
 * A hash function that forwards to string_hash.
609
 */
610
dshash_hash
611
dshash_strhash(const void *v, size_t size, void *arg)
612
0
{
613
0
  Assert(strlen((const char *) v) < size);
614
615
0
  return string_hash((const char *) v, size);
616
0
}
617
618
/*
619
 * A copy function that forwards to strcpy.
620
 */
621
void
622
dshash_strcpy(void *dest, const void *src, size_t size, void *arg)
623
0
{
624
0
  Assert(strlen((const char *) src) < size);
625
626
0
  (void) strcpy((char *) dest, (const char *) src);
627
0
}
628
629
/*
630
 * Sequentially scan through dshash table and return all the elements one by
631
 * one, return NULL when all elements have been returned.
632
 *
633
 * dshash_seq_term needs to be called when a scan finishes.  The caller may
634
 * delete returned elements in the midst of a scan by using dshash_delete_current()
635
 * if exclusive = true.
636
 */
637
void
638
dshash_seq_init(dshash_seq_status *status, dshash_table *hash_table,
639
        bool exclusive)
640
0
{
641
0
  status->hash_table = hash_table;
642
0
  status->curbucket = 0;
643
0
  status->nbuckets = 0;
644
0
  status->curitem = NULL;
645
0
  status->pnextitem = InvalidDsaPointer;
646
0
  status->curpartition = -1;
647
0
  status->exclusive = exclusive;
648
0
}
649
650
/*
651
 * Returns the next element.
652
 *
653
 * Returned elements are locked and the caller may not release the lock. It is
654
 * released by future calls to dshash_seq_next() or dshash_seq_term().
655
 */
656
void *
657
dshash_seq_next(dshash_seq_status *status)
658
0
{
659
0
  dsa_pointer next_item_pointer;
660
661
  /*
662
   * Not yet holding any partition locks. Need to determine the size of the
663
   * hash table, it could have been resized since we were looking last.
664
   * Since we iterate in partition order, we can start by unconditionally
665
   * lock partition 0.
666
   *
667
   * Once we hold the lock, no resizing can happen until the scan ends. So
668
   * we don't need to repeatedly call ensure_valid_bucket_pointers().
669
   */
670
0
  if (status->curpartition == -1)
671
0
  {
672
0
    Assert(status->curbucket == 0);
673
0
    ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(status->hash_table);
674
675
0
    status->curpartition = 0;
676
677
0
    LWLockAcquire(PARTITION_LOCK(status->hash_table,
678
0
                   status->curpartition),
679
0
            status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
680
681
0
    ensure_valid_bucket_pointers(status->hash_table);
682
683
0
    status->nbuckets =
684
0
      NUM_BUCKETS(status->hash_table->control->size_log2);
685
0
    next_item_pointer = status->hash_table->buckets[status->curbucket];
686
0
  }
687
0
  else
688
0
    next_item_pointer = status->pnextitem;
689
690
0
  Assert(LWLockHeldByMeInMode(PARTITION_LOCK(status->hash_table,
691
0
                         status->curpartition),
692
0
                status->exclusive ? LW_EXCLUSIVE : LW_SHARED));
693
694
  /* Move to the next bucket if we finished the current bucket */
695
0
  while (!DsaPointerIsValid(next_item_pointer))
696
0
  {
697
0
    int     next_partition;
698
699
0
    if (++status->curbucket >= status->nbuckets)
700
0
    {
701
      /* all buckets have been scanned. finish. */
702
0
      return NULL;
703
0
    }
704
705
    /* Check if move to the next partition */
706
0
    next_partition =
707
0
      PARTITION_FOR_BUCKET_INDEX(status->curbucket,
708
0
                     status->hash_table->size_log2);
709
710
0
    if (status->curpartition != next_partition)
711
0
    {
712
      /*
713
       * Move to the next partition. Lock the next partition then
714
       * release the current, not in the reverse order to avoid
715
       * concurrent resizing.  Avoid deadlock by taking locks in the
716
       * same order as resize().
717
       */
718
0
      LWLockAcquire(PARTITION_LOCK(status->hash_table,
719
0
                     next_partition),
720
0
              status->exclusive ? LW_EXCLUSIVE : LW_SHARED);
721
0
      LWLockRelease(PARTITION_LOCK(status->hash_table,
722
0
                     status->curpartition));
723
0
      status->curpartition = next_partition;
724
0
    }
725
726
0
    next_item_pointer = status->hash_table->buckets[status->curbucket];
727
0
  }
728
729
0
  status->curitem =
730
0
    dsa_get_address(status->hash_table->area, next_item_pointer);
731
732
  /*
733
   * The caller may delete the item. Store the next item in case of
734
   * deletion.
735
   */
736
0
  status->pnextitem = status->curitem->next;
737
738
0
  return ENTRY_FROM_ITEM(status->curitem);
739
0
}
740
741
/*
742
 * Terminates the seqscan and release all locks.
743
 *
744
 * Needs to be called after finishing or when exiting a seqscan.
745
 */
746
void
747
dshash_seq_term(dshash_seq_status *status)
748
0
{
749
0
  if (status->curpartition >= 0)
750
0
    LWLockRelease(PARTITION_LOCK(status->hash_table, status->curpartition));
751
0
}
752
753
/*
754
 * Remove the current entry of the seq scan.
755
 */
756
void
757
dshash_delete_current(dshash_seq_status *status)
758
0
{
759
0
  dshash_table *hash_table = status->hash_table;
760
0
  dshash_table_item *item = status->curitem;
761
0
  size_t    partition PG_USED_FOR_ASSERTS_ONLY;
762
763
0
  partition = PARTITION_FOR_HASH(item->hash);
764
765
0
  Assert(status->exclusive);
766
0
  Assert(hash_table->control->magic == DSHASH_MAGIC);
767
0
  Assert(LWLockHeldByMeInMode(PARTITION_LOCK(hash_table, partition),
768
0
                LW_EXCLUSIVE));
769
770
0
  delete_item(hash_table, item);
771
0
}
772
773
/*
774
 * Print debugging information about the internal state of the hash table to
775
 * stderr.  The caller must hold no partition locks.
776
 */
777
void
778
dshash_dump(dshash_table *hash_table)
779
0
{
780
0
  size_t    i;
781
0
  size_t    j;
782
783
0
  Assert(hash_table->control->magic == DSHASH_MAGIC);
784
0
  ASSERT_NO_PARTITION_LOCKS_HELD_BY_ME(hash_table);
785
786
0
  for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
787
0
  {
788
0
    Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));
789
0
    LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_SHARED);
790
0
  }
791
792
0
  ensure_valid_bucket_pointers(hash_table);
793
794
0
  fprintf(stderr,
795
0
      "hash table size = %zu\n", (size_t) 1 << hash_table->size_log2);
796
0
  for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
797
0
  {
798
0
    dshash_partition *partition = &hash_table->control->partitions[i];
799
0
    size_t    begin = BUCKET_INDEX_FOR_PARTITION(i, hash_table->size_log2);
800
0
    size_t    end = BUCKET_INDEX_FOR_PARTITION(i + 1, hash_table->size_log2);
801
802
0
    fprintf(stderr, "  partition %zu\n", i);
803
0
    fprintf(stderr,
804
0
        "    active buckets (key count = %zu)\n", partition->count);
805
806
0
    for (j = begin; j < end; ++j)
807
0
    {
808
0
      size_t    count = 0;
809
0
      dsa_pointer bucket = hash_table->buckets[j];
810
811
0
      while (DsaPointerIsValid(bucket))
812
0
      {
813
0
        dshash_table_item *item;
814
815
0
        item = dsa_get_address(hash_table->area, bucket);
816
817
0
        bucket = item->next;
818
0
        ++count;
819
0
      }
820
0
      fprintf(stderr, "      bucket %zu (key count = %zu)\n", j, count);
821
0
    }
822
0
  }
823
824
0
  for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
825
0
    LWLockRelease(PARTITION_LOCK(hash_table, i));
826
0
}
827
828
/*
829
 * Delete a locked item to which we have a pointer.
830
 */
831
static void
832
delete_item(dshash_table *hash_table, dshash_table_item *item)
833
0
{
834
0
  size_t    hash = item->hash;
835
0
  size_t    partition = PARTITION_FOR_HASH(hash);
836
837
0
  Assert(LWLockHeldByMe(PARTITION_LOCK(hash_table, partition)));
838
839
0
  if (delete_item_from_bucket(hash_table, item,
840
0
                &BUCKET_FOR_HASH(hash_table, hash)))
841
0
  {
842
0
    Assert(hash_table->control->partitions[partition].count > 0);
843
0
    --hash_table->control->partitions[partition].count;
844
0
  }
845
0
  else
846
0
  {
847
0
    Assert(false);
848
0
  }
849
0
}
850
851
/*
 * Grow the hash table if necessary to the requested number of buckets.  The
 * requested size must be double some previously observed size.
 *
 * Must be called without any partition lock held.
 */
static void
resize(dshash_table *hash_table, size_t new_size_log2)
{
	dsa_pointer old_buckets;	/* shared pointer to the outgoing bucket array */
	dsa_pointer new_buckets_shared; /* shared pointer to the new bucket array */
	dsa_pointer *new_buckets;	/* backend-local address of the new array */
	size_t		size;
	size_t		new_size = ((size_t) 1) << new_size_log2;
	size_t		i;

	/*
	 * Acquire the locks for all lock partitions.  This is expensive, but we
	 * shouldn't have to do it many times.  Locks are taken in ascending
	 * partition-index order.
	 */
	for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
	{
		Assert(!LWLockHeldByMe(PARTITION_LOCK(hash_table, i)));

		LWLockAcquire(PARTITION_LOCK(hash_table, i), LW_EXCLUSIVE);

		/*
		 * Check only under the first lock: any concurrent resize would
		 * itself need every partition lock (including this one) before it
		 * could change size_log2, so the value cannot move under us once
		 * we hold lock 0.
		 */
		if (i == 0 && hash_table->control->size_log2 >= new_size_log2)
		{
			/*
			 * Another backend has already increased the size; we can avoid
			 * obtaining all the locks and return early.
			 */
			LWLockRelease(PARTITION_LOCK(hash_table, 0));
			return;
		}
	}

	/* Growth is geometric: we only ever double the table. */
	Assert(new_size_log2 == hash_table->control->size_log2 + 1);

	/* Allocate the space for the new table. */
	new_buckets_shared =
		dsa_allocate_extended(hash_table->area,
							  sizeof(dsa_pointer) * new_size,
							  DSA_ALLOC_HUGE | DSA_ALLOC_ZERO);
	new_buckets = dsa_get_address(hash_table->area, new_buckets_shared);

	/*
	 * We've allocated the new bucket array; all that remains to do now is to
	 * reinsert all items, which amounts to adjusting all the pointers.
	 */
	size = ((size_t) 1) << hash_table->control->size_log2;
	for (i = 0; i < size; ++i)
	{
		dsa_pointer item_pointer = hash_table->buckets[i];

		while (DsaPointerIsValid(item_pointer))
		{
			dshash_table_item *item;
			dsa_pointer next_item_pointer;

			item = dsa_get_address(hash_table->area, item_pointer);
			/* Save the chain link before re-bucketing clobbers item->next. */
			next_item_pointer = item->next;
			insert_item_into_bucket(hash_table, item_pointer, item,
									&new_buckets[BUCKET_INDEX_FOR_HASH_AND_SIZE(item->hash,
																				new_size_log2)]);
			item_pointer = next_item_pointer;
		}
	}

	/* Swap the hash table into place and free the old one. */
	old_buckets = hash_table->control->buckets;
	hash_table->control->buckets = new_buckets_shared;
	hash_table->control->size_log2 = new_size_log2;
	hash_table->buckets = new_buckets;
	dsa_free(hash_table->area, old_buckets);

	/* Release all the locks. */
	for (i = 0; i < DSHASH_NUM_PARTITIONS; ++i)
		LWLockRelease(PARTITION_LOCK(hash_table, i));
}
930
931
/*
932
 * Make sure that our backend-local bucket pointers are up to date.  The
933
 * caller must have locked one lock partition, which prevents resize() from
934
 * running concurrently.
935
 */
936
static inline void
937
ensure_valid_bucket_pointers(dshash_table *hash_table)
938
0
{
939
0
  if (hash_table->size_log2 != hash_table->control->size_log2)
940
0
  {
941
0
    hash_table->buckets = dsa_get_address(hash_table->area,
942
0
                        hash_table->control->buckets);
943
0
    hash_table->size_log2 = hash_table->control->size_log2;
944
0
  }
945
0
}
946
947
/*
948
 * Scan a locked bucket for a match, using the provided compare function.
949
 */
950
static inline dshash_table_item *
951
find_in_bucket(dshash_table *hash_table, const void *key,
952
         dsa_pointer item_pointer)
953
0
{
954
0
  while (DsaPointerIsValid(item_pointer))
955
0
  {
956
0
    dshash_table_item *item;
957
958
0
    item = dsa_get_address(hash_table->area, item_pointer);
959
0
    if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
960
0
      return item;
961
0
    item_pointer = item->next;
962
0
  }
963
0
  return NULL;
964
0
}
965
966
/*
967
 * Insert an already-allocated item into a bucket.
968
 */
969
static void
970
insert_item_into_bucket(dshash_table *hash_table,
971
            dsa_pointer item_pointer,
972
            dshash_table_item *item,
973
            dsa_pointer *bucket)
974
0
{
975
0
  Assert(item == dsa_get_address(hash_table->area, item_pointer));
976
977
0
  item->next = *bucket;
978
0
  *bucket = item_pointer;
979
0
}
980
981
/*
982
 * Allocate space for an entry with the given key and insert it into the
983
 * provided bucket.
984
 */
985
static dshash_table_item *
986
insert_into_bucket(dshash_table *hash_table,
987
           const void *key,
988
           dsa_pointer *bucket)
989
0
{
990
0
  dsa_pointer item_pointer;
991
0
  dshash_table_item *item;
992
993
0
  item_pointer = dsa_allocate(hash_table->area,
994
0
                hash_table->params.entry_size +
995
0
                MAXALIGN(sizeof(dshash_table_item)));
996
0
  item = dsa_get_address(hash_table->area, item_pointer);
997
0
  copy_key(hash_table, ENTRY_FROM_ITEM(item), key);
998
0
  insert_item_into_bucket(hash_table, item_pointer, item, bucket);
999
0
  return item;
1000
0
}
1001
1002
/*
1003
 * Search a bucket for a matching key and delete it.
1004
 */
1005
static bool
1006
delete_key_from_bucket(dshash_table *hash_table,
1007
             const void *key,
1008
             dsa_pointer *bucket_head)
1009
0
{
1010
0
  while (DsaPointerIsValid(*bucket_head))
1011
0
  {
1012
0
    dshash_table_item *item;
1013
1014
0
    item = dsa_get_address(hash_table->area, *bucket_head);
1015
1016
0
    if (equal_keys(hash_table, key, ENTRY_FROM_ITEM(item)))
1017
0
    {
1018
0
      dsa_pointer next;
1019
1020
0
      next = item->next;
1021
0
      dsa_free(hash_table->area, *bucket_head);
1022
0
      *bucket_head = next;
1023
1024
0
      return true;
1025
0
    }
1026
0
    bucket_head = &item->next;
1027
0
  }
1028
0
  return false;
1029
0
}
1030
1031
/*
1032
 * Delete the specified item from the bucket.
1033
 */
1034
static bool
1035
delete_item_from_bucket(dshash_table *hash_table,
1036
            dshash_table_item *item,
1037
            dsa_pointer *bucket_head)
1038
0
{
1039
0
  while (DsaPointerIsValid(*bucket_head))
1040
0
  {
1041
0
    dshash_table_item *bucket_item;
1042
1043
0
    bucket_item = dsa_get_address(hash_table->area, *bucket_head);
1044
1045
0
    if (bucket_item == item)
1046
0
    {
1047
0
      dsa_pointer next;
1048
1049
0
      next = item->next;
1050
0
      dsa_free(hash_table->area, *bucket_head);
1051
0
      *bucket_head = next;
1052
0
      return true;
1053
0
    }
1054
0
    bucket_head = &bucket_item->next;
1055
0
  }
1056
0
  return false;
1057
0
}
1058
1059
/*
1060
 * Compute the hash value for a key.
1061
 */
1062
static inline dshash_hash
1063
hash_key(dshash_table *hash_table, const void *key)
1064
0
{
1065
0
  return hash_table->params.hash_function(key,
1066
0
                      hash_table->params.key_size,
1067
0
                      hash_table->arg);
1068
0
}
1069
1070
/*
1071
 * Check whether two keys compare equal.
1072
 */
1073
static inline bool
1074
equal_keys(dshash_table *hash_table, const void *a, const void *b)
1075
0
{
1076
0
  return hash_table->params.compare_function(a, b,
1077
0
                         hash_table->params.key_size,
1078
0
                         hash_table->arg) == 0;
1079
0
}
1080
1081
/*
1082
 * Copy a key.
1083
 */
1084
static inline void
1085
copy_key(dshash_table *hash_table, void *dest, const void *src)
1086
0
{
1087
0
  hash_table->params.copy_function(dest, src,
1088
0
                   hash_table->params.key_size,
1089
0
                   hash_table->arg);
1090
0
}