/src/postgres/src/backend/access/hash/hashpage.c
Line | Count | Source |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * hashpage.c |
4 | | * Hash table page management code for the Postgres hash access method |
5 | | * |
6 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
7 | | * Portions Copyright (c) 1994, Regents of the University of California |
8 | | * |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/backend/access/hash/hashpage.c |
12 | | * |
13 | | * NOTES |
14 | | * Postgres hash pages look like ordinary relation pages. The opaque |
15 | | * data at high addresses includes information about the page including |
16 | | * whether a page is an overflow page or a true bucket, the bucket |
17 | | * number, and the block numbers of the preceding and following pages |
18 | | * in the same bucket. |
19 | | * |
20 | | * The first page in a hash relation, page zero, is special -- it stores |
21 | | * information describing the hash table; it is referred to as the |
22 | | * "meta page." Pages one and higher store the actual data. |
23 | | * |
24 | | * There are also bitmap pages, which are not manipulated here; |
25 | | * see hashovfl.c. |
26 | | * |
27 | | *------------------------------------------------------------------------- |
28 | | */ |
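| | /*
| |  * Editor's sketch, not part of hashpage.c: the "opaque data at high
| |  * addresses" described above is HashPageOpaqueData. The declaration below
| |  * is paraphrased from access/hash.h (treat the exact layout as an
| |  * assumption); the field names match their use later in this file.
| |  *
| |  *    typedef struct HashPageOpaqueData
| |  *    {
| |  *        BlockNumber hasho_prevblkno;   -- previous page in bucket chain
| |  *        BlockNumber hasho_nextblkno;   -- next page in bucket chain
| |  *        Bucket      hasho_bucket;      -- bucket number this page belongs to
| |  *        uint16      hasho_flag;        -- page type code plus flag bits
| |  *        uint16      hasho_page_id;     -- identifies the page as a hash page
| |  *    } HashPageOpaqueData, *HashPageOpaque;
| |  *
| |  * Code in this file reaches it as:
| |  *
| |  *    HashPageOpaque opaque = HashPageGetOpaque(BufferGetPage(buf));
| |  *    bool is_bucket = (opaque->hasho_flag & LH_BUCKET_PAGE) != 0;
| |  */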
29 | | #include "postgres.h" |
30 | | |
31 | | #include "access/hash.h" |
32 | | #include "access/hash_xlog.h" |
33 | | #include "access/xloginsert.h" |
34 | | #include "miscadmin.h" |
35 | | #include "port/pg_bitutils.h" |
36 | | #include "storage/predicate.h" |
37 | | #include "storage/smgr.h" |
38 | | #include "utils/rel.h" |
39 | | |
40 | | static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, |
41 | | uint32 nblocks); |
42 | | static void _hash_splitbucket(Relation rel, Buffer metabuf, |
43 | | Bucket obucket, Bucket nbucket, |
44 | | Buffer obuf, |
45 | | Buffer nbuf, |
46 | | HTAB *htab, |
47 | | uint32 maxbucket, |
48 | | uint32 highmask, uint32 lowmask); |
49 | | static void log_split_page(Relation rel, Buffer buf); |
50 | | |
51 | | |
52 | | /* |
53 | | * _hash_getbuf() -- Get a buffer by block number for read or write. |
54 | | * |
55 | | * 'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK. |
56 | | * 'flags' is a bitwise OR of the allowed page types. |
57 | | * |
58 | | * This must be used only to fetch pages that are expected to be valid |
59 | | * already. _hash_checkpage() is applied using the given flags. |
60 | | * |
61 | | * When this routine returns, the appropriate lock is set on the |
62 | | * requested buffer and its reference count has been incremented |
63 | | * (ie, the buffer is "locked and pinned"). |
64 | | * |
65 | | * P_NEW is disallowed because this routine can only be used |
66 | | * to access pages that are known to be before the filesystem EOF. |
67 | | * Extending the index should be done with _hash_getnewbuf. |
68 | | */ |
69 | | Buffer |
70 | | _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags) |
71 | 0 | { |
72 | 0 | Buffer buf; |
73 | |
|
74 | 0 | if (blkno == P_NEW) |
75 | 0 | elog(ERROR, "hash AM does not use P_NEW"); |
76 | | |
77 | 0 | buf = ReadBuffer(rel, blkno); |
78 | |
|
79 | 0 | if (access != HASH_NOLOCK) |
80 | 0 | LockBuffer(buf, access); |
81 | | |
82 | | /* ref count and lock type are correct */ |
83 | |
|
84 | 0 | _hash_checkpage(rel, buf, flags); |
85 | |
|
86 | 0 | return buf; |
87 | 0 | } |
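| | /*
| |  * Editor's sketch, not part of hashpage.c: a minimal illustration of the
| |  * calling convention described above, assuming blkno is already known to
| |  * be a valid primary bucket page. The name read_bucket_page_example is
| |  * hypothetical; every call it makes exists in this module.
| |  */
| | static void
| | read_bucket_page_example(Relation rel, BlockNumber blkno)
| | {
| |     /* pin + share-lock the page; _hash_checkpage verifies the page type */
| |     Buffer          buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
| |     HashPageOpaque  opaque = HashPageGetOpaque(BufferGetPage(buf));
| |
| |     Assert(opaque->hasho_flag & LH_BUCKET_PAGE);
| |
| |     /* ... inspect tuples on the page ... */
| |
| |     _hash_relbuf(rel, buf);     /* release both the lock and the pin */
| | }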
88 | | |
89 | | /* |
90 | | * _hash_getbuf_with_condlock_cleanup() -- Try to get a buffer for cleanup. |
91 | | * |
92 | | * We read the page and try to acquire a cleanup lock. If we get it, |
93 | | * we return the buffer; otherwise, we return InvalidBuffer. |
94 | | */ |
95 | | Buffer |
96 | | _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags) |
97 | 0 | { |
98 | 0 | Buffer buf; |
99 | |
|
100 | 0 | if (blkno == P_NEW) |
101 | 0 | elog(ERROR, "hash AM does not use P_NEW"); |
102 | | |
103 | 0 | buf = ReadBuffer(rel, blkno); |
104 | |
|
105 | 0 | if (!ConditionalLockBufferForCleanup(buf)) |
106 | 0 | { |
107 | 0 | ReleaseBuffer(buf); |
108 | 0 | return InvalidBuffer; |
109 | 0 | } |
110 | | |
111 | | /* ref count and lock type are correct */ |
112 | | |
113 | 0 | _hash_checkpage(rel, buf, flags); |
114 | |
|
115 | 0 | return buf; |
116 | 0 | } |
117 | | |
118 | | /* |
119 | | * _hash_getinitbuf() -- Get and initialize a buffer by block number. |
120 | | * |
121 | | * This must be used only to fetch pages that are known to be before |
122 | | * the index's filesystem EOF, but are to be filled from scratch. |
123 | | * _hash_pageinit() is applied automatically. Otherwise it has |
124 | | * effects similar to _hash_getbuf() with access = HASH_WRITE. |
125 | | * |
126 | | * When this routine returns, a write lock is set on the |
127 | | * requested buffer and its reference count has been incremented |
128 | | * (ie, the buffer is "locked and pinned"). |
129 | | * |
130 | | * P_NEW is disallowed because this routine can only be used |
131 | | * to access pages that are known to be before the filesystem EOF. |
132 | | * Extending the index should be done with _hash_getnewbuf. |
133 | | */ |
134 | | Buffer |
135 | | _hash_getinitbuf(Relation rel, BlockNumber blkno) |
136 | 0 | { |
137 | 0 | Buffer buf; |
138 | |
|
139 | 0 | if (blkno == P_NEW) |
140 | 0 | elog(ERROR, "hash AM does not use P_NEW"); |
141 | | |
142 | 0 | buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK, |
143 | 0 | NULL); |
144 | | |
145 | | /* ref count and lock type are correct */ |
146 | | |
147 | | /* initialize the page */ |
148 | 0 | _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf)); |
149 | |
|
150 | 0 | return buf; |
151 | 0 | } |
152 | | |
153 | | /* |
154 | | * _hash_initbuf() -- Get and initialize a buffer by bucket number. |
155 | | */ |
156 | | void |
157 | | _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket, uint32 flag, |
158 | | bool initpage) |
159 | 0 | { |
160 | 0 | HashPageOpaque pageopaque; |
161 | 0 | Page page; |
162 | |
|
163 | 0 | page = BufferGetPage(buf); |
164 | | |
165 | | /* initialize the page */ |
166 | 0 | if (initpage) |
167 | 0 | _hash_pageinit(page, BufferGetPageSize(buf)); |
168 | |
|
169 | 0 | pageopaque = HashPageGetOpaque(page); |
170 | | |
171 | | /* |
172 | | * Set hasho_prevblkno with current hashm_maxbucket. This value will be |
173 | | * used to validate cached HashMetaPageData. See |
174 | | * _hash_getbucketbuf_from_hashkey(). |
175 | | */ |
176 | 0 | pageopaque->hasho_prevblkno = max_bucket; |
177 | 0 | pageopaque->hasho_nextblkno = InvalidBlockNumber; |
178 | 0 | pageopaque->hasho_bucket = num_bucket; |
179 | 0 | pageopaque->hasho_flag = flag; |
180 | 0 | pageopaque->hasho_page_id = HASHO_PAGE_ID; |
181 | 0 | } |
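| | /*
| |  * Editor's note, not part of hashpage.c: the hasho_prevblkno value stored
| |  * above is consumed by _hash_getbucketbuf_from_hashkey() near the end of
| |  * this file. The cached-metapage check there is, in essence:
| |  *
| |  *    if (opaque->hasho_prevblkno <= metap->hashm_maxbucket)
| |  *        -- cached HashMetaPageData still maps keys to the right bucket
| |  *    else
| |  *        -- refresh: metap = _hash_getcachedmetap(rel, &metabuf, true);
| |  */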
182 | | |
183 | | /* |
184 | | * _hash_getnewbuf() -- Get a new page at the end of the index. |
185 | | * |
186 | | * This has the same API as _hash_getinitbuf, except that we are adding |
187 | | * a page to the index, and hence expect the page to be past the |
188 | | * logical EOF. (However, we have to support the case where it isn't, |
189 | | * since a prior try might have crashed after extending the filesystem |
190 | | * EOF but before updating the metapage to reflect the added page.) |
191 | | * |
192 | | * It is caller's responsibility to ensure that only one process can |
193 | | * extend the index at a time. In practice, this function is called |
194 | | * only while holding write lock on the metapage, because adding a page |
195 | | * is always associated with an update of metapage data. |
196 | | */ |
197 | | Buffer |
198 | | _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum) |
199 | 0 | { |
200 | 0 | BlockNumber nblocks = RelationGetNumberOfBlocksInFork(rel, forkNum); |
201 | 0 | Buffer buf; |
202 | |
|
203 | 0 | if (blkno == P_NEW) |
204 | 0 | elog(ERROR, "hash AM does not use P_NEW"); |
205 | 0 | if (blkno > nblocks) |
206 | 0 | elog(ERROR, "access to noncontiguous page in hash index \"%s\"", |
207 | 0 | RelationGetRelationName(rel)); |
208 | | |
209 | | /* smgr insists we explicitly extend the relation */ |
210 | 0 | if (blkno == nblocks) |
211 | 0 | { |
212 | 0 | buf = ExtendBufferedRel(BMR_REL(rel), forkNum, NULL, |
213 | 0 | EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK); |
214 | 0 | if (BufferGetBlockNumber(buf) != blkno) |
215 | 0 | elog(ERROR, "unexpected hash relation size: %u, should be %u", |
216 | 0 | BufferGetBlockNumber(buf), blkno); |
217 | 0 | } |
218 | 0 | else |
219 | 0 | { |
220 | 0 | buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK, |
221 | 0 | NULL); |
222 | 0 | } |
223 | | |
224 | | /* ref count and lock type are correct */ |
225 | | |
226 | | /* initialize the page */ |
227 | 0 | _hash_pageinit(BufferGetPage(buf), BufferGetPageSize(buf)); |
228 | |
|
229 | 0 | return buf; |
230 | 0 | } |
231 | | |
232 | | /* |
233 | | * _hash_getbuf_with_strategy() -- Get a buffer with nondefault strategy. |
234 | | * |
235 | | * This is identical to _hash_getbuf() but also allows a buffer access |
236 | | * strategy to be specified. We use this for VACUUM operations. |
237 | | */ |
238 | | Buffer |
239 | | _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, |
240 | | int access, int flags, |
241 | | BufferAccessStrategy bstrategy) |
242 | 0 | { |
243 | 0 | Buffer buf; |
244 | |
|
245 | 0 | if (blkno == P_NEW) |
246 | 0 | elog(ERROR, "hash AM does not use P_NEW"); |
247 | | |
248 | 0 | buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, bstrategy); |
249 | |
|
250 | 0 | if (access != HASH_NOLOCK) |
251 | 0 | LockBuffer(buf, access); |
252 | | |
253 | | /* ref count and lock type are correct */ |
254 | |
|
255 | 0 | _hash_checkpage(rel, buf, flags); |
256 | |
|
257 | 0 | return buf; |
258 | 0 | } |
259 | | |
260 | | /* |
261 | | * _hash_relbuf() -- release a locked buffer. |
262 | | * |
263 | | * Lock and pin (refcount) are both dropped. |
264 | | */ |
265 | | void |
266 | | _hash_relbuf(Relation rel, Buffer buf) |
267 | 0 | { |
268 | 0 | UnlockReleaseBuffer(buf); |
269 | 0 | } |
270 | | |
271 | | /* |
272 | | * _hash_dropbuf() -- release an unlocked buffer. |
273 | | * |
274 | | * This is used to unpin a buffer on which we hold no lock. |
275 | | */ |
276 | | void |
277 | | _hash_dropbuf(Relation rel, Buffer buf) |
278 | 0 | { |
279 | 0 | ReleaseBuffer(buf); |
280 | 0 | } |
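| | /*
| |  * Editor's sketch, not part of hashpage.c: how the release primitives used
| |  * in this file divide the work. A common pattern is to keep the pin on a
| |  * primary bucket page while giving up the lock, and drop the pin later:
| |  *
| |  *    LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);  -- keep pin, drop lock
| |  *    ... work that must keep vacuum away from the bucket ...
| |  *    _hash_dropbuf(rel, bucket_buf);              -- finally drop the pin
| |  *
| |  * whereas _hash_relbuf() drops lock and pin in one step.
| |  */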
281 | | |
282 | | /* |
283 | | * _hash_dropscanbuf() -- release buffers used in scan. |
284 | | * |
285 | | * This routine unpins the buffers used during scan on which we |
286 | | * hold no lock. |
287 | | */ |
288 | | void |
289 | | _hash_dropscanbuf(Relation rel, HashScanOpaque so) |
290 | 0 | { |
291 | | /* release pin we hold on primary bucket page */ |
292 | 0 | if (BufferIsValid(so->hashso_bucket_buf) && |
293 | 0 | so->hashso_bucket_buf != so->currPos.buf) |
294 | 0 | _hash_dropbuf(rel, so->hashso_bucket_buf); |
295 | 0 | so->hashso_bucket_buf = InvalidBuffer; |
296 | | |
297 | | /* release pin we hold on primary bucket page of bucket being split */ |
298 | 0 | if (BufferIsValid(so->hashso_split_bucket_buf) && |
299 | 0 | so->hashso_split_bucket_buf != so->currPos.buf) |
300 | 0 | _hash_dropbuf(rel, so->hashso_split_bucket_buf); |
301 | 0 | so->hashso_split_bucket_buf = InvalidBuffer; |
302 | | |
303 | | /* release any pin we still hold */ |
304 | 0 | if (BufferIsValid(so->currPos.buf)) |
305 | 0 | _hash_dropbuf(rel, so->currPos.buf); |
306 | 0 | so->currPos.buf = InvalidBuffer; |
307 | | |
308 | | /* reset split scan */ |
309 | 0 | so->hashso_buc_populated = false; |
310 | 0 | so->hashso_buc_split = false; |
311 | 0 | } |
312 | | |
313 | | |
314 | | /* |
315 | | * _hash_init() -- Initialize the metadata page of a hash index, |
316 | | * the initial buckets, and the initial bitmap page. |
317 | | * |
318 | | * The initial number of buckets is dependent on num_tuples, an estimate |
319 | | * of the number of tuples to be loaded into the index initially. The |
320 | | * chosen number of buckets is returned. |
321 | | * |
322 | | * We are fairly cavalier about locking here, since we know that no one else |
323 | | * could be accessing this index. In particular the rule about not holding |
324 | | * multiple buffer locks is ignored. |
325 | | */ |
326 | | uint32 |
327 | | _hash_init(Relation rel, double num_tuples, ForkNumber forkNum) |
328 | 0 | { |
329 | 0 | Buffer metabuf; |
330 | 0 | Buffer buf; |
331 | 0 | Buffer bitmapbuf; |
332 | 0 | Page pg; |
333 | 0 | HashMetaPage metap; |
334 | 0 | RegProcedure procid; |
335 | 0 | int32 data_width; |
336 | 0 | int32 item_width; |
337 | 0 | int32 ffactor; |
338 | 0 | uint32 num_buckets; |
339 | 0 | uint32 i; |
340 | 0 | bool use_wal; |
341 | | |
342 | | /* safety check */ |
343 | 0 | if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) |
344 | 0 | elog(ERROR, "cannot initialize non-empty hash index \"%s\"", |
345 | 0 | RelationGetRelationName(rel)); |
346 | | |
347 | | /* |
348 | | * WAL log creation of pages if the relation is persistent, or this is the |
349 | | * init fork. Init forks for unlogged relations always need to be WAL |
350 | | * logged. |
351 | | */ |
352 | 0 | use_wal = RelationNeedsWAL(rel) || forkNum == INIT_FORKNUM; |
353 | | |
354 | | /* |
355 | | * Determine the target fill factor (in tuples per bucket) for this index. |
356 | | * The idea is to make the fill factor correspond to pages about as full |
357 | | * as the user-settable fillfactor parameter says. We can compute it |
358 | | * exactly since the index datatype (i.e. uint32 hash key) is fixed-width. |
359 | | */ |
360 | 0 | data_width = sizeof(uint32); |
361 | 0 | item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + |
362 | 0 | sizeof(ItemIdData); /* include the line pointer */ |
363 | 0 | ffactor = HashGetTargetPageUsage(rel) / item_width; |
364 | | /* keep to a sane range */ |
365 | 0 | if (ffactor < 10) |
366 | 0 | ffactor = 10; |
367 | |
|
368 | 0 | procid = index_getprocid(rel, 1, HASHSTANDARD_PROC); |
369 | | |
370 | | /* |
371 | | * We initialize the metapage, the first N bucket pages, and the first |
372 | | * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() |
373 | | * calls to occur. This ensures that the smgr level has the right idea of |
374 | | * the physical index length. |
375 | | * |
376 | | * Critical section not required, because on error the creation of the |
377 | | * whole relation will be rolled back. |
378 | | */ |
379 | 0 | metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); |
380 | 0 | _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false); |
381 | 0 | MarkBufferDirty(metabuf); |
382 | |
|
383 | 0 | pg = BufferGetPage(metabuf); |
384 | 0 | metap = HashPageGetMeta(pg); |
385 | | |
386 | | /* XLOG stuff */ |
387 | 0 | if (use_wal) |
388 | 0 | { |
389 | 0 | xl_hash_init_meta_page xlrec; |
390 | 0 | XLogRecPtr recptr; |
391 | |
|
392 | 0 | xlrec.num_tuples = num_tuples; |
393 | 0 | xlrec.procid = metap->hashm_procid; |
394 | 0 | xlrec.ffactor = metap->hashm_ffactor; |
395 | |
|
396 | 0 | XLogBeginInsert(); |
397 | 0 | XLogRegisterData(&xlrec, SizeOfHashInitMetaPage); |
398 | 0 | XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD); |
399 | |
|
400 | 0 | recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE); |
401 | |
|
402 | 0 | PageSetLSN(BufferGetPage(metabuf), recptr); |
403 | 0 | } |
404 | |
|
405 | 0 | num_buckets = metap->hashm_maxbucket + 1; |
406 | | |
407 | | /* |
408 | | * Release buffer lock on the metapage while we initialize buckets. |
409 | | * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS |
410 | | * won't accomplish anything. It's a bad idea to hold buffer locks for |
411 | | * long intervals in any case, since that can block the bgwriter. |
412 | | */ |
413 | 0 | LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); |
414 | | |
415 | | /* |
416 | | * Initialize and WAL Log the first N buckets |
417 | | */ |
418 | 0 | for (i = 0; i < num_buckets; i++) |
419 | 0 | { |
420 | 0 | BlockNumber blkno; |
421 | | |
422 | | /* Allow interrupts, in case N is huge */ |
423 | 0 | CHECK_FOR_INTERRUPTS(); |
424 | |
|
425 | 0 | blkno = BUCKET_TO_BLKNO(metap, i); |
426 | 0 | buf = _hash_getnewbuf(rel, blkno, forkNum); |
427 | 0 | _hash_initbuf(buf, metap->hashm_maxbucket, i, LH_BUCKET_PAGE, false); |
428 | 0 | MarkBufferDirty(buf); |
429 | |
|
430 | 0 | if (use_wal) |
431 | 0 | log_newpage(&rel->rd_locator, |
432 | 0 | forkNum, |
433 | 0 | blkno, |
434 | 0 | BufferGetPage(buf), |
435 | 0 | true); |
436 | 0 | _hash_relbuf(rel, buf); |
437 | 0 | } |
438 | | |
439 | | /* Now reacquire buffer lock on metapage */ |
440 | 0 | LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); |
441 | | |
442 | | /* |
443 | | * Initialize bitmap page |
444 | | */ |
445 | 0 | bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum); |
446 | 0 | _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false); |
447 | 0 | MarkBufferDirty(bitmapbuf); |
448 | | |
449 | | /* add the new bitmap page to the metapage's list of bitmaps */ |
450 | | /* metapage already has a write lock */ |
451 | 0 | if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) |
452 | 0 | ereport(ERROR, |
453 | 0 | (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), |
454 | 0 | errmsg("out of overflow pages in hash index \"%s\"", |
455 | 0 | RelationGetRelationName(rel)))); |
456 | | |
457 | 0 | metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; |
458 | |
|
459 | 0 | metap->hashm_nmaps++; |
460 | 0 | MarkBufferDirty(metabuf); |
461 | | |
462 | | /* XLOG stuff */ |
463 | 0 | if (use_wal) |
464 | 0 | { |
465 | 0 | xl_hash_init_bitmap_page xlrec; |
466 | 0 | XLogRecPtr recptr; |
467 | |
|
468 | 0 | xlrec.bmsize = metap->hashm_bmsize; |
469 | |
|
470 | 0 | XLogBeginInsert(); |
471 | 0 | XLogRegisterData(&xlrec, SizeOfHashInitBitmapPage); |
472 | 0 | XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT); |
473 | | |
474 | | /* |
475 | | * This is safe only because nobody else can be modifying the index at |
476 | | * this stage; it's only visible to the transaction that is creating |
477 | | * it. |
478 | | */ |
479 | 0 | XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); |
480 | |
|
481 | 0 | recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE); |
482 | |
|
483 | 0 | PageSetLSN(BufferGetPage(bitmapbuf), recptr); |
484 | 0 | PageSetLSN(BufferGetPage(metabuf), recptr); |
485 | 0 | } |
486 | | |
487 | | /* all done */ |
488 | 0 | _hash_relbuf(rel, bitmapbuf); |
489 | 0 | _hash_relbuf(rel, metabuf); |
490 | |
|
491 | 0 | return num_buckets; |
492 | 0 | } |
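| | /*
| |  * Editor's sketch, not part of hashpage.c: roughly how the build paths in
| |  * hash.c are expected to call this (paraphrased from memory, so treat the
| |  * exact call sites as an assumption):
| |  *
| |  *    num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);   -- hashbuild()
| |  *    _hash_init(index, 0, INIT_FORKNUM);                         -- hashbuildempty()
| |  *
| |  * i.e. the tuple estimate drives the initial bucket count, and the
| |  * INIT_FORKNUM call relies on the "init forks are always WAL-logged" rule
| |  * noted above.
| |  */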
493 | | |
494 | | /* |
495 | | * _hash_init_metabuffer() -- Initialize the metadata page of a hash index. |
496 | | */ |
497 | | void |
498 | | _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid, |
499 | | uint16 ffactor, bool initpage) |
500 | 0 | { |
501 | 0 | HashMetaPage metap; |
502 | 0 | HashPageOpaque pageopaque; |
503 | 0 | Page page; |
504 | 0 | double dnumbuckets; |
505 | 0 | uint32 num_buckets; |
506 | 0 | uint32 spare_index; |
507 | 0 | uint32 lshift; |
508 | | |
509 | | /* |
510 | | * Choose the number of initial bucket pages to match the fill factor |
511 | | * given the estimated number of tuples. We round the result up to the |
512 | | * total number of buckets that must be allocated before the corresponding |
513 | | * hashm_spares element can be used. However, always force at least 2 |
514 | | * bucket pages. The upper limit is determined by considerations explained in |
515 | | * _hash_expandtable(). |
516 | | */ |
517 | 0 | dnumbuckets = num_tuples / ffactor; |
518 | 0 | if (dnumbuckets <= 2.0) |
519 | 0 | num_buckets = 2; |
520 | 0 | else if (dnumbuckets >= (double) 0x40000000) |
521 | 0 | num_buckets = 0x40000000; |
522 | 0 | else |
523 | 0 | num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets)); |
524 | |
|
525 | 0 | spare_index = _hash_spareindex(num_buckets); |
526 | 0 | Assert(spare_index < HASH_MAX_SPLITPOINTS); |
527 | |
|
528 | 0 | page = BufferGetPage(buf); |
529 | 0 | if (initpage) |
530 | 0 | _hash_pageinit(page, BufferGetPageSize(buf)); |
531 | |
|
532 | 0 | pageopaque = HashPageGetOpaque(page); |
533 | 0 | pageopaque->hasho_prevblkno = InvalidBlockNumber; |
534 | 0 | pageopaque->hasho_nextblkno = InvalidBlockNumber; |
535 | 0 | pageopaque->hasho_bucket = InvalidBucket; |
536 | 0 | pageopaque->hasho_flag = LH_META_PAGE; |
537 | 0 | pageopaque->hasho_page_id = HASHO_PAGE_ID; |
538 | |
|
539 | 0 | metap = HashPageGetMeta(page); |
540 | |
|
541 | 0 | metap->hashm_magic = HASH_MAGIC; |
542 | 0 | metap->hashm_version = HASH_VERSION; |
543 | 0 | metap->hashm_ntuples = 0; |
544 | 0 | metap->hashm_nmaps = 0; |
545 | 0 | metap->hashm_ffactor = ffactor; |
546 | 0 | metap->hashm_bsize = HashGetMaxBitmapSize(page); |
547 | | |
548 | | /* find largest bitmap array size that will fit in page size */ |
549 | 0 | lshift = pg_leftmost_one_pos32(metap->hashm_bsize); |
550 | 0 | Assert(lshift > 0); |
551 | 0 | metap->hashm_bmsize = 1 << lshift; |
552 | 0 | metap->hashm_bmshift = lshift + BYTE_TO_BIT; |
553 | 0 | Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1)); |
554 | | |
555 | | /* |
556 | | * Label the index with its primary hash support function's OID. This is |
557 | | * pretty useless for normal operation (in fact, hashm_procid is not used |
558 | | * anywhere), but it might be handy for forensic purposes so we keep it. |
559 | | */ |
560 | 0 | metap->hashm_procid = procid; |
561 | | |
562 | | /* |
563 | | * We initialize the index with N buckets, 0 .. N-1, occupying physical |
564 | | * blocks 1 to N. The first freespace bitmap page is in block N+1. |
565 | | */ |
566 | 0 | metap->hashm_maxbucket = num_buckets - 1; |
567 | | |
568 | | /* |
569 | | * Set highmask to the next ((2 ^ x) - 1) value, which should be |
570 | | * sufficient to cover num_buckets. |
571 | | */ |
572 | 0 | metap->hashm_highmask = pg_nextpower2_32(num_buckets + 1) - 1; |
573 | 0 | metap->hashm_lowmask = (metap->hashm_highmask >> 1); |
574 | |
|
575 | 0 | MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares)); |
576 | 0 | MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); |
577 | | |
578 | | /* Set up mapping for one spare page after the initial splitpoints */ |
579 | 0 | metap->hashm_spares[spare_index] = 1; |
580 | 0 | metap->hashm_ovflpoint = spare_index; |
581 | 0 | metap->hashm_firstfree = 0; |
582 | | |
583 | | /* |
584 | | * Set pd_lower just past the end of the metadata. This is essential, |
585 | | * because without doing so, metadata will be lost if xlog.c compresses |
586 | | * the page. |
587 | | */ |
588 | 0 | ((PageHeader) page)->pd_lower = |
589 | 0 | ((char *) metap + sizeof(HashMetaPageData)) - (char *) page; |
590 | 0 | } |
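| | /*
| |  * Editor's worked example, not part of hashpage.c: with num_buckets = 4 the
| |  * code above yields hashm_maxbucket = 3, hashm_highmask =
| |  * pg_nextpower2_32(5) - 1 = 7, and hashm_lowmask = 3. The bucket mapping in
| |  * _hash_hashkey2bucket() (hashutil.c, paraphrased) then works like this:
| |  *
| |  *    bucket = hashkey & highmask;      -- e.g. hashkey 13: 13 & 7 = 5
| |  *    if (bucket > maxbucket)           -- bucket 5 doesn't exist yet
| |  *        bucket = bucket & lowmask;    -- 5 & 3 = 1, its "ancestor" bucket
| |  *
| |  * so keys destined for a not-yet-created bucket land in the bucket it will
| |  * later be split off from.
| |  */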
591 | | |
592 | | /* |
593 | | * _hash_pageinit() -- Initialize a new hash index page. |
594 | | */ |
595 | | void |
596 | | _hash_pageinit(Page page, Size size) |
597 | 0 | { |
598 | 0 | PageInit(page, size, sizeof(HashPageOpaqueData)); |
599 | 0 | } |
600 | | |
601 | | /* |
602 | | * Attempt to expand the hash table by creating one new bucket. |
603 | | * |
604 | | * This will silently do nothing if we don't get cleanup lock on old or |
605 | | * new bucket. |
606 | | * |
607 | | * Complete the pending splits and remove the tuples from old bucket, |
608 | | * if there are any left over from the previous split. |
609 | | * |
610 | | * The caller must hold a pin, but no lock, on the metapage buffer. |
611 | | * The buffer is returned in the same state. |
612 | | */ |
613 | | void |
614 | | _hash_expandtable(Relation rel, Buffer metabuf) |
615 | 0 | { |
616 | 0 | HashMetaPage metap; |
617 | 0 | Bucket old_bucket; |
618 | 0 | Bucket new_bucket; |
619 | 0 | uint32 spare_ndx; |
620 | 0 | BlockNumber start_oblkno; |
621 | 0 | BlockNumber start_nblkno; |
622 | 0 | Buffer buf_nblkno; |
623 | 0 | Buffer buf_oblkno; |
624 | 0 | Page opage; |
625 | 0 | Page npage; |
626 | 0 | HashPageOpaque oopaque; |
627 | 0 | HashPageOpaque nopaque; |
628 | 0 | uint32 maxbucket; |
629 | 0 | uint32 highmask; |
630 | 0 | uint32 lowmask; |
631 | 0 | bool metap_update_masks = false; |
632 | 0 | bool metap_update_splitpoint = false; |
633 | |
|
634 | 0 | restart_expand: |
635 | | |
636 | | /* |
637 | | * Write-lock the meta page. It used to be necessary to acquire a |
638 | | * heavyweight lock to begin a split, but that is no longer required. |
639 | | */ |
640 | 0 | LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); |
641 | |
|
642 | 0 | _hash_checkpage(rel, metabuf, LH_META_PAGE); |
643 | 0 | metap = HashPageGetMeta(BufferGetPage(metabuf)); |
644 | | |
645 | | /* |
646 | | * Check to see if split is still needed; someone else might have already |
647 | | * done one while we waited for the lock. |
648 | | * |
649 | | * Make sure this stays in sync with _hash_doinsert() |
650 | | */ |
651 | 0 | if (metap->hashm_ntuples <= |
652 | 0 | (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1)) |
653 | 0 | goto fail; |
654 | | |
655 | | /* |
656 | | * Can't split anymore if maxbucket has reached its maximum possible |
657 | | * value. |
658 | | * |
659 | | * Ideally we'd allow bucket numbers up to UINT_MAX-1 (no higher because |
660 | | * the calculation maxbucket+1 mustn't overflow). Currently we restrict |
661 | | * to half that to prevent failure of pg_ceil_log2_32() and insufficient |
662 | | * space in hashm_spares[]. It's moot anyway because an index with 2^32 |
663 | | * buckets would certainly overflow BlockNumber and hence |
664 | | * _hash_alloc_buckets() would fail, but if we supported buckets smaller |
665 | | * than a disk block then this would be an independent constraint. |
666 | | * |
667 | | * If you change this, see also the maximum initial number of buckets in |
668 | | * _hash_init(). |
669 | | */ |
670 | 0 | if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE) |
671 | 0 | goto fail; |
672 | | |
673 | | /* |
674 | | * Determine which bucket is to be split, and attempt to take cleanup lock |
675 | | * on the old bucket. If we can't get the lock, give up. |
676 | | * |
677 | | * The cleanup lock protects us not only against other backends, but |
678 | | * against our own backend as well. |
679 | | * |
680 | | * The cleanup lock is mainly to protect the split from concurrent |
681 | | * inserts. See src/backend/access/hash/README, Lock Definitions for |
682 | | * further details. Due to this locking restriction, if there is any |
683 | | * pending scan, the split will give up, which is not good but harmless. |
684 | | */ |
685 | 0 | new_bucket = metap->hashm_maxbucket + 1; |
686 | |
|
687 | 0 | old_bucket = (new_bucket & metap->hashm_lowmask); |
688 | |
|
689 | 0 | start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket); |
690 | |
|
691 | 0 | buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE); |
692 | 0 | if (!buf_oblkno) |
693 | 0 | goto fail; |
694 | | |
695 | 0 | opage = BufferGetPage(buf_oblkno); |
696 | 0 | oopaque = HashPageGetOpaque(opage); |
697 | | |
698 | | /* |
699 | | * We want to finish the pending split from this bucket: there is no |
700 | | * apparent benefit to deferring it, and handling splits that involve |
701 | | * multiple buckets (considering that the new split might fail too) would |
702 | | * complicate the code. We don't need to consider the new bucket for |
703 | | * completing the split here, since a re-split of the new bucket cannot |
704 | | * start while a split from the old bucket is still pending. |
705 | | */ |
706 | 0 | if (H_BUCKET_BEING_SPLIT(oopaque)) |
707 | 0 | { |
708 | | /* |
709 | | * Copy bucket mapping info now; refer the comment in code below where |
710 | | * we copy this information before calling _hash_splitbucket to see |
711 | | * why this is okay. |
712 | | */ |
713 | 0 | maxbucket = metap->hashm_maxbucket; |
714 | 0 | highmask = metap->hashm_highmask; |
715 | 0 | lowmask = metap->hashm_lowmask; |
716 | | |
717 | | /* |
718 | | * Release the lock on metapage and old_bucket, before completing the |
719 | | * split. |
720 | | */ |
721 | 0 | LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); |
722 | 0 | LockBuffer(buf_oblkno, BUFFER_LOCK_UNLOCK); |
723 | |
|
724 | 0 | _hash_finish_split(rel, metabuf, buf_oblkno, old_bucket, maxbucket, |
725 | 0 | highmask, lowmask); |
726 | | |
727 | | /* release the pin on old buffer and retry for expand. */ |
728 | 0 | _hash_dropbuf(rel, buf_oblkno); |
729 | |
|
730 | 0 | goto restart_expand; |
731 | 0 | } |
732 | | |
733 | | /* |
734 | | * Clean up the tuples left over from the previous split. This operation |
735 | | * requires a cleanup lock, and we already hold one on the old bucket, so |
736 | | * do it now. We also don't allow further splits from the bucket until |
737 | | * the garbage from its previous split has been cleaned up. This has two |
738 | | * advantages: first, it avoids bloat due to leftover garbage; second, |
739 | | * during cleanup of a bucket we can be sure that any garbage tuples |
740 | | * belong to the most recently split bucket. If instead we allowed |
741 | | * cleanup of a bucket after the metapage had been updated to reflect a |
742 | | * new split but before the split was actually performed, the cleanup |
743 | | * operation could not tell whether a tuple had already been moved to the |
744 | | * newly created bucket, and could end up deleting such tuples. |
745 | | */ |
746 | 0 | if (H_NEEDS_SPLIT_CLEANUP(oopaque)) |
747 | 0 | { |
748 | | /* |
749 | | * Copy bucket mapping info now; refer to the comment in code below |
750 | | * where we copy this information before calling _hash_splitbucket to |
751 | | * see why this is okay. |
752 | | */ |
753 | 0 | maxbucket = metap->hashm_maxbucket; |
754 | 0 | highmask = metap->hashm_highmask; |
755 | 0 | lowmask = metap->hashm_lowmask; |
756 | | |
757 | | /* Release the metapage lock. */ |
758 | 0 | LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); |
759 | |
|
760 | 0 | hashbucketcleanup(rel, old_bucket, buf_oblkno, start_oblkno, NULL, |
761 | 0 | maxbucket, highmask, lowmask, NULL, NULL, true, |
762 | 0 | NULL, NULL); |
763 | |
|
764 | 0 | _hash_dropbuf(rel, buf_oblkno); |
765 | |
|
766 | 0 | goto restart_expand; |
767 | 0 | } |
768 | | |
769 | | /* |
770 | | * There shouldn't be any active scan on new bucket. |
771 | | * |
772 | | * Note: it is safe to compute the new bucket's blkno here, even though we |
773 | | * may still need to update the BUCKET_TO_BLKNO mapping. This is because |
774 | | * the current value of hashm_spares[hashm_ovflpoint] correctly shows |
775 | | * where we are going to put a new splitpoint's worth of buckets. |
776 | | */ |
777 | 0 | start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket); |
778 | | |
779 | | /* |
780 | | * If the split point is increasing we need to allocate a new batch of |
781 | | * bucket pages. |
782 | | */ |
783 | 0 | spare_ndx = _hash_spareindex(new_bucket + 1); |
784 | 0 | if (spare_ndx > metap->hashm_ovflpoint) |
785 | 0 | { |
786 | 0 | uint32 buckets_to_add; |
787 | |
|
788 | 0 | Assert(spare_ndx == metap->hashm_ovflpoint + 1); |
789 | | |
790 | | /* |
791 | | * We treat allocation of buckets as a separate WAL-logged action. |
792 | | * Even if we fail after this operation, we won't leak bucket pages; |
793 | | * rather, the next split will consume this space. In any case, even |
794 | | * without failure we don't use all the space in one split operation. |
795 | | */ |
796 | 0 | buckets_to_add = _hash_get_totalbuckets(spare_ndx) - new_bucket; |
797 | 0 | if (!_hash_alloc_buckets(rel, start_nblkno, buckets_to_add)) |
798 | 0 | { |
799 | | /* can't split due to BlockNumber overflow */ |
800 | 0 | _hash_relbuf(rel, buf_oblkno); |
801 | 0 | goto fail; |
802 | 0 | } |
803 | 0 | } |
804 | | |
805 | | /* |
806 | | * Physically allocate the new bucket's primary page. We want to do this |
807 | | * before changing the metapage's mapping info, in case we can't get the |
808 | | * disk space. |
809 | | * |
810 | | * XXX It doesn't make sense to call _hash_getnewbuf first, zeroing the |
811 | | * buffer, and then only afterwards check whether we have a cleanup lock. |
812 | | * However, since no scan can be accessing the buffer yet, any concurrent |
813 | | * accesses will just be from processes like the bgwriter or checkpointer |
814 | | * which don't care about its contents, so it doesn't really matter. |
815 | | */ |
816 | 0 | buf_nblkno = _hash_getnewbuf(rel, start_nblkno, MAIN_FORKNUM); |
817 | 0 | if (!IsBufferCleanupOK(buf_nblkno)) |
818 | 0 | { |
819 | 0 | _hash_relbuf(rel, buf_oblkno); |
820 | 0 | _hash_relbuf(rel, buf_nblkno); |
821 | 0 | goto fail; |
822 | 0 | } |
823 | | |
824 | | /* |
825 | | * Since we are scribbling on the pages in the shared buffers, establish a |
826 | | * critical section. Any failure in this next code leaves us with a big |
827 | | * problem: the metapage is effectively corrupt but could get written back |
828 | | * to disk. |
829 | | */ |
830 | 0 | START_CRIT_SECTION(); |
831 | | |
832 | | /* |
833 | | * Okay to proceed with split. Update the metapage bucket mapping info. |
834 | | */ |
835 | 0 | metap->hashm_maxbucket = new_bucket; |
836 | |
|
837 | 0 | if (new_bucket > metap->hashm_highmask) |
838 | 0 | { |
839 | | /* Starting a new doubling */ |
840 | 0 | metap->hashm_lowmask = metap->hashm_highmask; |
841 | 0 | metap->hashm_highmask = new_bucket | metap->hashm_lowmask; |
842 | 0 | metap_update_masks = true; |
843 | 0 | } |
844 | | |
845 | | /* |
846 | | * If the split point is increasing we need to adjust the hashm_spares[] |
847 | | * array and hashm_ovflpoint so that future overflow pages will be created |
848 | | * beyond this new batch of bucket pages. |
849 | | */ |
850 | 0 | if (spare_ndx > metap->hashm_ovflpoint) |
851 | 0 | { |
852 | 0 | metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint]; |
853 | 0 | metap->hashm_ovflpoint = spare_ndx; |
854 | 0 | metap_update_splitpoint = true; |
855 | 0 | } |
856 | |
|
857 | 0 | MarkBufferDirty(metabuf); |
858 | | |
859 | | /* |
860 | | * Copy bucket mapping info now; this saves re-accessing the meta page |
861 | | * inside _hash_splitbucket's inner loop. Note that once we drop the |
862 | | * split lock, other splits could begin, so these values might be out of |
863 | | * date before _hash_splitbucket finishes. That's okay, since all it |
864 | | * needs is to tell which of these two buckets to map hashkeys into. |
865 | | */ |
866 | 0 | maxbucket = metap->hashm_maxbucket; |
867 | 0 | highmask = metap->hashm_highmask; |
868 | 0 | lowmask = metap->hashm_lowmask; |
869 | |
|
870 | 0 | opage = BufferGetPage(buf_oblkno); |
871 | 0 | oopaque = HashPageGetOpaque(opage); |
872 | | |
873 | | /* |
874 | | * Mark the old bucket to indicate that split is in progress. (At |
875 | | * operation end, we will clear the split-in-progress flag.) Also, for a |
876 | | * primary bucket page, hasho_prevblkno stores the number of buckets that |
877 | | * existed as of the last split, so we must update that value here. |
878 | | */ |
879 | 0 | oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT; |
880 | 0 | oopaque->hasho_prevblkno = maxbucket; |
881 | |
|
882 | 0 | MarkBufferDirty(buf_oblkno); |
883 | |
|
884 | 0 | npage = BufferGetPage(buf_nblkno); |
885 | | |
886 | | /* |
887 | | * initialize the new bucket's primary page and mark it to indicate that |
888 | | * split is in progress. |
889 | | */ |
890 | 0 | nopaque = HashPageGetOpaque(npage); |
891 | 0 | nopaque->hasho_prevblkno = maxbucket; |
892 | 0 | nopaque->hasho_nextblkno = InvalidBlockNumber; |
893 | 0 | nopaque->hasho_bucket = new_bucket; |
894 | 0 | nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED; |
895 | 0 | nopaque->hasho_page_id = HASHO_PAGE_ID; |
896 | |
|
897 | 0 | MarkBufferDirty(buf_nblkno); |
898 | | |
899 | | /* XLOG stuff */ |
900 | 0 | if (RelationNeedsWAL(rel)) |
901 | 0 | { |
902 | 0 | xl_hash_split_allocate_page xlrec; |
903 | 0 | XLogRecPtr recptr; |
904 | |
|
905 | 0 | xlrec.new_bucket = maxbucket; |
906 | 0 | xlrec.old_bucket_flag = oopaque->hasho_flag; |
907 | 0 | xlrec.new_bucket_flag = nopaque->hasho_flag; |
908 | 0 | xlrec.flags = 0; |
909 | |
|
910 | 0 | XLogBeginInsert(); |
911 | |
|
912 | 0 | XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD); |
913 | 0 | XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT); |
914 | 0 | XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD); |
915 | |
|
916 | 0 | if (metap_update_masks) |
917 | 0 | { |
918 | 0 | xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS; |
919 | 0 | XLogRegisterBufData(2, &metap->hashm_lowmask, sizeof(uint32)); |
920 | 0 | XLogRegisterBufData(2, &metap->hashm_highmask, sizeof(uint32)); |
921 | 0 | } |
922 | |
|
923 | 0 | if (metap_update_splitpoint) |
924 | 0 | { |
925 | 0 | xlrec.flags |= XLH_SPLIT_META_UPDATE_SPLITPOINT; |
926 | 0 | XLogRegisterBufData(2, &metap->hashm_ovflpoint, |
927 | 0 | sizeof(uint32)); |
928 | 0 | XLogRegisterBufData(2, |
929 | 0 | &metap->hashm_spares[metap->hashm_ovflpoint], |
930 | 0 | sizeof(uint32)); |
931 | 0 | } |
932 | |
|
933 | 0 | XLogRegisterData(&xlrec, SizeOfHashSplitAllocPage); |
934 | |
|
935 | 0 | recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE); |
936 | |
|
937 | 0 | PageSetLSN(BufferGetPage(buf_oblkno), recptr); |
938 | 0 | PageSetLSN(BufferGetPage(buf_nblkno), recptr); |
939 | 0 | PageSetLSN(BufferGetPage(metabuf), recptr); |
940 | 0 | } |
941 | |
|
942 | 0 | END_CRIT_SECTION(); |
943 | | |
944 | | /* drop lock, but keep pin */ |
945 | 0 | LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); |
946 | | |
947 | | /* Relocate records to the new bucket */ |
948 | 0 | _hash_splitbucket(rel, metabuf, |
949 | 0 | old_bucket, new_bucket, |
950 | 0 | buf_oblkno, buf_nblkno, NULL, |
951 | 0 | maxbucket, highmask, lowmask); |
952 | | |
953 | | /* all done, now release the pins on primary buckets. */ |
954 | 0 | _hash_dropbuf(rel, buf_oblkno); |
955 | 0 | _hash_dropbuf(rel, buf_nblkno); |
956 | |
|
957 | 0 | return; |
958 | | |
959 | | /* Here if decide not to split or fail to acquire old bucket lock */ |
960 | 0 | fail: |
961 | | |
962 | | /* We didn't write the metapage, so just drop lock */ |
963 | 0 | LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); |
964 | 0 | } |
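| | /*
| |  * Editor's sketch, not part of hashpage.c: the caller side of the "keep in
| |  * sync with _hash_doinsert()" note above. After bumping hashm_ntuples,
| |  * _hash_doinsert() is expected to do roughly the following while holding a
| |  * pin (but no lock) on the metapage buffer:
| |  *
| |  *    if (metap->hashm_ntuples >
| |  *        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
| |  *        _hash_expandtable(rel, metabuf);  -- silently a no-op if it can't split
| |  */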
965 | | |
966 | | |
967 | | /* |
968 | | * _hash_alloc_buckets -- allocate a new splitpoint's worth of bucket pages |
969 | | * |
970 | | * This does not need to initialize the new bucket pages; we'll do that as |
971 | | * each one is used by _hash_expandtable(). But we have to extend the logical |
972 | | * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in |
973 | | * sync with ours, so that we don't get complaints from smgr. |
974 | | * |
975 | | * We do this by writing a page of zeroes at the end of the splitpoint range. |
976 | | * We expect that the filesystem will ensure that the intervening pages read |
977 | | * as zeroes too. On many filesystems this "hole" will not be allocated |
978 | | * immediately, which means that the index file may end up more fragmented |
979 | | * than if we forced it all to be allocated now; but since we don't scan |
980 | | * hash indexes sequentially anyway, that probably doesn't matter. |
981 | | * |
982 | | * XXX It's annoying that this code is executed with the metapage lock held. |
983 | | * We need to interlock against _hash_addovflpage() adding a new overflow page |
984 | | * concurrently, but it'd likely be better to use LockRelationForExtension |
985 | | * for the purpose. OTOH, adding a splitpoint is a very infrequent operation, |
986 | | * so it may not be worth worrying about. |
987 | | * |
988 | | * Returns true if successful, or false if allocation failed due to |
989 | | * BlockNumber overflow. |
990 | | */ |
991 | | static bool |
992 | | _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) |
993 | 0 | { |
994 | 0 | BlockNumber lastblock; |
995 | 0 | PGIOAlignedBlock zerobuf; |
996 | 0 | Page page; |
997 | 0 | HashPageOpaque ovflopaque; |
998 | |
|
999 | 0 | lastblock = firstblock + nblocks - 1; |
1000 | | |
1001 | | /* |
1002 | | * Check for overflow in block number calculation; if so, we cannot extend |
1003 | | * the index anymore. |
1004 | | */ |
1005 | 0 | if (lastblock < firstblock || lastblock == InvalidBlockNumber) |
1006 | 0 | return false; |
1007 | | |
1008 | 0 | page = (Page) zerobuf.data; |
1009 | | |
1010 | | /* |
1011 | | * Initialize the page. Just zeroing the page won't work; see |
1012 | | * _hash_freeovflpage for similar usage. We take care to make the special |
1013 | | * space valid for the benefit of tools such as pageinspect. |
1014 | | */ |
1015 | 0 | _hash_pageinit(page, BLCKSZ); |
1016 | |
|
1017 | 0 | ovflopaque = HashPageGetOpaque(page); |
1018 | |
|
1019 | 0 | ovflopaque->hasho_prevblkno = InvalidBlockNumber; |
1020 | 0 | ovflopaque->hasho_nextblkno = InvalidBlockNumber; |
1021 | 0 | ovflopaque->hasho_bucket = InvalidBucket; |
1022 | 0 | ovflopaque->hasho_flag = LH_UNUSED_PAGE; |
1023 | 0 | ovflopaque->hasho_page_id = HASHO_PAGE_ID; |
1024 | |
|
1025 | 0 | if (RelationNeedsWAL(rel)) |
1026 | 0 | log_newpage(&rel->rd_locator, |
1027 | 0 | MAIN_FORKNUM, |
1028 | 0 | lastblock, |
1029 | 0 | zerobuf.data, |
1030 | 0 | true); |
1031 | |
|
1032 | 0 | PageSetChecksumInplace(page, lastblock); |
1033 | 0 | smgrextend(RelationGetSmgr(rel), MAIN_FORKNUM, lastblock, zerobuf.data, |
1034 | 0 | false); |
1035 | |
|
1036 | 0 | return true; |
1037 | 0 | } |
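| | /*
| |  * Editor's worked example, not part of hashpage.c: BlockNumber is an
| |  * unsigned 32-bit type and 0xFFFFFFFF is InvalidBlockNumber, so with
| |  * firstblock = 0xFFFFFFF0 and nblocks = 0x20 the sum wraps:
| |  *
| |  *    lastblock = 0xFFFFFFF0 + 0x20 - 1 = 0x0000000F (mod 2^32) < firstblock
| |  *
| |  * and the check above returns false, refusing the split rather than
| |  * scribbling on low-numbered blocks.
| |  */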
1038 | | |
1039 | | |
1040 | | /* |
1041 | | * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' |
1042 | | * |
1043 | | * This routine partitions the tuples between the old and new buckets, and |
1044 | | * it is also used to finish an incomplete split operation. To finish a |
1045 | | * previously interrupted split, the caller must fill htab. If htab is |
1046 | | * set, we skip moving any tuple whose TID is present in htab; a NULL htab |
1047 | | * means that all tuples belonging to the new bucket are to be moved |
1048 | | * there. |
1049 | | * |
1050 | | * We are splitting a bucket that consists of a base bucket page and zero |
1051 | | * or more overflow (bucket chain) pages. We must relocate tuples that |
1052 | | * belong in the new bucket. |
1053 | | * |
1054 | | * The caller must hold cleanup locks on both buckets to ensure that |
1055 | | * no one else is trying to access them (see README). |
1056 | | * |
1057 | | * The caller must hold a pin, but no lock, on the metapage buffer. |
1058 | | * The buffer is returned in the same state. (The metapage is only |
1059 | | * touched if it becomes necessary to add or remove overflow pages.) |
1060 | | * |
1061 | | * Split needs to retain pin on primary bucket pages of both old and new |
1062 | | * buckets till end of operation. This is to prevent vacuum from starting |
1063 | | * while a split is in progress. |
1064 | | * |
1065 | | * In addition, the caller must have created the new bucket's base page, |
1066 | | * which is passed in buffer nbuf, pinned and write-locked. The lock will be |
1067 | | * released here, and the pin must be released by the caller. (The API is set |
1068 | | * this way because we must do _hash_getnewbuf() before releasing the metapage |
1069 | | * write lock. So instead of passing the new bucket's start block number, we |
1070 | | * pass an actual buffer.) |
1071 | | */ |
1072 | | static void |
1073 | | _hash_splitbucket(Relation rel, |
1074 | | Buffer metabuf, |
1075 | | Bucket obucket, |
1076 | | Bucket nbucket, |
1077 | | Buffer obuf, |
1078 | | Buffer nbuf, |
1079 | | HTAB *htab, |
1080 | | uint32 maxbucket, |
1081 | | uint32 highmask, |
1082 | | uint32 lowmask) |
1083 | 0 | { |
1084 | 0 | Buffer bucket_obuf; |
1085 | 0 | Buffer bucket_nbuf; |
1086 | 0 | Page opage; |
1087 | 0 | Page npage; |
1088 | 0 | HashPageOpaque oopaque; |
1089 | 0 | HashPageOpaque nopaque; |
1090 | 0 | OffsetNumber itup_offsets[MaxIndexTuplesPerPage]; |
1091 | 0 | IndexTuple itups[MaxIndexTuplesPerPage]; |
1092 | 0 | Size all_tups_size = 0; |
1093 | 0 | int i; |
1094 | 0 | uint16 nitups = 0; |
1095 | |
|
1096 | 0 | bucket_obuf = obuf; |
1097 | 0 | opage = BufferGetPage(obuf); |
1098 | 0 | oopaque = HashPageGetOpaque(opage); |
1099 | |
|
1100 | 0 | bucket_nbuf = nbuf; |
1101 | 0 | npage = BufferGetPage(nbuf); |
1102 | 0 | nopaque = HashPageGetOpaque(npage); |
1103 | | |
1104 | | /* Copy the predicate locks from old bucket to new bucket. */ |
1105 | 0 | PredicateLockPageSplit(rel, |
1106 | 0 | BufferGetBlockNumber(bucket_obuf), |
1107 | 0 | BufferGetBlockNumber(bucket_nbuf)); |
1108 | | |
1109 | | /* |
1110 | | * Partition the tuples in the old bucket between the old bucket and the |
1111 | | * new bucket, advancing along the old bucket's overflow bucket chain and |
1112 | | * adding overflow pages to the new bucket as needed. Outer loop iterates |
1113 | | * once per page in old bucket. |
1114 | | */ |
1115 | 0 | for (;;) |
1116 | 0 | { |
1117 | 0 | BlockNumber oblkno; |
1118 | 0 | OffsetNumber ooffnum; |
1119 | 0 | OffsetNumber omaxoffnum; |
1120 | | |
1121 | | /* Scan each tuple in old page */ |
1122 | 0 | omaxoffnum = PageGetMaxOffsetNumber(opage); |
1123 | 0 | for (ooffnum = FirstOffsetNumber; |
1124 | 0 | ooffnum <= omaxoffnum; |
1125 | 0 | ooffnum = OffsetNumberNext(ooffnum)) |
1126 | 0 | { |
1127 | 0 | IndexTuple itup; |
1128 | 0 | Size itemsz; |
1129 | 0 | Bucket bucket; |
1130 | 0 | bool found = false; |
1131 | | |
1132 | | /* skip dead tuples */ |
1133 | 0 | if (ItemIdIsDead(PageGetItemId(opage, ooffnum))) |
1134 | 0 | continue; |
1135 | | |
1136 | | /* |
1137 | | * Before inserting a tuple, probe the hash table containing TIDs |
1138 | | * of tuples belonging to new bucket, if we find a match, then |
1139 | | * skip that tuple, else fetch the item's hash key (conveniently |
1140 | | * stored in the item) and determine which bucket it now belongs |
1141 | | * in. |
1142 | | */ |
1143 | 0 | itup = (IndexTuple) PageGetItem(opage, |
1144 | 0 | PageGetItemId(opage, ooffnum)); |
1145 | |
|
1146 | 0 | if (htab) |
1147 | 0 | (void) hash_search(htab, &itup->t_tid, HASH_FIND, &found); |
1148 | |
|
1149 | 0 | if (found) |
1150 | 0 | continue; |
1151 | | |
1152 | 0 | bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup), |
1153 | 0 | maxbucket, highmask, lowmask); |
1154 | |
|
1155 | 0 | if (bucket == nbucket) |
1156 | 0 | { |
1157 | 0 | IndexTuple new_itup; |
1158 | | |
1159 | | /* |
1160 | | * make a copy of index tuple as we have to scribble on it. |
1161 | | */ |
1162 | 0 | new_itup = CopyIndexTuple(itup); |
1163 | | |
1164 | | /* |
1165 | | * mark the index tuple as moved by split, such tuples are |
1166 | | * skipped by scan if there is split in progress for a bucket. |
1167 | | */ |
1168 | 0 | new_itup->t_info |= INDEX_MOVED_BY_SPLIT_MASK; |
1169 | | |
1170 | | /* |
1171 | | * insert the tuple into the new bucket. if it doesn't fit on |
1172 | | * the current page in the new bucket, we must allocate a new |
1173 | | * overflow page and place the tuple on that page instead. |
1174 | | */ |
1175 | 0 | itemsz = IndexTupleSize(new_itup); |
1176 | 0 | itemsz = MAXALIGN(itemsz); |
1177 | |
|
1178 | 0 | if (PageGetFreeSpaceForMultipleTuples(npage, nitups + 1) < (all_tups_size + itemsz)) |
1179 | 0 | { |
1180 | | /* |
1181 | | * Change the shared buffer state in critical section, |
1182 | | * otherwise any error could make it unrecoverable. |
1183 | | */ |
1184 | 0 | START_CRIT_SECTION(); |
1185 | |
|
1186 | 0 | _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); |
1187 | 0 | MarkBufferDirty(nbuf); |
1188 | | /* log the split operation before releasing the lock */ |
1189 | 0 | log_split_page(rel, nbuf); |
1190 | |
|
1191 | 0 | END_CRIT_SECTION(); |
1192 | | |
1193 | | /* drop lock, but keep pin */ |
1194 | 0 | LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); |
1195 | | |
1196 | | /* be tidy */ |
1197 | 0 | for (i = 0; i < nitups; i++) |
1198 | 0 | pfree(itups[i]); |
1199 | 0 | nitups = 0; |
1200 | 0 | all_tups_size = 0; |
1201 | | |
1202 | | /* chain to a new overflow page */ |
1203 | 0 | nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf)); |
1204 | 0 | npage = BufferGetPage(nbuf); |
1205 | 0 | nopaque = HashPageGetOpaque(npage); |
1206 | 0 | } |
1207 | |
|
1208 | 0 | itups[nitups++] = new_itup; |
1209 | 0 | all_tups_size += itemsz; |
1210 | 0 | } |
1211 | 0 | else |
1212 | 0 | { |
1213 | | /* |
1214 | | * the tuple stays on this page, so nothing to do. |
1215 | | */ |
1216 | 0 | Assert(bucket == obucket); |
1217 | 0 | } |
1218 | 0 | } |
1219 | |
|
1220 | 0 | oblkno = oopaque->hasho_nextblkno; |
1221 | | |
1222 | | /* retain the pin on the old primary bucket */ |
1223 | 0 | if (obuf == bucket_obuf) |
1224 | 0 | LockBuffer(obuf, BUFFER_LOCK_UNLOCK); |
1225 | 0 | else |
1226 | 0 | _hash_relbuf(rel, obuf); |
1227 | | |
1228 | | /* Exit loop if no more overflow pages in old bucket */ |
1229 | 0 | if (!BlockNumberIsValid(oblkno)) |
1230 | 0 | { |
1231 | | /* |
1232 | | * Change the shared buffer state in critical section, otherwise |
1233 | | * any error could make it unrecoverable. |
1234 | | */ |
1235 | 0 | START_CRIT_SECTION(); |
1236 | |
|
1237 | 0 | _hash_pgaddmultitup(rel, nbuf, itups, itup_offsets, nitups); |
1238 | 0 | MarkBufferDirty(nbuf); |
1239 | | /* log the split operation before releasing the lock */ |
1240 | 0 | log_split_page(rel, nbuf); |
1241 | |
|
1242 | 0 | END_CRIT_SECTION(); |
1243 | |
|
1244 | 0 | if (nbuf == bucket_nbuf) |
1245 | 0 | LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); |
1246 | 0 | else |
1247 | 0 | _hash_relbuf(rel, nbuf); |
1248 | | |
1249 | | /* be tidy */ |
1250 | 0 | for (i = 0; i < nitups; i++) |
1251 | 0 | pfree(itups[i]); |
1252 | 0 | break; |
1253 | 0 | } |
1254 | | |
1255 | | /* Else, advance to next old page */ |
1256 | 0 | obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE); |
1257 | 0 | opage = BufferGetPage(obuf); |
1258 | 0 | oopaque = HashPageGetOpaque(opage); |
1259 | 0 | } |
1260 | | |
1261 | | /* |
1262 | | * We're at the end of the old bucket chain, so we're done partitioning |
1263 | | * the tuples. Mark the old and new buckets to indicate split is |
1264 | | * finished. |
1265 | | * |
1266 | | * To avoid deadlocks due to locking order of buckets, first lock the old |
1267 | | * bucket and then the new bucket. |
1268 | | */ |
1269 | 0 | LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE); |
1270 | 0 | opage = BufferGetPage(bucket_obuf); |
1271 | 0 | oopaque = HashPageGetOpaque(opage); |
1272 | |
|
1273 | 0 | LockBuffer(bucket_nbuf, BUFFER_LOCK_EXCLUSIVE); |
1274 | 0 | npage = BufferGetPage(bucket_nbuf); |
1275 | 0 | nopaque = HashPageGetOpaque(npage); |
1276 | |
|
1277 | 0 | START_CRIT_SECTION(); |
1278 | |
|
1279 | 0 | oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT; |
1280 | 0 | nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED; |
1281 | | |
1282 | | /* |
1283 | | * After the split is finished, mark the old bucket to indicate that it |
1284 | | * contains deletable tuples. We will clear split-cleanup flag after |
1285 | | * deleting such tuples either at the end of split or at the next split |
1286 | | * from old bucket or at the time of vacuum. |
1287 | | */ |
1288 | 0 | oopaque->hasho_flag |= LH_BUCKET_NEEDS_SPLIT_CLEANUP; |
1289 | | |
1290 | | /* |
1291 | | * now write the buffers, here we don't release the locks as caller is |
1292 | | * responsible to release locks. |
1293 | | */ |
1294 | 0 | MarkBufferDirty(bucket_obuf); |
1295 | 0 | MarkBufferDirty(bucket_nbuf); |
1296 | |
|
1297 | 0 | if (RelationNeedsWAL(rel)) |
1298 | 0 | { |
1299 | 0 | XLogRecPtr recptr; |
1300 | 0 | xl_hash_split_complete xlrec; |
1301 | |
|
1302 | 0 | xlrec.old_bucket_flag = oopaque->hasho_flag; |
1303 | 0 | xlrec.new_bucket_flag = nopaque->hasho_flag; |
1304 | |
|
1305 | 0 | XLogBeginInsert(); |
1306 | |
|
1307 | 0 | XLogRegisterData(&xlrec, SizeOfHashSplitComplete); |
1308 | |
|
1309 | 0 | XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD); |
1310 | 0 | XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD); |
1311 | |
|
1312 | 0 | recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE); |
1313 | |
|
1314 | 0 | PageSetLSN(BufferGetPage(bucket_obuf), recptr); |
1315 | 0 | PageSetLSN(BufferGetPage(bucket_nbuf), recptr); |
1316 | 0 | } |
1317 | |
|
1318 | 0 | END_CRIT_SECTION(); |
1319 | | |
1320 | | /* |
1321 | | * If possible, clean up the old bucket. We might not be able to do this |
1322 | | * if someone else has a pin on it, but if not then we can go ahead. This |
1323 | | * isn't absolutely necessary, but it reduces bloat; if we don't do it |
1324 | | * now, VACUUM will do it eventually, but maybe not until new overflow |
1325 | | * pages have been allocated. Note that there's no need to clean up the |
1326 | | * new bucket. |
1327 | | */ |
1328 | 0 | if (IsBufferCleanupOK(bucket_obuf)) |
1329 | 0 | { |
1330 | 0 | LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); |
1331 | 0 | hashbucketcleanup(rel, obucket, bucket_obuf, |
1332 | 0 | BufferGetBlockNumber(bucket_obuf), NULL, |
1333 | 0 | maxbucket, highmask, lowmask, NULL, NULL, true, |
1334 | 0 | NULL, NULL); |
1335 | 0 | } |
1336 | 0 | else |
1337 | 0 | { |
1338 | 0 | LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); |
1339 | 0 | LockBuffer(bucket_obuf, BUFFER_LOCK_UNLOCK); |
1340 | 0 | } |
1341 | 0 | } |
1342 | | |
1343 | | /* |
1344 | | * _hash_finish_split() -- Finish the previously interrupted split operation |
1345 | | * |
1346 | | * To complete the split operation, we build a hash table of the TIDs already |
1347 | | * present in the new bucket; the split then uses it to skip tuples that were |
1348 | | * already moved before the split operation was interrupted. |
1349 | | * |
1350 | | * The caller must hold a pin, but no lock, on the metapage and old bucket's |
1351 | | * primary page buffer. The buffers are returned in the same state. (The |
1352 | | * metapage is only touched if it becomes necessary to add or remove overflow |
1353 | | * pages.) |
1354 | | */ |
1355 | | void |
1356 | | _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, |
1357 | | uint32 maxbucket, uint32 highmask, uint32 lowmask) |
1358 | 0 | { |
1359 | 0 | HASHCTL hash_ctl; |
1360 | 0 | HTAB *tidhtab; |
1361 | 0 | Buffer bucket_nbuf = InvalidBuffer; |
1362 | 0 | Buffer nbuf; |
1363 | 0 | Page npage; |
1364 | 0 | BlockNumber nblkno; |
1365 | 0 | BlockNumber bucket_nblkno; |
1366 | 0 | HashPageOpaque npageopaque; |
1367 | 0 | Bucket nbucket; |
1368 | 0 | bool found; |
1369 | | |
1370 | | /* Initialize hash tables used to track TIDs */ |
1371 | 0 | hash_ctl.keysize = sizeof(ItemPointerData); |
1372 | 0 | hash_ctl.entrysize = sizeof(ItemPointerData); |
1373 | 0 | hash_ctl.hcxt = CurrentMemoryContext; |
1374 | |
|
1375 | 0 | tidhtab = |
1376 | 0 | hash_create("bucket ctids", |
1377 | 0 | 256, /* arbitrary initial size */ |
1378 | 0 | &hash_ctl, |
1379 | 0 | HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
1380 | |
|
1381 | 0 | bucket_nblkno = nblkno = _hash_get_newblock_from_oldbucket(rel, obucket); |
1382 | | |
1383 | | /* |
1384 | | * Scan the new bucket and build hash table of TIDs |
1385 | | */ |
1386 | 0 | for (;;) |
1387 | 0 | { |
1388 | 0 | OffsetNumber noffnum; |
1389 | 0 | OffsetNumber nmaxoffnum; |
1390 | |
|
1391 | 0 | nbuf = _hash_getbuf(rel, nblkno, HASH_READ, |
1392 | 0 | LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); |
1393 | | |
1394 | | /* remember the primary bucket buffer to acquire cleanup lock on it. */ |
1395 | 0 | if (nblkno == bucket_nblkno) |
1396 | 0 | bucket_nbuf = nbuf; |
1397 | |
|
1398 | 0 | npage = BufferGetPage(nbuf); |
1399 | 0 | npageopaque = HashPageGetOpaque(npage); |
1400 | | |
1401 | | /* Scan each tuple in new page */ |
1402 | 0 | nmaxoffnum = PageGetMaxOffsetNumber(npage); |
1403 | 0 | for (noffnum = FirstOffsetNumber; |
1404 | 0 | noffnum <= nmaxoffnum; |
1405 | 0 | noffnum = OffsetNumberNext(noffnum)) |
1406 | 0 | { |
1407 | 0 | IndexTuple itup; |
1408 | | |
1409 | | /* Fetch the item's TID and insert it in hash table. */ |
1410 | 0 | itup = (IndexTuple) PageGetItem(npage, |
1411 | 0 | PageGetItemId(npage, noffnum)); |
1412 | |
|
1413 | 0 | (void) hash_search(tidhtab, &itup->t_tid, HASH_ENTER, &found); |
1414 | |
|
1415 | 0 | Assert(!found); |
1416 | 0 | } |
1417 | |
|
1418 | 0 | nblkno = npageopaque->hasho_nextblkno; |
1419 | | |
1420 | | /* |
1421 | | * release our lock without modifying the buffer, and make sure to |
1422 | | * retain the pin on the primary bucket. |
1423 | | */ |
1424 | 0 | if (nbuf == bucket_nbuf) |
1425 | 0 | LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); |
1426 | 0 | else |
1427 | 0 | _hash_relbuf(rel, nbuf); |
1428 | | |
1429 | | /* Exit loop if no more overflow pages in new bucket */ |
1430 | 0 | if (!BlockNumberIsValid(nblkno)) |
1431 | 0 | break; |
1432 | 0 | } |
1433 | | |
1434 | | /* |
1435 | | * Conditionally get the cleanup lock on old and new buckets to perform |
1436 | | * the split operation. If we don't get the cleanup locks, silently give |
1437 | | * up and next insertion on old bucket will try again to complete the |
1438 | | * split. |
1439 | | */ |
1440 | 0 | if (!ConditionalLockBufferForCleanup(obuf)) |
1441 | 0 | { |
1442 | 0 | hash_destroy(tidhtab); |
1443 | 0 | return; |
1444 | 0 | } |
1445 | 0 | if (!ConditionalLockBufferForCleanup(bucket_nbuf)) |
1446 | 0 | { |
1447 | 0 | LockBuffer(obuf, BUFFER_LOCK_UNLOCK); |
1448 | 0 | hash_destroy(tidhtab); |
1449 | 0 | return; |
1450 | 0 | } |
1451 | | |
1452 | 0 | npage = BufferGetPage(bucket_nbuf); |
1453 | 0 | npageopaque = HashPageGetOpaque(npage); |
1454 | 0 | nbucket = npageopaque->hasho_bucket; |
1455 | |
|
1456 | 0 | _hash_splitbucket(rel, metabuf, obucket, |
1457 | 0 | nbucket, obuf, bucket_nbuf, tidhtab, |
1458 | 0 | maxbucket, highmask, lowmask); |
1459 | |
|
1460 | 0 | _hash_dropbuf(rel, bucket_nbuf); |
1461 | 0 | hash_destroy(tidhtab); |
1462 | 0 | } |
1463 | | |
1464 | | /* |
1465 | | * log_split_page() -- Log the split operation |
1466 | | * |
1467 | | * We log the split operation when the new page in new bucket gets full, |
1468 | | * so we log the entire page. |
1469 | | * |
1470 | | * 'buf' must be locked by the caller which is also responsible for unlocking |
1471 | | * it. |
1472 | | */ |
1473 | | static void |
1474 | | log_split_page(Relation rel, Buffer buf) |
1475 | 0 | { |
1476 | 0 | if (RelationNeedsWAL(rel)) |
1477 | 0 | { |
1478 | 0 | XLogRecPtr recptr; |
1479 | |
|
1480 | 0 | XLogBeginInsert(); |
1481 | |
|
1482 | 0 | XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); |
1483 | |
|
1484 | 0 | recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE); |
1485 | |
|
1486 | 0 | PageSetLSN(BufferGetPage(buf), recptr); |
1487 | 0 | } |
1488 | 0 | } |
1489 | | |
1490 | | /* |
1491 | | * _hash_getcachedmetap() -- Returns cached metapage data. |
1492 | | * |
1493 | | * If metabuf is not InvalidBuffer, caller must hold a pin, but no lock, on |
1494 | | * the metapage. If it is InvalidBuffer and we have to refresh the cache, |
1495 | | * we'll set *metabuf before returning, holding a pin but no lock on it; |
1496 | | * the caller is responsible for releasing the pin. |
1497 | | * |
1498 | | * We refresh the cache if it's not initialized yet or force_refresh is true. |
1499 | | */ |
1500 | | HashMetaPage |
1501 | | _hash_getcachedmetap(Relation rel, Buffer *metabuf, bool force_refresh) |
1502 | 0 | { |
1503 | 0 | Page page; |
1504 | |
|
1505 | 0 | Assert(metabuf); |
1506 | 0 | if (force_refresh || rel->rd_amcache == NULL) |
1507 | 0 | { |
1508 | 0 | char *cache = NULL; |
1509 | | |
1510 | | /* |
1511 | | * It's important that we don't set rd_amcache to an invalid value. |
1512 | | * Either MemoryContextAlloc or _hash_getbuf could fail, so don't |
1513 | | * install a pointer to the newly-allocated storage in the actual |
1514 | | * relcache entry until both have succeeded. |
1515 | | */ |
1516 | 0 | if (rel->rd_amcache == NULL) |
1517 | 0 | cache = MemoryContextAlloc(rel->rd_indexcxt, |
1518 | 0 | sizeof(HashMetaPageData)); |
1519 | | |
1520 | | /* Read the metapage. */ |
1521 | 0 | if (BufferIsValid(*metabuf)) |
1522 | 0 | LockBuffer(*metabuf, BUFFER_LOCK_SHARE); |
1523 | 0 | else |
1524 | 0 | *metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, |
1525 | 0 | LH_META_PAGE); |
1526 | 0 | page = BufferGetPage(*metabuf); |
1527 | | |
1528 | | /* Populate the cache. */ |
1529 | 0 | if (rel->rd_amcache == NULL) |
1530 | 0 | rel->rd_amcache = cache; |
1531 | 0 | memcpy(rel->rd_amcache, HashPageGetMeta(page), |
1532 | 0 | sizeof(HashMetaPageData)); |
1533 | | |
1534 | | /* Release metapage lock, but keep the pin. */ |
1535 | 0 | LockBuffer(*metabuf, BUFFER_LOCK_UNLOCK); |
1536 | 0 | } |
1537 | |
|
1538 | 0 | return (HashMetaPage) rel->rd_amcache; |
1539 | 0 | } |
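| | /*
| |  * Editor's sketch, not part of hashpage.c: the intended calling pattern,
| |  * mirroring _hash_getbucketbuf_from_hashkey() below. The wrapper name
| |  * cached_maxbucket_example is hypothetical.
| |  */
| | static uint32
| | cached_maxbucket_example(Relation rel)
| | {
| |     Buffer          metabuf = InvalidBuffer;
| |     HashMetaPage    metap = _hash_getcachedmetap(rel, &metabuf, false);
| |     uint32          maxbucket = metap->hashm_maxbucket;
| |
| |     /* a pin was taken only if the cache had to be (re)filled */
| |     if (BufferIsValid(metabuf))
| |         _hash_dropbuf(rel, metabuf);
| |
| |     return maxbucket;
| | }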
1540 | | |
1541 | | /* |
1542 | | * _hash_getbucketbuf_from_hashkey() -- Get the bucket's buffer for the given |
1543 | | * hashkey. |
1544 | | * |
1545 | | * Bucket pages do not move or get removed once they are allocated. This gives |
1546 | | * us an opportunity to use the previously saved metapage contents to reach |
1547 | | * the target bucket buffer, instead of reading from the metapage every time. |
1548 | | * This saves one buffer access each time we need to reach the target bucket |
1549 | | * buffer, which is a worthwhile saving in bufmgr traffic and contention. |
1550 | | * |
1551 | | * The access type parameter (HASH_READ or HASH_WRITE) indicates whether the |
1552 | | * bucket buffer has to be locked for reading or writing. |
1553 | | * |
1554 | | * The out parameter cachedmetap is set with metapage contents used for |
1555 | | * hashkey to bucket buffer mapping. Some callers need this info to reach the |
1556 | | * old bucket in case of bucket split, see _hash_doinsert(). |
1557 | | */ |
1558 | | Buffer |
1559 | | _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey, int access, |
1560 | | HashMetaPage *cachedmetap) |
1561 | 0 | { |
1562 | 0 | HashMetaPage metap; |
1563 | 0 | Buffer buf; |
1564 | 0 | Buffer metabuf = InvalidBuffer; |
1565 | 0 | Page page; |
1566 | 0 | Bucket bucket; |
1567 | 0 | BlockNumber blkno; |
1568 | 0 | HashPageOpaque opaque; |
1569 | | |
1570 | | /* We read from the target bucket buffer, so locking it is a must. */ |
1571 | 0 | Assert(access == HASH_READ || access == HASH_WRITE); |
1572 | |
|
1573 | 0 | metap = _hash_getcachedmetap(rel, &metabuf, false); |
1574 | 0 | Assert(metap != NULL); |
1575 | | |
1576 | | /* |
1577 | | * Loop until we get a lock on the correct target bucket. |
1578 | | */ |
1579 | 0 | for (;;) |
1580 | 0 | { |
1581 | | /* |
1582 | | * Compute the target bucket number, and convert to block number. |
1583 | | */ |
1584 | 0 | bucket = _hash_hashkey2bucket(hashkey, |
1585 | 0 | metap->hashm_maxbucket, |
1586 | 0 | metap->hashm_highmask, |
1587 | 0 | metap->hashm_lowmask); |
1588 | |
|
1589 | 0 | blkno = BUCKET_TO_BLKNO(metap, bucket); |
1590 | | |
1591 | | /* Fetch the primary bucket page for the bucket */ |
1592 | 0 | buf = _hash_getbuf(rel, blkno, access, LH_BUCKET_PAGE); |
1593 | 0 | page = BufferGetPage(buf); |
1594 | 0 | opaque = HashPageGetOpaque(page); |
1595 | 0 | Assert(opaque->hasho_bucket == bucket); |
1596 | 0 | Assert(opaque->hasho_prevblkno != InvalidBlockNumber); |
1597 | | |
1598 | | /* |
1599 | | * If this bucket hasn't been split, we're done. |
1600 | | */ |
1601 | 0 | if (opaque->hasho_prevblkno <= metap->hashm_maxbucket) |
1602 | 0 | break; |
1603 | | |
1604 | | /* Drop lock on this buffer, update cached metapage, and retry. */ |
1605 | 0 | _hash_relbuf(rel, buf); |
1606 | 0 | metap = _hash_getcachedmetap(rel, &metabuf, true); |
1607 | 0 | Assert(metap != NULL); |
1608 | 0 | } |
1609 | |
|
1610 | 0 | if (BufferIsValid(metabuf)) |
1611 | 0 | _hash_dropbuf(rel, metabuf); |
1612 | |
|
1613 | 0 | if (cachedmetap) |
1614 | 0 | *cachedmetap = metap; |
1615 | |
|
1616 | 0 | return buf; |
1617 | 0 | } |
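| | /*
| |  * Editor's sketch, not part of hashpage.c: typical use from a scan or
| |  * insert path (cf. _hash_first()/_hash_doinsert(); hedged paraphrase, and
| |  * the function name bucket_lookup_example is hypothetical).
| |  */
| | static void
| | bucket_lookup_example(Relation rel, uint32 hashkey)
| | {
| |     HashMetaPage    cachedmetap;
| |     Buffer          buf;
| |
| |     buf = _hash_getbucketbuf_from_hashkey(rel, hashkey, HASH_READ,
| |                                           &cachedmetap);
| |     /* ... walk the bucket chain, starting at BufferGetPage(buf) ... */
| |     _hash_relbuf(rel, buf);
| | }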