/src/postgres/src/backend/access/gist/gistbuildbuffers.c
Line | Count | Source (jump to first uncovered line) |
1 | | /*------------------------------------------------------------------------- |
2 | | * |
3 | | * gistbuildbuffers.c |
4 | | * node buffer management functions for GiST buffering build algorithm. |
5 | | * |
6 | | * |
7 | | * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group |
8 | | * Portions Copyright (c) 1994, Regents of the University of California |
9 | | * |
10 | | * IDENTIFICATION |
11 | | * src/backend/access/gist/gistbuildbuffers.c |
12 | | * |
13 | | *------------------------------------------------------------------------- |
14 | | */ |
15 | | #include "postgres.h" |
16 | | |
17 | | #include "access/gist_private.h" |
18 | | #include "storage/buffile.h" |
19 | | #include "storage/bufmgr.h" |
20 | | #include "utils/rel.h" |
21 | | |
22 | | static GISTNodeBufferPage *gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb); |
23 | | static void gistAddLoadedBuffer(GISTBuildBuffers *gfbb, |
24 | | GISTNodeBuffer *nodeBuffer); |
25 | | static void gistLoadNodeBuffer(GISTBuildBuffers *gfbb, |
26 | | GISTNodeBuffer *nodeBuffer); |
27 | | static void gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, |
28 | | GISTNodeBuffer *nodeBuffer); |
29 | | static void gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, |
30 | | IndexTuple itup); |
31 | | static void gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, |
32 | | IndexTuple *itup); |
33 | | static long gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb); |
34 | | static void gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum); |
35 | | |
36 | | static void ReadTempFileBlock(BufFile *file, long blknum, void *ptr); |
37 | | static void WriteTempFileBlock(BufFile *file, long blknum, const void *ptr); |
38 | | |
39 | | |
40 | | /* |
41 | | * Initialize GiST build buffers. |
42 | | */ |
43 | | GISTBuildBuffers * |
44 | | gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel) |
45 | 0 | { |
46 | 0 | GISTBuildBuffers *gfbb; |
47 | 0 | HASHCTL hashCtl; |
48 | |
|
49 | 0 | gfbb = palloc(sizeof(GISTBuildBuffers)); |
50 | 0 | gfbb->pagesPerBuffer = pagesPerBuffer; |
51 | 0 | gfbb->levelStep = levelStep; |
52 | | |
53 | | /* |
54 | | * Create a temporary file to hold buffer pages that are swapped out of |
55 | | * memory. |
56 | | */ |
57 | 0 | gfbb->pfile = BufFileCreateTemp(false); |
58 | 0 | gfbb->nFileBlocks = 0; |
59 | | |
60 | | /* Initialize free page management. */ |
61 | 0 | gfbb->nFreeBlocks = 0; |
62 | 0 | gfbb->freeBlocksLen = 32; |
63 | 0 | gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long)); |
64 | | |
65 | | /* |
66 | | * Current memory context will be used for all in-memory data structures |
67 | | * of buffers which are persistent during buffering build. |
68 | | */ |
69 | 0 | gfbb->context = CurrentMemoryContext; |
70 | | |
71 | | /* |
72 | | * nodeBuffersTab hash is association between index blocks and it's |
73 | | * buffers. |
74 | | */ |
75 | 0 | hashCtl.keysize = sizeof(BlockNumber); |
76 | 0 | hashCtl.entrysize = sizeof(GISTNodeBuffer); |
77 | 0 | hashCtl.hcxt = CurrentMemoryContext; |
78 | 0 | gfbb->nodeBuffersTab = hash_create("gistbuildbuffers", |
79 | 0 | 1024, |
80 | 0 | &hashCtl, |
81 | 0 | HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); |
82 | |
|
83 | 0 | gfbb->bufferEmptyingQueue = NIL; |
84 | | |
85 | | /* |
86 | | * Per-level node buffers lists for final buffers emptying process. Node |
87 | | * buffers are inserted here when they are created. |
88 | | */ |
89 | 0 | gfbb->buffersOnLevelsLen = 1; |
90 | 0 | gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) * |
91 | 0 | gfbb->buffersOnLevelsLen); |
92 | 0 | gfbb->buffersOnLevels[0] = NIL; |
93 | | |
94 | | /* |
95 | | * Block numbers of node buffers which last pages are currently loaded |
96 | | * into main memory. |
97 | | */ |
98 | 0 | gfbb->loadedBuffersLen = 32; |
99 | 0 | gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen * |
100 | 0 | sizeof(GISTNodeBuffer *)); |
101 | 0 | gfbb->loadedBuffersCount = 0; |
102 | |
|
103 | 0 | gfbb->rootlevel = maxLevel; |
104 | |
|
105 | 0 | return gfbb; |
106 | 0 | } |
107 | | |
108 | | /* |
109 | | * Returns a node buffer for given block. The buffer is created if it |
110 | | * doesn't exist yet. |
111 | | */ |
112 | | GISTNodeBuffer * |
113 | | gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate, |
114 | | BlockNumber nodeBlocknum, int level) |
115 | 0 | { |
116 | 0 | GISTNodeBuffer *nodeBuffer; |
117 | 0 | bool found; |
118 | | |
119 | | /* Find node buffer in hash table */ |
120 | 0 | nodeBuffer = (GISTNodeBuffer *) hash_search(gfbb->nodeBuffersTab, |
121 | 0 | &nodeBlocknum, |
122 | 0 | HASH_ENTER, |
123 | 0 | &found); |
124 | 0 | if (!found) |
125 | 0 | { |
126 | | /* |
127 | | * Node buffer wasn't found. Initialize the new buffer as empty. |
128 | | */ |
129 | 0 | MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); |
130 | | |
131 | | /* nodeBuffer->nodeBlocknum is the hash key and was filled in already */ |
132 | 0 | nodeBuffer->blocksCount = 0; |
133 | 0 | nodeBuffer->pageBlocknum = InvalidBlockNumber; |
134 | 0 | nodeBuffer->pageBuffer = NULL; |
135 | 0 | nodeBuffer->queuedForEmptying = false; |
136 | 0 | nodeBuffer->isTemp = false; |
137 | 0 | nodeBuffer->level = level; |
138 | | |
139 | | /* |
140 | | * Add this buffer to the list of buffers on this level. Enlarge |
141 | | * buffersOnLevels array if needed. |
142 | | */ |
143 | 0 | if (level >= gfbb->buffersOnLevelsLen) |
144 | 0 | { |
145 | 0 | int i; |
146 | |
|
147 | 0 | gfbb->buffersOnLevels = |
148 | 0 | (List **) repalloc(gfbb->buffersOnLevels, |
149 | 0 | (level + 1) * sizeof(List *)); |
150 | | |
151 | | /* initialize the enlarged portion */ |
152 | 0 | for (i = gfbb->buffersOnLevelsLen; i <= level; i++) |
153 | 0 | gfbb->buffersOnLevels[i] = NIL; |
154 | 0 | gfbb->buffersOnLevelsLen = level + 1; |
155 | 0 | } |
156 | | |
157 | | /* |
158 | | * Prepend the new buffer to the list of buffers on this level. It's |
159 | | * not arbitrary that the new buffer is put to the beginning of the |
160 | | * list: in the final emptying phase we loop through all buffers at |
161 | | * each level, and flush them. If a page is split during the emptying, |
162 | | * it's more efficient to flush the new split pages first, before |
163 | | * moving on to pre-existing pages on the level. The buffers just |
164 | | * created during the page split are likely still in cache, so |
165 | | * flushing them immediately is more efficient than putting them to |
166 | | * the end of the queue. |
167 | | */ |
168 | 0 | gfbb->buffersOnLevels[level] = lcons(nodeBuffer, |
169 | 0 | gfbb->buffersOnLevels[level]); |
170 | |
|
171 | 0 | MemoryContextSwitchTo(oldcxt); |
172 | 0 | } |
173 | |
|
174 | 0 | return nodeBuffer; |
175 | 0 | } |
176 | | |
177 | | /* |
178 | | * Allocate memory for a buffer page. |
179 | | */ |
180 | | static GISTNodeBufferPage * |
181 | | gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb) |
182 | 0 | { |
183 | 0 | GISTNodeBufferPage *pageBuffer; |
184 | |
|
185 | 0 | pageBuffer = (GISTNodeBufferPage *) MemoryContextAllocZero(gfbb->context, |
186 | 0 | BLCKSZ); |
187 | 0 | pageBuffer->prev = InvalidBlockNumber; |
188 | | |
189 | | /* Set page free space */ |
190 | 0 | PAGE_FREE_SPACE(pageBuffer) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET; |
191 | 0 | return pageBuffer; |
192 | 0 | } |
193 | | |
194 | | /* |
195 | | * Add specified buffer into loadedBuffers array. |
196 | | */ |
197 | | static void |
198 | | gistAddLoadedBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) |
199 | 0 | { |
200 | | /* Never add a temporary buffer to the array */ |
201 | 0 | if (nodeBuffer->isTemp) |
202 | 0 | return; |
203 | | |
204 | | /* Enlarge the array if needed */ |
205 | 0 | if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen) |
206 | 0 | { |
207 | 0 | gfbb->loadedBuffersLen *= 2; |
208 | 0 | gfbb->loadedBuffers = (GISTNodeBuffer **) |
209 | 0 | repalloc(gfbb->loadedBuffers, |
210 | 0 | gfbb->loadedBuffersLen * sizeof(GISTNodeBuffer *)); |
211 | 0 | } |
212 | |
|
213 | 0 | gfbb->loadedBuffers[gfbb->loadedBuffersCount] = nodeBuffer; |
214 | 0 | gfbb->loadedBuffersCount++; |
215 | 0 | } |
216 | | |
217 | | /* |
218 | | * Load last page of node buffer into main memory. |
219 | | */ |
220 | | static void |
221 | | gistLoadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) |
222 | 0 | { |
223 | | /* Check if we really should load something */ |
224 | 0 | if (!nodeBuffer->pageBuffer && nodeBuffer->blocksCount > 0) |
225 | 0 | { |
226 | | /* Allocate memory for page */ |
227 | 0 | nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb); |
228 | | |
229 | | /* Read block from temporary file */ |
230 | 0 | ReadTempFileBlock(gfbb->pfile, nodeBuffer->pageBlocknum, |
231 | 0 | nodeBuffer->pageBuffer); |
232 | | |
233 | | /* Mark file block as free */ |
234 | 0 | gistBuffersReleaseBlock(gfbb, nodeBuffer->pageBlocknum); |
235 | | |
236 | | /* Mark node buffer as loaded */ |
237 | 0 | gistAddLoadedBuffer(gfbb, nodeBuffer); |
238 | 0 | nodeBuffer->pageBlocknum = InvalidBlockNumber; |
239 | 0 | } |
240 | 0 | } |
241 | | |
242 | | /* |
243 | | * Write last page of node buffer to the disk. |
244 | | */ |
245 | | static void |
246 | | gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer) |
247 | 0 | { |
248 | | /* Check if we have something to write */ |
249 | 0 | if (nodeBuffer->pageBuffer) |
250 | 0 | { |
251 | 0 | BlockNumber blkno; |
252 | | |
253 | | /* Get free file block */ |
254 | 0 | blkno = gistBuffersGetFreeBlock(gfbb); |
255 | | |
256 | | /* Write block to the temporary file */ |
257 | 0 | WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer); |
258 | | |
259 | | /* Free memory of that page */ |
260 | 0 | pfree(nodeBuffer->pageBuffer); |
261 | 0 | nodeBuffer->pageBuffer = NULL; |
262 | | |
263 | | /* Save block number */ |
264 | 0 | nodeBuffer->pageBlocknum = blkno; |
265 | 0 | } |
266 | 0 | } |
267 | | |
268 | | /* |
269 | | * Write last pages of all node buffers to the disk. |
270 | | */ |
271 | | void |
272 | | gistUnloadNodeBuffers(GISTBuildBuffers *gfbb) |
273 | 0 | { |
274 | 0 | int i; |
275 | | |
276 | | /* Unload all the buffers that have a page loaded in memory. */ |
277 | 0 | for (i = 0; i < gfbb->loadedBuffersCount; i++) |
278 | 0 | gistUnloadNodeBuffer(gfbb, gfbb->loadedBuffers[i]); |
279 | | |
280 | | /* Now there are no node buffers with loaded last page */ |
281 | 0 | gfbb->loadedBuffersCount = 0; |
282 | 0 | } |
283 | | |
284 | | /* |
285 | | * Add index tuple to buffer page. |
286 | | */ |
287 | | static void |
288 | | gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, IndexTuple itup) |
289 | 0 | { |
290 | 0 | Size itupsz = IndexTupleSize(itup); |
291 | 0 | char *ptr; |
292 | | |
293 | | /* There should be enough of space. */ |
294 | 0 | Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(itupsz)); |
295 | | |
296 | | /* Reduce free space value of page to reserve a spot for the tuple. */ |
297 | 0 | PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(itupsz); |
298 | | |
299 | | /* Get pointer to the spot we reserved (ie. end of free space). */ |
300 | 0 | ptr = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET |
301 | 0 | + PAGE_FREE_SPACE(pageBuffer); |
302 | | |
303 | | /* Copy the index tuple there. */ |
304 | 0 | memcpy(ptr, itup, itupsz); |
305 | 0 | } |
306 | | |
307 | | /* |
308 | | * Get last item from buffer page and remove it from page. |
309 | | */ |
310 | | static void |
311 | | gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, IndexTuple *itup) |
312 | 0 | { |
313 | 0 | IndexTuple ptr; |
314 | 0 | Size itupsz; |
315 | |
|
316 | 0 | Assert(!PAGE_IS_EMPTY(pageBuffer)); /* Page shouldn't be empty */ |
317 | | |
318 | | /* Get pointer to last index tuple */ |
319 | 0 | ptr = (IndexTuple) ((char *) pageBuffer |
320 | 0 | + BUFFER_PAGE_DATA_OFFSET |
321 | 0 | + PAGE_FREE_SPACE(pageBuffer)); |
322 | 0 | itupsz = IndexTupleSize(ptr); |
323 | | |
324 | | /* Make a copy of the tuple */ |
325 | 0 | *itup = (IndexTuple) palloc(itupsz); |
326 | 0 | memcpy(*itup, ptr, itupsz); |
327 | | |
328 | | /* Mark the space used by the tuple as free */ |
329 | 0 | PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(itupsz); |
330 | 0 | } |
331 | | |
332 | | /* |
333 | | * Push an index tuple to node buffer. |
334 | | */ |
335 | | void |
336 | | gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, |
337 | | IndexTuple itup) |
338 | 0 | { |
339 | | /* |
340 | | * Most part of memory operations will be in buffering build persistent |
341 | | * context. So, let's switch to it. |
342 | | */ |
343 | 0 | MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context); |
344 | | |
345 | | /* |
346 | | * If the buffer is currently empty, create the first page. |
347 | | */ |
348 | 0 | if (nodeBuffer->blocksCount == 0) |
349 | 0 | { |
350 | 0 | nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb); |
351 | 0 | nodeBuffer->blocksCount = 1; |
352 | 0 | gistAddLoadedBuffer(gfbb, nodeBuffer); |
353 | 0 | } |
354 | | |
355 | | /* Load last page of node buffer if it wasn't in memory already */ |
356 | 0 | if (!nodeBuffer->pageBuffer) |
357 | 0 | gistLoadNodeBuffer(gfbb, nodeBuffer); |
358 | | |
359 | | /* |
360 | | * Check if there is enough space on the last page for the tuple. |
361 | | */ |
362 | 0 | if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup)) |
363 | 0 | { |
364 | | /* |
365 | | * Nope. Swap previous block to disk and allocate a new one. |
366 | | */ |
367 | 0 | BlockNumber blkno; |
368 | | |
369 | | /* Write filled page to the disk */ |
370 | 0 | blkno = gistBuffersGetFreeBlock(gfbb); |
371 | 0 | WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer); |
372 | | |
373 | | /* |
374 | | * Reset the in-memory page as empty, and link the previous block to |
375 | | * the new page by storing its block number in the prev-link. |
376 | | */ |
377 | 0 | PAGE_FREE_SPACE(nodeBuffer->pageBuffer) = |
378 | 0 | BLCKSZ - MAXALIGN(offsetof(GISTNodeBufferPage, tupledata)); |
379 | 0 | nodeBuffer->pageBuffer->prev = blkno; |
380 | | |
381 | | /* We've just added one more page */ |
382 | 0 | nodeBuffer->blocksCount++; |
383 | 0 | } |
384 | |
|
385 | 0 | gistPlaceItupToPage(nodeBuffer->pageBuffer, itup); |
386 | | |
387 | | /* |
388 | | * If the buffer just overflowed, add it to the emptying queue. |
389 | | */ |
390 | 0 | if (BUFFER_HALF_FILLED(nodeBuffer, gfbb) && !nodeBuffer->queuedForEmptying) |
391 | 0 | { |
392 | 0 | gfbb->bufferEmptyingQueue = lcons(nodeBuffer, |
393 | 0 | gfbb->bufferEmptyingQueue); |
394 | 0 | nodeBuffer->queuedForEmptying = true; |
395 | 0 | } |
396 | | |
397 | | /* Restore memory context */ |
398 | 0 | MemoryContextSwitchTo(oldcxt); |
399 | 0 | } |
400 | | |
401 | | /* |
402 | | * Removes one index tuple from node buffer. Returns true if success and false |
403 | | * if node buffer is empty. |
404 | | */ |
405 | | bool |
406 | | gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer, |
407 | | IndexTuple *itup) |
408 | 0 | { |
409 | | /* |
410 | | * If node buffer is empty then return false. |
411 | | */ |
412 | 0 | if (nodeBuffer->blocksCount <= 0) |
413 | 0 | return false; |
414 | | |
415 | | /* Load last page of node buffer if needed */ |
416 | 0 | if (!nodeBuffer->pageBuffer) |
417 | 0 | gistLoadNodeBuffer(gfbb, nodeBuffer); |
418 | | |
419 | | /* |
420 | | * Get index tuple from last non-empty page. |
421 | | */ |
422 | 0 | gistGetItupFromPage(nodeBuffer->pageBuffer, itup); |
423 | | |
424 | | /* |
425 | | * If we just removed the last tuple from the page, fetch previous page on |
426 | | * this node buffer (if any). |
427 | | */ |
428 | 0 | if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer)) |
429 | 0 | { |
430 | 0 | BlockNumber prevblkno; |
431 | | |
432 | | /* |
433 | | * blocksCount includes the page in pageBuffer, so decrease it now. |
434 | | */ |
435 | 0 | nodeBuffer->blocksCount--; |
436 | | |
437 | | /* |
438 | | * If there's more pages, fetch previous one. |
439 | | */ |
440 | 0 | prevblkno = nodeBuffer->pageBuffer->prev; |
441 | 0 | if (prevblkno != InvalidBlockNumber) |
442 | 0 | { |
443 | | /* There is a previous page. Fetch it. */ |
444 | 0 | Assert(nodeBuffer->blocksCount > 0); |
445 | 0 | ReadTempFileBlock(gfbb->pfile, prevblkno, nodeBuffer->pageBuffer); |
446 | | |
447 | | /* |
448 | | * Now that we've read the block in memory, we can release its |
449 | | * on-disk block for reuse. |
450 | | */ |
451 | 0 | gistBuffersReleaseBlock(gfbb, prevblkno); |
452 | 0 | } |
453 | 0 | else |
454 | 0 | { |
455 | | /* No more pages. Free memory. */ |
456 | 0 | Assert(nodeBuffer->blocksCount == 0); |
457 | 0 | pfree(nodeBuffer->pageBuffer); |
458 | 0 | nodeBuffer->pageBuffer = NULL; |
459 | 0 | } |
460 | 0 | } |
461 | 0 | return true; |
462 | 0 | } |
463 | | |
464 | | /* |
465 | | * Select a currently unused block for writing to. |
466 | | */ |
467 | | static long |
468 | | gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb) |
469 | 0 | { |
470 | | /* |
471 | | * If there are multiple free blocks, we select the one appearing last in |
472 | | * freeBlocks[]. If there are none, assign the next block at the end of |
473 | | * the file (causing the file to be extended). |
474 | | */ |
475 | 0 | if (gfbb->nFreeBlocks > 0) |
476 | 0 | return gfbb->freeBlocks[--gfbb->nFreeBlocks]; |
477 | 0 | else |
478 | 0 | return gfbb->nFileBlocks++; |
479 | 0 | } |
480 | | |
481 | | /* |
482 | | * Return a block# to the freelist. |
483 | | */ |
484 | | static void |
485 | | gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum) |
486 | 0 | { |
487 | 0 | int ndx; |
488 | | |
489 | | /* Enlarge freeBlocks array if full. */ |
490 | 0 | if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen) |
491 | 0 | { |
492 | 0 | gfbb->freeBlocksLen *= 2; |
493 | 0 | gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks, |
494 | 0 | gfbb->freeBlocksLen * |
495 | 0 | sizeof(long)); |
496 | 0 | } |
497 | | |
498 | | /* Add blocknum to array */ |
499 | 0 | ndx = gfbb->nFreeBlocks++; |
500 | 0 | gfbb->freeBlocks[ndx] = blocknum; |
501 | 0 | } |
502 | | |
503 | | /* |
504 | | * Free buffering build data structure. |
505 | | */ |
506 | | void |
507 | | gistFreeBuildBuffers(GISTBuildBuffers *gfbb) |
508 | 0 | { |
509 | | /* Close buffers file. */ |
510 | 0 | BufFileClose(gfbb->pfile); |
511 | | |
512 | | /* All other things will be freed on memory context release */ |
513 | 0 | } |
514 | | |
515 | | /* |
516 | | * Data structure representing information about node buffer for index tuples |
517 | | * relocation from split node buffer. |
518 | | */ |
519 | | typedef struct |
520 | | { |
521 | | GISTENTRY entry[INDEX_MAX_KEYS]; |
522 | | bool isnull[INDEX_MAX_KEYS]; |
523 | | GISTPageSplitInfo *splitinfo; |
524 | | GISTNodeBuffer *nodeBuffer; |
525 | | } RelocationBufferInfo; |
526 | | |
527 | | /* |
528 | | * At page split, distribute tuples from the buffer of the split page to |
529 | | * new buffers for the created page halves. This also adjusts the downlinks |
530 | | * in 'splitinfo' to include the tuples in the buffers. |
531 | | */ |
532 | | void |
533 | | gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate, |
534 | | Relation r, int level, |
535 | | Buffer buffer, List *splitinfo) |
536 | 0 | { |
537 | 0 | RelocationBufferInfo *relocationBuffersInfos; |
538 | 0 | bool found; |
539 | 0 | GISTNodeBuffer *nodeBuffer; |
540 | 0 | BlockNumber blocknum; |
541 | 0 | IndexTuple itup; |
542 | 0 | int splitPagesCount = 0; |
543 | 0 | GISTENTRY entry[INDEX_MAX_KEYS]; |
544 | 0 | bool isnull[INDEX_MAX_KEYS]; |
545 | 0 | GISTNodeBuffer oldBuf; |
546 | 0 | ListCell *lc; |
547 | | |
548 | | /* If the split page doesn't have buffers, we have nothing to do. */ |
549 | 0 | if (!LEVEL_HAS_BUFFERS(level, gfbb)) |
550 | 0 | return; |
551 | | |
552 | | /* |
553 | | * Get the node buffer of the split page. |
554 | | */ |
555 | 0 | blocknum = BufferGetBlockNumber(buffer); |
556 | 0 | nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum, |
557 | 0 | HASH_FIND, &found); |
558 | 0 | if (!found) |
559 | 0 | { |
560 | | /* The page has no buffer, so we have nothing to do. */ |
561 | 0 | return; |
562 | 0 | } |
563 | | |
564 | | /* |
565 | | * Make a copy of the old buffer, as we're going reuse it as the buffer |
566 | | * for the new left page, which is on the same block as the old page. |
567 | | * That's not true for the root page, but that's fine because we never |
568 | | * have a buffer on the root page anyway. The original algorithm as |
569 | | * described by Arge et al did, but it's of no use, as you might as well |
570 | | * read the tuples straight from the heap instead of the root buffer. |
571 | | */ |
572 | 0 | Assert(blocknum != GIST_ROOT_BLKNO); |
573 | 0 | memcpy(&oldBuf, nodeBuffer, sizeof(GISTNodeBuffer)); |
574 | 0 | oldBuf.isTemp = true; |
575 | | |
576 | | /* Reset the old buffer, used for the new left page from now on */ |
577 | 0 | nodeBuffer->blocksCount = 0; |
578 | 0 | nodeBuffer->pageBuffer = NULL; |
579 | 0 | nodeBuffer->pageBlocknum = InvalidBlockNumber; |
580 | | |
581 | | /* |
582 | | * Allocate memory for information about relocation buffers. |
583 | | */ |
584 | 0 | splitPagesCount = list_length(splitinfo); |
585 | 0 | relocationBuffersInfos = |
586 | 0 | (RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) * |
587 | 0 | splitPagesCount); |
588 | | |
589 | | /* |
590 | | * Fill relocation buffers information for node buffers of pages produced |
591 | | * by split. |
592 | | */ |
593 | 0 | foreach(lc, splitinfo) |
594 | 0 | { |
595 | 0 | GISTPageSplitInfo *si = (GISTPageSplitInfo *) lfirst(lc); |
596 | 0 | GISTNodeBuffer *newNodeBuffer; |
597 | 0 | int i = foreach_current_index(lc); |
598 | | |
599 | | /* Decompress parent index tuple of node buffer page. */ |
600 | 0 | gistDeCompressAtt(giststate, r, |
601 | 0 | si->downlink, NULL, (OffsetNumber) 0, |
602 | 0 | relocationBuffersInfos[i].entry, |
603 | 0 | relocationBuffersInfos[i].isnull); |
604 | | |
605 | | /* |
606 | | * Create a node buffer for the page. The leftmost half is on the same |
607 | | * block as the old page before split, so for the leftmost half this |
608 | | * will return the original buffer. The tuples on the original buffer |
609 | | * were relinked to the temporary buffer, so the original one is now |
610 | | * empty. |
611 | | */ |
612 | 0 | newNodeBuffer = gistGetNodeBuffer(gfbb, giststate, BufferGetBlockNumber(si->buf), level); |
613 | |
|
614 | 0 | relocationBuffersInfos[i].nodeBuffer = newNodeBuffer; |
615 | 0 | relocationBuffersInfos[i].splitinfo = si; |
616 | 0 | } |
617 | | |
618 | | /* |
619 | | * Loop through all index tuples in the buffer of the page being split, |
620 | | * moving them to buffers for the new pages. We try to move each tuple to |
621 | | * the page that will result in the lowest penalty for the leading column |
622 | | * or, in the case of a tie, the lowest penalty for the earliest column |
623 | | * that is not tied. |
624 | | * |
625 | | * The page searching logic is very similar to gistchoose(). |
626 | | */ |
627 | 0 | while (gistPopItupFromNodeBuffer(gfbb, &oldBuf, &itup)) |
628 | 0 | { |
629 | 0 | float best_penalty[INDEX_MAX_KEYS]; |
630 | 0 | int i, |
631 | 0 | which; |
632 | 0 | IndexTuple newtup; |
633 | 0 | RelocationBufferInfo *targetBufferInfo; |
634 | |
|
635 | 0 | gistDeCompressAtt(giststate, r, |
636 | 0 | itup, NULL, (OffsetNumber) 0, entry, isnull); |
637 | | |
638 | | /* default to using first page (shouldn't matter) */ |
639 | 0 | which = 0; |
640 | | |
641 | | /* |
642 | | * best_penalty[j] is the best penalty we have seen so far for column |
643 | | * j, or -1 when we haven't yet examined column j. Array entries to |
644 | | * the right of the first -1 are undefined. |
645 | | */ |
646 | 0 | best_penalty[0] = -1; |
647 | | |
648 | | /* |
649 | | * Loop over possible target pages, looking for one to move this tuple |
650 | | * to. |
651 | | */ |
652 | 0 | for (i = 0; i < splitPagesCount; i++) |
653 | 0 | { |
654 | 0 | RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i]; |
655 | 0 | bool zero_penalty; |
656 | 0 | int j; |
657 | |
|
658 | 0 | zero_penalty = true; |
659 | | |
660 | | /* Loop over index attributes. */ |
661 | 0 | for (j = 0; j < IndexRelationGetNumberOfKeyAttributes(r); j++) |
662 | 0 | { |
663 | 0 | float usize; |
664 | | |
665 | | /* Compute penalty for this column. */ |
666 | 0 | usize = gistpenalty(giststate, j, |
667 | 0 | &splitPageInfo->entry[j], |
668 | 0 | splitPageInfo->isnull[j], |
669 | 0 | &entry[j], isnull[j]); |
670 | 0 | if (usize > 0) |
671 | 0 | zero_penalty = false; |
672 | |
|
673 | 0 | if (best_penalty[j] < 0 || usize < best_penalty[j]) |
674 | 0 | { |
675 | | /* |
676 | | * New best penalty for column. Tentatively select this |
677 | | * page as the target, and record the best penalty. Then |
678 | | * reset the next column's penalty to "unknown" (and |
679 | | * indirectly, the same for all the ones to its right). |
680 | | * This will force us to adopt this page's penalty values |
681 | | * as the best for all the remaining columns during |
682 | | * subsequent loop iterations. |
683 | | */ |
684 | 0 | which = i; |
685 | 0 | best_penalty[j] = usize; |
686 | |
|
687 | 0 | if (j < IndexRelationGetNumberOfKeyAttributes(r) - 1) |
688 | 0 | best_penalty[j + 1] = -1; |
689 | 0 | } |
690 | 0 | else if (best_penalty[j] == usize) |
691 | 0 | { |
692 | | /* |
693 | | * The current page is exactly as good for this column as |
694 | | * the best page seen so far. The next iteration of this |
695 | | * loop will compare the next column. |
696 | | */ |
697 | 0 | } |
698 | 0 | else |
699 | 0 | { |
700 | | /* |
701 | | * The current page is worse for this column than the best |
702 | | * page seen so far. Skip the remaining columns and move |
703 | | * on to the next page, if any. |
704 | | */ |
705 | 0 | zero_penalty = false; /* so outer loop won't exit */ |
706 | 0 | break; |
707 | 0 | } |
708 | 0 | } |
709 | | |
710 | | /* |
711 | | * If we find a page with zero penalty for all columns, there's no |
712 | | * need to examine remaining pages; just break out of the loop and |
713 | | * return it. |
714 | | */ |
715 | 0 | if (zero_penalty) |
716 | 0 | break; |
717 | 0 | } |
718 | | |
719 | | /* OK, "which" is the page index to push the tuple to */ |
720 | 0 | targetBufferInfo = &relocationBuffersInfos[which]; |
721 | | |
722 | | /* Push item to selected node buffer */ |
723 | 0 | gistPushItupToNodeBuffer(gfbb, targetBufferInfo->nodeBuffer, itup); |
724 | | |
725 | | /* Adjust the downlink for this page, if needed. */ |
726 | 0 | newtup = gistgetadjusted(r, targetBufferInfo->splitinfo->downlink, |
727 | 0 | itup, giststate); |
728 | 0 | if (newtup) |
729 | 0 | { |
730 | 0 | gistDeCompressAtt(giststate, r, |
731 | 0 | newtup, NULL, (OffsetNumber) 0, |
732 | 0 | targetBufferInfo->entry, |
733 | 0 | targetBufferInfo->isnull); |
734 | |
|
735 | 0 | targetBufferInfo->splitinfo->downlink = newtup; |
736 | 0 | } |
737 | 0 | } |
738 | |
|
739 | 0 | pfree(relocationBuffersInfos); |
740 | 0 | } |
741 | | |
742 | | |
743 | | /* |
744 | | * Wrappers around BufFile operations. The main difference is that these |
745 | | * wrappers report errors with ereport(), so that the callers don't need |
746 | | * to check the return code. |
747 | | */ |
748 | | |
749 | | static void |
750 | | ReadTempFileBlock(BufFile *file, long blknum, void *ptr) |
751 | 0 | { |
752 | 0 | if (BufFileSeekBlock(file, blknum) != 0) |
753 | 0 | elog(ERROR, "could not seek to block %ld in temporary file", blknum); |
754 | 0 | BufFileReadExact(file, ptr, BLCKSZ); |
755 | 0 | } |
756 | | |
757 | | static void |
758 | | WriteTempFileBlock(BufFile *file, long blknum, const void *ptr) |
759 | 0 | { |
760 | 0 | if (BufFileSeekBlock(file, blknum) != 0) |
761 | 0 | elog(ERROR, "could not seek to block %ld in temporary file", blknum); |
762 | 0 | BufFileWrite(file, ptr, BLCKSZ); |
763 | 0 | } |