Coverage Report

Created: 2025-08-12 06:43

/src/postgres/src/backend/access/gist/gistbuild.c
/*-------------------------------------------------------------------------
 *
 * gistbuild.c
 *    implementation of the build algorithm for GiST indexes.
 *
 * There are two different strategies:
 *
 * 1. Sort all input tuples, pack them into GiST leaf pages in the sorted
 *    order, and create downlinks and internal pages as we go. This builds
 *    the index from the bottom up, similar to how B-tree index build
 *    works.
 *
 * 2. Start with an empty index, and insert all tuples one by one.
 *
 * The sorted method is used if the operator classes for all columns have
 * a 'sortsupport' defined. Otherwise, we resort to the second strategy.
 *
 * The second strategy can optionally use buffers at different levels of
 * the tree to reduce I/O, see "Buffering build algorithm" in the README
 * for a more detailed explanation. It initially calls insert over and
 * over, but switches to the buffered algorithm after a certain number of
 * tuples (unless buffering mode is disabled).
 *
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/gist/gistbuild.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/genam.h"
#include "access/gist_private.h"
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "miscadmin.h"
#include "nodes/execnodes.h"
#include "optimizer/optimizer.h"
#include "storage/bufmgr.h"
#include "storage/bulk_write.h"

#include "utils/memutils.h"
#include "utils/rel.h"
#include "utils/tuplesort.h"

/* Interval, in index tuples, between checks whether to switch to buffering build mode */
#define BUFFERING_MODE_SWITCH_CHECK_STEP 256

/*
 * Number of tuples to process in the slow way before switching to buffering
 * mode, when buffering is explicitly turned on. Also, the number of tuples
 * to process between readjusting the buffer size parameter, while in
 * buffering mode.
 */
#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096

/*
 * Strategy used to build the index. It can change between the
 * GIST_BUFFERING_* modes on the fly, but if the Sorted method is used,
 * that needs to be decided up-front and cannot be changed afterwards.
 */
typedef enum
{
  GIST_SORTED_BUILD,      /* bottom-up build by sorting */
  GIST_BUFFERING_DISABLED,  /* in regular build mode and aren't going to
                 * switch */
  GIST_BUFFERING_AUTO,    /* in regular build mode, but will switch to
                 * buffering build mode if the index grows too
                 * big */
  GIST_BUFFERING_STATS,   /* gathering statistics of index tuple size
                 * before switching to the buffering build
                 * mode */
  GIST_BUFFERING_ACTIVE,    /* in buffering build mode */
} GistBuildMode;
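
/*
 * Example (illustrative only): the starting mode can be chosen from SQL
 * through the GiST 'buffering' reloption, e.g.
 *
 *   CREATE INDEX ON points_tbl USING gist (p) WITH (buffering = on);
 *
 * 'on' starts the build in GIST_BUFFERING_STATS, 'off' forces
 * GIST_BUFFERING_DISABLED, and the default 'auto' starts as
 * GIST_BUFFERING_AUTO; see the reloption handling in gistbuild() below.
 * (points_tbl and p are hypothetical names.)
 */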

/* Working state for gistbuild and its callback */
typedef struct
{
  Relation  indexrel;
  Relation  heaprel;
  GISTSTATE  *giststate;

  Size    freespace;    /* amount of free space to leave on pages */

  GistBuildMode buildMode;

  int64   indtuples;    /* number of tuples indexed */

  /*
   * Extra data structures used during a buffering build. 'gfbb' contains
   * information related to managing the build buffers. 'parentMap' is a
   * lookup table of the parent of each internal page.
   */
  int64   indtuplesSize;  /* total size of all indexed tuples */
  GISTBuildBuffers *gfbb;
  HTAB     *parentMap;

  /*
   * Extra data structures used during a sorting build.
   */
  Tuplesortstate *sortstate;  /* state data for tuplesort.c */

  BlockNumber pages_allocated;

  BulkWriteState *bulkstate;
} GISTBuildState;

#define GIST_SORTED_BUILD_PAGE_NUM 4

/*
 * In sorted build, we use a stack of these structs, one for each level,
 * to hold an in-memory buffer of the last pages at that level.
 *
 * A sorted GiST build requires a good linearization from the sort opclass,
 * which is not always possible for multidimensional data. To tackle the
 * anomalies, we buffer index tuples and apply picksplit, which can be
 * multidimension-aware.
 */
typedef struct GistSortedBuildLevelState
{
  int     current_page;
  BlockNumber last_blkno;
  struct GistSortedBuildLevelState *parent; /* Upper level, if any */
  Page    pages[GIST_SORTED_BUILD_PAGE_NUM];
} GistSortedBuildLevelState;
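
/*
 * For example, with GIST_SORTED_BUILD_PAGE_NUM = 4 and the default 8 kB
 * BLCKSZ, each level keeps at most 4 * 8 kB = 32 kB of page images in
 * memory, and a level is flushed (and possibly re-split with picksplit)
 * only once all four of its pages have filled up; see
 * gist_indexsortbuild_levelstate_add().
 */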

/* prototypes for private functions */

static void gistSortedBuildCallback(Relation index, ItemPointer tid,
                  Datum *values, bool *isnull,
                  bool tupleIsAlive, void *state);
static void gist_indexsortbuild(GISTBuildState *state);
static void gist_indexsortbuild_levelstate_add(GISTBuildState *state,
                         GistSortedBuildLevelState *levelstate,
                         IndexTuple itup);
static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
                         GistSortedBuildLevelState *levelstate);

static void gistInitBuffering(GISTBuildState *buildstate);
static int  calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
static void gistBuildCallback(Relation index,
                ItemPointer tid,
                Datum *values,
                bool *isnull,
                bool tupleIsAlive,
                void *state);
static void gistBufferingBuildInsert(GISTBuildState *buildstate,
                   IndexTuple itup);
static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
              BlockNumber startblkno, int startlevel);
static BlockNumber gistbufferinginserttuples(GISTBuildState *buildstate,
                       Buffer buffer, int level,
                       IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
                       BlockNumber parentblk, OffsetNumber downlinkoffnum);
static Buffer gistBufferingFindCorrectParent(GISTBuildState *buildstate,
                       BlockNumber childblkno, int level,
                       BlockNumber *parentblkno,
                       OffsetNumber *downlinkoffnum);
static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
static void gistEmptyAllBuffers(GISTBuildState *buildstate);
static int  gistGetMaxLevel(Relation index);

static void gistInitParentMap(GISTBuildState *buildstate);
static void gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child,
                 BlockNumber parent);
static void gistMemorizeAllDownlinks(GISTBuildState *buildstate,
                   Buffer parentbuf);
static BlockNumber gistGetParent(GISTBuildState *buildstate, BlockNumber child);


/*
 * Main entry point to GiST index build.
 */
IndexBuildResult *
gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
  IndexBuildResult *result;
  double    reltuples;
  GISTBuildState buildstate;
  MemoryContext oldcxt = CurrentMemoryContext;
  int     fillfactor;
  Oid     SortSupportFnOids[INDEX_MAX_KEYS];
  GiSTOptions *options = (GiSTOptions *) index->rd_options;

  /*
   * We expect to be called exactly once for any index relation. If that's
   * not the case, big trouble's what we have.
   */
  if (RelationGetNumberOfBlocks(index) != 0)
    elog(ERROR, "index \"%s\" already contains data",
       RelationGetRelationName(index));

  buildstate.indexrel = index;
  buildstate.heaprel = heap;
  buildstate.sortstate = NULL;
  buildstate.giststate = initGISTstate(index);

  /*
   * Create a temporary memory context that is reset once for each tuple
   * processed.  (Note: we don't bother to make this a child of the
   * giststate's scanCxt, so we have to delete it separately at the end.)
   */
  buildstate.giststate->tempCxt = createTempGistContext();

  /*
   * Choose build strategy.  First check whether the user specified to use
   * buffering mode.  (The use-case for that in the field is somewhat
   * questionable perhaps, but it's important for testing purposes.)
   */
  if (options)
  {
    if (options->buffering_mode == GIST_OPTION_BUFFERING_ON)
      buildstate.buildMode = GIST_BUFFERING_STATS;
    else if (options->buffering_mode == GIST_OPTION_BUFFERING_OFF)
      buildstate.buildMode = GIST_BUFFERING_DISABLED;
    else          /* must be "auto" */
      buildstate.buildMode = GIST_BUFFERING_AUTO;
  }
  else
  {
    buildstate.buildMode = GIST_BUFFERING_AUTO;
  }

  /*
   * Unless buffering mode was forced, see if we can use sorting instead.
   */
  if (buildstate.buildMode != GIST_BUFFERING_STATS)
  {
    bool    hasallsortsupports = true;
    int     keyscount = IndexRelationGetNumberOfKeyAttributes(index);

    for (int i = 0; i < keyscount; i++)
    {
      SortSupportFnOids[i] = index_getprocid(index, i + 1,
                           GIST_SORTSUPPORT_PROC);
      if (!OidIsValid(SortSupportFnOids[i]))
      {
        hasallsortsupports = false;
        break;
      }
    }
    if (hasallsortsupports)
      buildstate.buildMode = GIST_SORTED_BUILD;
  }
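
  /*
   * For example, the in-core point_ops opclass provides such a support
   * function (gist_point_sortsupport, which orders points along a
   * space-filling curve), so a single-column index on a point column
   * qualifies for the sorted build. (Example for illustration; pg_amproc
   * is the authoritative list of sortsupport providers.)
   */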

  /*
   * Calculate target amount of free space to leave on pages.
   */
  fillfactor = options ? options->fillfactor : GIST_DEFAULT_FILLFACTOR;
  buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
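
  /*
   * For example, with the default GIST_DEFAULT_FILLFACTOR of 90 and an
   * 8192-byte BLCKSZ, this reserves 8192 * 10 / 100 = 819 bytes of free
   * space on every page for future insertions.
   */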

  /*
   * Build the index using the chosen strategy.
   */
  buildstate.indtuples = 0;
  buildstate.indtuplesSize = 0;

  if (buildstate.buildMode == GIST_SORTED_BUILD)
  {
    /*
     * Sort all data, build the index from bottom up.
     */
    buildstate.sortstate = tuplesort_begin_index_gist(heap,
                              index,
                              maintenance_work_mem,
                              NULL,
                              TUPLESORT_NONE);

    /* Scan the table, adding all tuples to the tuplesort */
    reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                       gistSortedBuildCallback,
                       &buildstate, NULL);

    /*
     * Perform the sort and build index pages.
     */
    tuplesort_performsort(buildstate.sortstate);

    gist_indexsortbuild(&buildstate);

    tuplesort_end(buildstate.sortstate);
  }
  else
  {
    /*
     * Initialize an empty index and insert all tuples, possibly using
     * buffers on intermediate levels.
     */
    Buffer    buffer;
    Page    page;

    /* initialize the root page */
    buffer = gistNewBuffer(index, heap);
    Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
    page = BufferGetPage(buffer);

    START_CRIT_SECTION();

    GISTInitBuffer(buffer, F_LEAF);

    MarkBufferDirty(buffer);
    PageSetLSN(page, GistBuildLSN);

    UnlockReleaseBuffer(buffer);

    END_CRIT_SECTION();

    /* Scan the table, inserting all the tuples to the index. */
    reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                       gistBuildCallback,
                       &buildstate, NULL);

    /*
     * If buffering was used, flush out all the tuples that are still in
     * the buffers.
     */
    if (buildstate.buildMode == GIST_BUFFERING_ACTIVE)
    {
      elog(DEBUG1, "all tuples processed, emptying buffers");
      gistEmptyAllBuffers(&buildstate);
      gistFreeBuildBuffers(buildstate.gfbb);
    }

    /*
     * We didn't write WAL records as we built the index, so if
     * WAL-logging is required, write all pages to the WAL now.
     */
    if (RelationNeedsWAL(index))
    {
      log_newpage_range(index, MAIN_FORKNUM,
                0, RelationGetNumberOfBlocks(index),
                true);
    }
  }

  /* okay, all heap tuples are indexed */
  MemoryContextSwitchTo(oldcxt);
  MemoryContextDelete(buildstate.giststate->tempCxt);

  freeGISTstate(buildstate.giststate);

  /*
   * Return statistics
   */
  result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

  result->heap_tuples = reltuples;
  result->index_tuples = (double) buildstate.indtuples;

  return result;
}

/*-------------------------------------------------------------------------
 * Routines for sorted build
 *-------------------------------------------------------------------------
 */

/*
 * Per-tuple callback for table_index_build_scan.
 */
static void
gistSortedBuildCallback(Relation index,
            ItemPointer tid,
            Datum *values,
            bool *isnull,
            bool tupleIsAlive,
            void *state)
{
  GISTBuildState *buildstate = (GISTBuildState *) state;
  MemoryContext oldCtx;
  Datum   compressed_values[INDEX_MAX_KEYS];

  oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

  /* Form an index tuple and point it at the heap tuple */
  gistCompressValues(buildstate->giststate, index,
             values, isnull,
             true, compressed_values);

  tuplesort_putindextuplevalues(buildstate->sortstate,
                  buildstate->indexrel,
                  tid,
                  compressed_values, isnull);

  MemoryContextSwitchTo(oldCtx);
  MemoryContextReset(buildstate->giststate->tempCxt);

  /* Update tuple count. */
  buildstate->indtuples += 1;
}

/*
 * Build GiST index from bottom up from pre-sorted tuples.
 */
static void
gist_indexsortbuild(GISTBuildState *state)
{
  IndexTuple  itup;
  GistSortedBuildLevelState *levelstate;
  BulkWriteBuffer rootbuf;

  /* Reserve block 0 for the root page */
  state->pages_allocated = 1;

  state->bulkstate = smgr_bulk_start_rel(state->indexrel, MAIN_FORKNUM);

  /* Allocate a temporary buffer for the first leaf page batch. */
  levelstate = palloc0(sizeof(GistSortedBuildLevelState));
  levelstate->pages[0] = palloc(BLCKSZ);
  levelstate->parent = NULL;
  gistinitpage(levelstate->pages[0], F_LEAF);

  /*
   * Fill index pages with tuples in the sorted order.
   */
  while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL)
  {
    gist_indexsortbuild_levelstate_add(state, levelstate, itup);
    MemoryContextReset(state->giststate->tempCxt);
  }

  /*
   * Write out the partially full non-root pages.
   *
   * Keep in mind that the flush can build a new root: if the number of
   * pages at a level is greater than one, a new root is required above it.
   */
  while (levelstate->parent != NULL || levelstate->current_page != 0)
  {
    GistSortedBuildLevelState *parent;

    gist_indexsortbuild_levelstate_flush(state, levelstate);
    parent = levelstate->parent;
    for (int i = 0; i < GIST_SORTED_BUILD_PAGE_NUM; i++)
      if (levelstate->pages[i])
        pfree(levelstate->pages[i]);
    pfree(levelstate);
    levelstate = parent;
  }

  /* Write out the root */
  PageSetLSN(levelstate->pages[0], GistBuildLSN);
  rootbuf = smgr_bulk_get_buf(state->bulkstate);
  memcpy(rootbuf, levelstate->pages[0], BLCKSZ);
  smgr_bulk_write(state->bulkstate, GIST_ROOT_BLKNO, rootbuf, true);

  pfree(levelstate);

  smgr_bulk_finish(state->bulkstate);
}

/*
 * Add a tuple to a page. If the pages are full, write them out and
 * re-initialize a new page first.
 */
static void
gist_indexsortbuild_levelstate_add(GISTBuildState *state,
                   GistSortedBuildLevelState *levelstate,
                   IndexTuple itup)
{
  Size    sizeNeeded;

  /* Check if tuple can be added to the current page */
  sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData); /* fillfactor ignored */
  if (PageGetFreeSpace(levelstate->pages[levelstate->current_page]) < sizeNeeded)
  {
    Page    newPage;
    Page    old_page = levelstate->pages[levelstate->current_page];
    uint16    old_page_flags = GistPageGetOpaque(old_page)->flags;

    if (levelstate->current_page + 1 == GIST_SORTED_BUILD_PAGE_NUM)
    {
      gist_indexsortbuild_levelstate_flush(state, levelstate);
    }
    else
      levelstate->current_page++;

    if (levelstate->pages[levelstate->current_page] == NULL)
      levelstate->pages[levelstate->current_page] = palloc0(BLCKSZ);

    newPage = levelstate->pages[levelstate->current_page];
    gistinitpage(newPage, old_page_flags);
  }

  gistfillbuffer(levelstate->pages[levelstate->current_page], &itup, 1, InvalidOffsetNumber);
}

static void
gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
                   GistSortedBuildLevelState *levelstate)
{
  GistSortedBuildLevelState *parent;
  BlockNumber blkno;
  MemoryContext oldCtx;
  IndexTuple  union_tuple;
  SplitPageLayout *dist;
  IndexTuple *itvec;
  int     vect_len;
  bool    isleaf = GistPageIsLeaf(levelstate->pages[0]);

  CHECK_FOR_INTERRUPTS();

  oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);

  /* Get index tuples from first page */
  itvec = gistextractpage(levelstate->pages[0], &vect_len);
  if (levelstate->current_page > 0)
  {
    /* Append tuples from each page */
    for (int i = 1; i < levelstate->current_page + 1; i++)
    {
      int     len_local;
      IndexTuple *itvec_local = gistextractpage(levelstate->pages[i], &len_local);

      itvec = gistjoinvector(itvec, &vect_len, itvec_local, len_local);
      pfree(itvec_local);
    }

    /* Apply picksplit to the list of all collected tuples */
    dist = gistSplit(state->indexrel, levelstate->pages[0], itvec, vect_len, state->giststate);
  }
  else
  {
    /* Create split layout from single page */
    dist = (SplitPageLayout *) palloc0(sizeof(SplitPageLayout));
    union_tuple = gistunion(state->indexrel, itvec, vect_len,
                state->giststate);
    dist->itup = union_tuple;
    dist->list = gistfillitupvec(itvec, vect_len, &(dist->lenlist));
    dist->block.num = vect_len;
  }

  MemoryContextSwitchTo(oldCtx);

  /* Reset page counter */
  levelstate->current_page = 0;

  /* Create pages for all partitions in the split result */
  for (; dist != NULL; dist = dist->next)
  {
    char     *data;
    BulkWriteBuffer buf;
    Page    target;

    /* check once per page */
    CHECK_FOR_INTERRUPTS();

    /* Create page and copy data */
    data = (char *) (dist->list);
    buf = smgr_bulk_get_buf(state->bulkstate);
    target = (Page) buf;
    gistinitpage(target, isleaf ? F_LEAF : 0);
    for (int i = 0; i < dist->block.num; i++)
    {
      IndexTuple  thistup = (IndexTuple) data;

      if (PageAddItem(target, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
        elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->indexrel));

      data += IndexTupleSize(thistup);
    }
    union_tuple = dist->itup;

    /*
     * Set the right link to point to the previous page. This is just for
     * debugging purposes: GiST only follows the right link if a page is
     * split concurrently to a scan, and that cannot happen during index
     * build.
     *
     * It's a bit counterintuitive that we set the right link on the new
     * page to point to the previous page, not the other way around. But
     * GiST pages are not ordered like B-tree pages are, so as long as the
     * right-links form a chain through all the pages at the same level,
     * the order doesn't matter.
     */
    if (levelstate->last_blkno)
      GistPageGetOpaque(target)->rightlink = levelstate->last_blkno;

    /*
     * The page is now complete. Assign a block number to it, and pass it
     * to the bulk writer.
     */
    blkno = state->pages_allocated++;
    PageSetLSN(target, GistBuildLSN);
    smgr_bulk_write(state->bulkstate, blkno, buf, true);
    ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
    levelstate->last_blkno = blkno;

    /*
     * Insert the downlink to the parent page. If this was the root,
     * create a new page as the parent, which becomes the new root.
     */
    parent = levelstate->parent;
    if (parent == NULL)
    {
      parent = palloc0(sizeof(GistSortedBuildLevelState));
      parent->pages[0] = palloc(BLCKSZ);
      parent->parent = NULL;
      gistinitpage(parent->pages[0], 0);

      levelstate->parent = parent;
    }
    gist_indexsortbuild_levelstate_add(state, parent, union_tuple);
  }
}


/*-------------------------------------------------------------------------
 * Routines for non-sorted build
 *-------------------------------------------------------------------------
 */

/*
 * Attempt to switch to buffering mode.
 *
 * If there is not enough memory for buffering build, sets bufferingMode
 * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
 * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
 * GIST_BUFFERING_ACTIVE.
 */
static void
gistInitBuffering(GISTBuildState *buildstate)
{
  Relation  index = buildstate->indexrel;
  int     pagesPerBuffer;
  Size    pageFreeSpace;
  Size    itupAvgSize,
        itupMinSize;
  double    avgIndexTuplesPerPage,
        maxIndexTuplesPerPage;
  int     i;
  int     levelStep;

  /* Calculate the space on an index page that is available for index tuples */
  pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
    - sizeof(ItemIdData)
    - buildstate->freespace;

  /*
   * Calculate the average size of already inserted index tuples using
   * gathered statistics.
   */
  itupAvgSize = (double) buildstate->indtuplesSize /
    (double) buildstate->indtuples;

  /*
   * Calculate the minimal possible size of an index tuple from the index
   * metadata. The minimal possible size of a varlena is VARHDRSZ.
   *
   * XXX: that's not actually true, as a short varlena can be just 2 bytes.
   * And we should take padding into account here.
   */
  itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
  for (i = 0; i < index->rd_att->natts; i++)
  {
    CompactAttribute *attr = TupleDescCompactAttr(index->rd_att, i);

    if (attr->attlen < 0)
      itupMinSize += VARHDRSZ;
    else
      itupMinSize += attr->attlen;
  }

  /* Calculate the average and maximal number of index tuples that fit on a page */
  avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
  maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;

  /*
   * We need to calculate two parameters for the buffering algorithm:
   * levelStep and pagesPerBuffer.
   *
   * levelStep determines the size of subtree that we operate on, while
   * emptying a buffer. A higher value is better, as you need fewer buffer
   * emptying steps to build the index. However, if you set it too high, the
   * subtree doesn't fit in cache anymore, and you quickly lose the benefit
   * of the buffers.
   *
   * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
   * the number of tuples on a page (ie. fanout), and M is the amount of
   * internal memory available. Curiously, they don't explain *why* that
   * setting is optimal. We calculate it by taking the highest levelStep so
   * that a subtree still fits in cache. For a small B, our way of
   * calculating levelStep is very close to Arge et al's formula. For a
   * large B, our formula gives a value that is 2x higher.
   *
   * The average size (in pages) of a subtree of depth n can be calculated
   * as a geometric series:
   *
   * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
   *
   * where B is the average number of index tuples on a page. The subtree is
   * cached in the shared buffer cache and the OS cache, so we choose
   * levelStep so that the subtree size is comfortably smaller than
   * effective_cache_size, with a safety factor of 4.
   *
   * The estimate on the average number of index tuples on a page is based
   * on the average tuple sizes observed before switching to buffered build,
   * so the real subtree size can be somewhat larger. Also, it would be
   * selfish to gobble the whole cache for our index build. The safety
   * factor of 4 should account for those effects.
   *
   * The other limiting factor for setting levelStep is that while
   * processing a subtree, we need to hold one page for each buffer at the
   * next lower buffered level. The max. number of buffers needed for that
   * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
   * hopefully maintenance_work_mem is set high enough that you're
   * constrained by effective_cache_size rather than maintenance_work_mem.
   *
   * XXX: the buffer hash table consumes a fair amount of memory too per
   * buffer, but that is not currently taken into account. That scales on
   * the total number of buffers used, ie. the index size and on levelStep.
   * Note that a higher levelStep *reduces* the amount of memory needed for
   * the hash table.
   */
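
  /*
   * Worked example of the bounds above: with avgIndexTuplesPerPage = 100,
   * levelStep = 2 gives an average subtree of (1 - 100^3) / (1 - 100) =
   * 10101 pages, about 79 MB with 8 kB pages. That levelStep is accepted
   * only if 10101 pages stay under effective_cache_size / 4 and the up to
   * maxIndexTuplesPerPage^2 lowest-level pages fit in maintenance_work_mem.
   */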
  levelStep = 1;
  for (;;)
  {
    double    subtreesize;
    double    maxlowestlevelpages;

    /* size of an average subtree at this levelStep (in pages). */
    subtreesize =
      (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) /
      (1 - avgIndexTuplesPerPage);

    /* max number of pages at the lowest level of a subtree */
    maxlowestlevelpages = pow(maxIndexTuplesPerPage, (double) levelStep);

    /* subtree must fit in cache (with safety factor of 4) */
    if (subtreesize > effective_cache_size / 4)
      break;

    /* each node in the lowest level of a subtree has one page in memory */
    if (maxlowestlevelpages > ((double) maintenance_work_mem * 1024) / BLCKSZ)
      break;

    /* Good, we can handle this levelStep. See if we can go one higher. */
    levelStep++;
  }

  /*
   * The previous loop ended when levelStep reached an unacceptable value,
   * so decrease levelStep to get back to the last acceptable value.
   */
  levelStep--;

  /*
   * If there's not enough cache or maintenance_work_mem, fall back to plain
   * inserts.
   */
  if (levelStep <= 0)
  {
    elog(DEBUG1, "failed to switch to buffered GiST build");
    buildstate->buildMode = GIST_BUFFERING_DISABLED;
    return;
  }

  /*
   * The second parameter to set is pagesPerBuffer, which determines the
   * size of each buffer. We adjust pagesPerBuffer also during the build,
   * which is why this calculation is in a separate function.
   */
  pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);

  /* Initialize GISTBuildBuffers with these parameters */
  buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
                      gistGetMaxLevel(index));

  gistInitParentMap(buildstate);

  buildstate->buildMode = GIST_BUFFERING_ACTIVE;

  elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
     levelStep, pagesPerBuffer);
}

/*
 * Calculate the pagesPerBuffer parameter for the buffering algorithm.
 *
 * Buffer size is chosen so that, assuming that tuples are distributed
 * randomly, emptying half a buffer fills on average one page in every buffer
 * at the next lower level.
 */
static int
calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
{
  double    pagesPerBuffer;
  double    avgIndexTuplesPerPage;
  double    itupAvgSize;
  Size    pageFreeSpace;

  /* Calculate the space on an index page that is available for index tuples */
  pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
    - sizeof(ItemIdData)
    - buildstate->freespace;

  /*
   * Calculate the average size of already inserted index tuples using
   * gathered statistics.
   */
  itupAvgSize = (double) buildstate->indtuplesSize /
    (double) buildstate->indtuples;

  avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;

  /*
   * Recalculate the required size of the buffers.
   */
  pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
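
  /*
   * Sanity check of the formula: emptying half a buffer releases
   * pow(B, levelStep) pages' worth of tuples, where B is
   * avgIndexTuplesPerPage. Spread evenly over the roughly B^levelStep
   * buffers one buffered level below, that is about B tuples, i.e. one
   * page's worth, per lower-level buffer. E.g. with B = 100 and
   * levelStep = 1, pagesPerBuffer = 2 * 100 = 200 pages.
   */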

  return (int) rint(pagesPerBuffer);
}

/*
 * Per-tuple callback for table_index_build_scan.
 */
static void
gistBuildCallback(Relation index,
          ItemPointer tid,
          Datum *values,
          bool *isnull,
          bool tupleIsAlive,
          void *state)
{
  GISTBuildState *buildstate = (GISTBuildState *) state;
  IndexTuple  itup;
  MemoryContext oldCtx;

  oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

  /* form an index tuple and point it at the heap tuple */
  itup = gistFormTuple(buildstate->giststate, index,
             values, isnull,
             true);
  itup->t_tid = *tid;

  /* Update tuple count and total size. */
  buildstate->indtuples += 1;
  buildstate->indtuplesSize += IndexTupleSize(itup);

  /*
   * XXX In buffering builds, the tempCxt is also reset down inside
   * gistProcessEmptyingQueue().  This is not great because it risks
   * confusion and possible use of dangling pointers (for example, itup
   * might be already freed when control returns here).  It's generally
   * better that a memory context be "owned" by only one function.  However,
   * currently this isn't causing issues so it doesn't seem worth the amount
   * of refactoring that would be needed to avoid it.
   */
  if (buildstate->buildMode == GIST_BUFFERING_ACTIVE)
  {
    /* We have buffers, so use them. */
    gistBufferingBuildInsert(buildstate, itup);
  }
  else
  {
    /*
     * There are no buffers (yet). Since we already have the index
     * relation locked, we call gistdoinsert directly.
     */
    gistdoinsert(index, itup, buildstate->freespace,
           buildstate->giststate, buildstate->heaprel, true);
  }

  MemoryContextSwitchTo(oldCtx);
  MemoryContextReset(buildstate->giststate->tempCxt);

  if (buildstate->buildMode == GIST_BUFFERING_ACTIVE &&
    buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
  {
    /* Adjust the target buffer size now */
    buildstate->gfbb->pagesPerBuffer =
      calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
  }

  /*
   * In 'auto' mode, check if the index has grown too large to fit in cache,
   * and switch to buffering mode if it has.
   *
   * To avoid excessive calls to smgrnblocks(), only check this every
   * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples.
   *
   * In 'stats' state, switch as soon as we have seen enough tuples to have
   * some idea of the average tuple size.
   */
  if ((buildstate->buildMode == GIST_BUFFERING_AUTO &&
     buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
     effective_cache_size < smgrnblocks(RelationGetSmgr(index),
                      MAIN_FORKNUM)) ||
    (buildstate->buildMode == GIST_BUFFERING_STATS &&
     buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
  {
    /*
     * The index doesn't fit in the effective cache anymore. Try to
     * switch to buffering build mode.
     */
    gistInitBuffering(buildstate);
  }
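
  /*
   * To put the auto-switch threshold in numbers: with 8 kB pages and
   * effective_cache_size = 4 GB (524288 pages), the switch to buffering
   * is attempted once the index exceeds 4 GB, and the smgrnblocks() test
   * above runs only on every 256th tuple
   * (BUFFERING_MODE_SWITCH_CHECK_STEP).
   */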
}

/*
 * Insert function for buffering index build.
 */
static void
gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
{
  /* Insert the tuple to buffers. */
  gistProcessItup(buildstate, itup, 0, buildstate->gfbb->rootlevel);

  /* If we filled up (half of a) buffer, process buffer emptying. */
  gistProcessEmptyingQueue(buildstate);
}

/*
 * Process an index tuple. Runs the tuple down the tree until we reach a leaf
 * page or node buffer, and inserts the tuple there. Returns true if we have
 * to stop the buffer emptying process (because one of the child buffers
 * can't take index tuples anymore).
 */
static bool
gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
        BlockNumber startblkno, int startlevel)
{
  GISTSTATE  *giststate = buildstate->giststate;
  GISTBuildBuffers *gfbb = buildstate->gfbb;
  Relation  indexrel = buildstate->indexrel;
  BlockNumber childblkno;
  Buffer    buffer;
  bool    result = false;
  BlockNumber blkno;
  int     level;
  OffsetNumber downlinkoffnum = InvalidOffsetNumber;
  BlockNumber parentblkno = InvalidBlockNumber;

  CHECK_FOR_INTERRUPTS();

  /*
   * Loop until we reach a leaf page (level == 0) or a level with buffers
   * (not including the level we start at, because we would otherwise make
   * no progress).
   */
  blkno = startblkno;
  level = startlevel;
  for (;;)
  {
    ItemId    iid;
    IndexTuple  idxtuple,
          newtup;
    Page    page;
    OffsetNumber childoffnum;

    /* Have we reached a level with buffers? */
    if (LEVEL_HAS_BUFFERS(level, gfbb) && level != startlevel)
      break;

    /* Have we reached a leaf page? */
    if (level == 0)
      break;

    /*
     * Nope. Descend down to the next level then. Choose a child to
     * descend down to.
     */

    buffer = ReadBuffer(indexrel, blkno);
    LockBuffer(buffer, GIST_EXCLUSIVE);

    page = (Page) BufferGetPage(buffer);
    childoffnum = gistchoose(indexrel, page, itup, giststate);
    iid = PageGetItemId(page, childoffnum);
    idxtuple = (IndexTuple) PageGetItem(page, iid);
    childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

    if (level > 1)
      gistMemorizeParent(buildstate, childblkno, blkno);

    /*
     * Check that the key representing the target child node is consistent
     * with the key we're inserting. Update it if it's not.
     */
    newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
    if (newtup)
    {
      blkno = gistbufferinginserttuples(buildstate,
                        buffer,
                        level,
                        &newtup,
                        1,
                        childoffnum,
                        InvalidBlockNumber,
                        InvalidOffsetNumber);
      /* gistbufferinginserttuples() released the buffer */
    }
    else
      UnlockReleaseBuffer(buffer);

    /* Descend to the child */
    parentblkno = blkno;
    blkno = childblkno;
    downlinkoffnum = childoffnum;
    Assert(level > 0);
    level--;
  }

  if (LEVEL_HAS_BUFFERS(level, gfbb))
  {
    /*
     * We've reached a level with buffers. Place the index tuple in the
     * buffer, and add the buffer to the emptying queue if it overflows.
     */
    GISTNodeBuffer *childNodeBuffer;

    /* Find the buffer or create a new one */
    childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, blkno, level);

    /* Add index tuple to it */
    gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);

    if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
      result = true;
  }
  else
  {
    /*
     * We've reached a leaf page. Place the tuple here.
     */
    Assert(level == 0);
    buffer = ReadBuffer(indexrel, blkno);
    LockBuffer(buffer, GIST_EXCLUSIVE);
    gistbufferinginserttuples(buildstate, buffer, level,
                  &itup, 1, InvalidOffsetNumber,
                  parentblkno, downlinkoffnum);
    /* gistbufferinginserttuples() released the buffer */
  }

  return result;
}

/*
 * Insert tuples to a given page.
 *
 * This is analogous with gistinserttuples() in the regular insertion code.
 *
 * Returns the block number of the page where the (first) new or updated tuple
 * was inserted. Usually that's the original page, but might be a sibling page
 * if the original page was split.
 *
 * Caller should hold a lock on 'buffer' on entry. This function will unlock
 * and unpin it.
 */
static BlockNumber
gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer, int level,
              IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
              BlockNumber parentblk, OffsetNumber downlinkoffnum)
{
  GISTBuildBuffers *gfbb = buildstate->gfbb;
  List     *splitinfo;
  bool    is_split;
  BlockNumber placed_to_blk = InvalidBlockNumber;

  is_split = gistplacetopage(buildstate->indexrel,
                 buildstate->freespace,
                 buildstate->giststate,
                 buffer,
                 itup, ntup, oldoffnum, &placed_to_blk,
                 InvalidBuffer,
                 &splitinfo,
                 false,
                 buildstate->heaprel, true);

  /*
   * If this is a root split, update the root path item kept in memory. This
   * ensures that all path stacks are always complete, including all parent
   * nodes up to the root. That simplifies the algorithm to re-find the
   * correct parent.
   */
  if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
  {
    Page    page = BufferGetPage(buffer);
    OffsetNumber off;
    OffsetNumber maxoff;

    Assert(level == gfbb->rootlevel);
    gfbb->rootlevel++;

    elog(DEBUG2, "splitting GiST root page, now %d levels deep", gfbb->rootlevel);

    /*
     * All the downlinks on the old root page are now on one of the child
     * pages. Visit all the new child pages to memorize the parents of the
     * grandchildren.
     */
    if (gfbb->rootlevel > 1)
    {
      maxoff = PageGetMaxOffsetNumber(page);
      for (off = FirstOffsetNumber; off <= maxoff; off++)
      {
        ItemId    iid = PageGetItemId(page, off);
        IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
        BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
        Buffer    childbuf = ReadBuffer(buildstate->indexrel, childblkno);

        LockBuffer(childbuf, GIST_SHARE);
        gistMemorizeAllDownlinks(buildstate, childbuf);
        UnlockReleaseBuffer(childbuf);

        /*
         * Also remember that the parent of the new child page is the
         * root block.
         */
        gistMemorizeParent(buildstate, childblkno, GIST_ROOT_BLKNO);
      }
    }
  }

  if (splitinfo)
  {
    /*
     * Insert the downlinks to the parent. This is analogous with
     * gistfinishsplit() in the regular insertion code, but the locking is
     * simpler, and we have to maintain the buffers on internal nodes and
     * the parent map.
     */
    IndexTuple *downlinks;
    int     ndownlinks,
          i;
    Buffer    parentBuffer;
    ListCell   *lc;

    /* Parent may have changed since we memorized this path. */
    parentBuffer =
      gistBufferingFindCorrectParent(buildstate,
                       BufferGetBlockNumber(buffer),
                       level,
                       &parentblk,
                       &downlinkoffnum);

    /*
     * If there's a buffer associated with this page, that needs to be
     * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
     * downlinks in 'splitinfo', to make sure they're consistent not only
     * with the tuples already on the pages, but also the tuples in the
     * buffers that will eventually be inserted to them.
     */
    gistRelocateBuildBuffersOnSplit(gfbb,
                    buildstate->giststate,
                    buildstate->indexrel,
                    level,
                    buffer, splitinfo);

    /* Create an array of all the downlink tuples */
    ndownlinks = list_length(splitinfo);
    downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
    i = 0;
    foreach(lc, splitinfo)
    {
      GISTPageSplitInfo *splitinfo = lfirst(lc);

      /*
       * Remember the parent of each new child page in our parent map.
       * This assumes that the downlinks fit on the parent page. If the
       * parent page is split, too, when we recurse up to insert the
       * downlinks, the recursive gistbufferinginserttuples() call will
       * update the map again.
       */
      if (level > 0)
        gistMemorizeParent(buildstate,
                   BufferGetBlockNumber(splitinfo->buf),
                   BufferGetBlockNumber(parentBuffer));

      /*
       * Also update the parent map for all the downlinks that got moved
       * to a different page. (actually this also loops through the
       * downlinks that stayed on the original page, but it does no
       * harm).
       */
      if (level > 1)
        gistMemorizeAllDownlinks(buildstate, splitinfo->buf);

      /*
       * Since there's no concurrent access, we can release the lower
       * level buffers immediately. This includes the original page.
       */
      UnlockReleaseBuffer(splitinfo->buf);
      downlinks[i++] = splitinfo->downlink;
    }

    /* Insert them into parent. */
    gistbufferinginserttuples(buildstate, parentBuffer, level + 1,
                  downlinks, ndownlinks, downlinkoffnum,
                  InvalidBlockNumber, InvalidOffsetNumber);

    list_free_deep(splitinfo);  /* we don't need this anymore */
  }
  else
    UnlockReleaseBuffer(buffer);

  return placed_to_blk;
}

/*
 * Find the downlink pointing to a child page.
 *
 * 'childblkno' indicates the child page to find the parent for. 'level' is
 * the level of the child. On entry, *parentblkno and *downlinkoffnum can
 * point to a location where the downlink used to be - we will check that
 * location first, and save some cycles if it hasn't moved. The function
 * returns a buffer containing the downlink, exclusively-locked, and
 * *parentblkno and *downlinkoffnum are set to the real location of the
 * downlink.
 *
 * If the child page is a leaf (level == 0), the caller must supply a correct
 * parentblkno. Otherwise we use the parent map hash table to find the parent
 * block.
 *
 * This function serves the same purpose as gistFindCorrectParent() during
 * normal index inserts, but this is simpler because we don't need to deal
 * with concurrent inserts.
 */
static Buffer
gistBufferingFindCorrectParent(GISTBuildState *buildstate,
                 BlockNumber childblkno, int level,
                 BlockNumber *parentblkno,
                 OffsetNumber *downlinkoffnum)
{
  BlockNumber parent;
  Buffer    buffer;
  Page    page;
  OffsetNumber maxoff;
  OffsetNumber off;

  if (level > 0)
    parent = gistGetParent(buildstate, childblkno);
  else
  {
    /*
     * For a leaf page, the caller must supply a correct parent block
     * number.
     */
    if (*parentblkno == InvalidBlockNumber)
      elog(ERROR, "no parent buffer provided of child %u", childblkno);
    parent = *parentblkno;
  }

  buffer = ReadBuffer(buildstate->indexrel, parent);
  page = BufferGetPage(buffer);
  LockBuffer(buffer, GIST_EXCLUSIVE);
  gistcheckpage(buildstate->indexrel, buffer);
  maxoff = PageGetMaxOffsetNumber(page);

  /* Check if it was not moved */
  if (parent == *parentblkno && *parentblkno != InvalidBlockNumber &&
    *downlinkoffnum != InvalidOffsetNumber && *downlinkoffnum <= maxoff)
  {
    ItemId    iid = PageGetItemId(page, *downlinkoffnum);
    IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);

    if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
    {
      /* Still there */
      return buffer;
    }
  }

  /*
   * Downlink was not at the offset where it used to be. Scan the page to
   * find it. During normal gist insertions, it might've moved to another
   * page, to the right, but during a buffering build, we keep track of the
   * parent of each page in the lookup table so we should always know what
   * page it's on.
   */
  for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
  {
    ItemId    iid = PageGetItemId(page, off);
    IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);

    if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == childblkno)
    {
      /* yes!!, found it */
      *downlinkoffnum = off;
      return buffer;
    }
  }

  elog(ERROR, "failed to re-find parent for block %u", childblkno);
  return InvalidBuffer;   /* keep compiler quiet */
}

/*
 * Process the buffer emptying stack. Emptying of one buffer can cause
 * emptying of other buffers. This function iterates until this cascading
 * emptying process is finished, i.e. until the buffer emptying stack is
 * empty.
 */
static void
gistProcessEmptyingQueue(GISTBuildState *buildstate)
{
  GISTBuildBuffers *gfbb = buildstate->gfbb;

  /* Iterate while we have elements in the buffer emptying stack. */
  while (gfbb->bufferEmptyingQueue != NIL)
  {
    GISTNodeBuffer *emptyingNodeBuffer;

    /* Get a node buffer from the emptying stack. */
    emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
    gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
    emptyingNodeBuffer->queuedForEmptying = false;

    /*
     * We are going to load the last pages of the buffers that we are
     * emptying into, so unload any previously loaded buffers first.
     */
    gistUnloadNodeBuffers(gfbb);

    /*
     * Pop tuples from the buffer and run them down to the buffers at the
     * lower level, or leaf pages. We continue until one of the lower
     * level buffers fills up, or this buffer runs empty.
     *
     * In Arge et al's paper, the buffer emptying is stopped after
     * processing 1/2 node buffer worth of tuples, to avoid overfilling
     * any of the lower level buffers. However, it's more efficient to
     * keep going until one of the lower level buffers actually fills up,
     * so that's what we do. This doesn't need to be exact; if a buffer
     * overfills by a few tuples, there's no harm done.
     */
    while (true)
    {
      IndexTuple  itup;

      /* Get the next index tuple from the buffer */
      if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
        break;

      /*
       * Run it down to the underlying node buffer or leaf page.
       *
       * Note: it's possible that the buffer we're emptying splits as a
       * result of this call. If that happens, our emptyingNodeBuffer
       * points to the left half of the split. After the split, it's very
       * likely that the new left buffer is no longer over the half-full
       * threshold, but we might as well keep flushing tuples from it
       * until we fill a lower-level buffer.
       */
      if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->nodeBlocknum, emptyingNodeBuffer->level))
      {
        /*
         * A lower level buffer filled up. Stop emptying this buffer,
         * to avoid overflowing the lower level buffer.
         */
        break;
      }

      /* Free all the memory allocated during index tuple processing */
      MemoryContextReset(buildstate->giststate->tempCxt);
    }
  }
}

/*
 * Empty all node buffers, from top to bottom. This is done at the end of
 * index build to flush all remaining tuples to the index.
 *
 * Note: this destroys the buffersOnLevels lists, so no more tuples should
 * be inserted into the buffers after this call.
 */
static void
gistEmptyAllBuffers(GISTBuildState *buildstate)
{
  GISTBuildBuffers *gfbb = buildstate->gfbb;
  MemoryContext oldCtx;
  int     i;

  oldCtx = MemoryContextSwitchTo(buildstate->giststate->tempCxt);

  /*
   * Iterate through the levels from top to bottom.
   */
  for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
  {
    /*
     * Empty all buffers on this level. Note that new buffers can pop up
     * in the list during the processing, as a result of page splits, so a
     * simple walk through the list won't work. We remove buffers from the
     * list when we see them empty; a buffer can't become non-empty once
     * it's been fully emptied.
     */
    while (gfbb->buffersOnLevels[i] != NIL)
    {
      GISTNodeBuffer *nodeBuffer;

      nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);

      if (nodeBuffer->blocksCount != 0)
      {
        /*
         * Add this buffer to the emptying queue, and proceed to empty
         * the queue.
         */
        if (!nodeBuffer->queuedForEmptying)
        {
          MemoryContextSwitchTo(gfbb->context);
          nodeBuffer->queuedForEmptying = true;
          gfbb->bufferEmptyingQueue =
            lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
          MemoryContextSwitchTo(buildstate->giststate->tempCxt);
        }
        gistProcessEmptyingQueue(buildstate);
      }
      else
        gfbb->buffersOnLevels[i] =
          list_delete_first(gfbb->buffersOnLevels[i]);
    }
    elog(DEBUG2, "emptied all buffers at level %d", i);
  }
  MemoryContextSwitchTo(oldCtx);
}

/*
 * Get the depth of the GiST index.
 */
static int
gistGetMaxLevel(Relation index)
{
  int     maxLevel;
  BlockNumber blkno;

  /*
   * Traverse down the tree, starting from the root, until we hit the leaf
   * level.
   */
  maxLevel = 0;
  blkno = GIST_ROOT_BLKNO;
  while (true)
  {
    Buffer    buffer;
    Page    page;
    IndexTuple  itup;

    buffer = ReadBuffer(index, blkno);

    /*
     * There's no concurrent access during index build, so locking is just
     * pro forma.
     */
    LockBuffer(buffer, GIST_SHARE);
    page = (Page) BufferGetPage(buffer);

    if (GistPageIsLeaf(page))
    {
      /* We hit the bottom, so we're done. */
      UnlockReleaseBuffer(buffer);
      break;
    }

    /*
     * Pick the first downlink on the page, and follow it. It doesn't
     * matter which downlink we choose, the tree has the same depth
     * everywhere, so we just pick the first one.
     */
    itup = (IndexTuple) PageGetItem(page,
                    PageGetItemId(page, FirstOffsetNumber));
    blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
    UnlockReleaseBuffer(buffer);

    /*
     * We're descending the tree, which means there is yet one more level
     * below.
     */
    maxLevel++;
  }
  return maxLevel;
}


/*
 * Routines for managing the parent map.
 *
 * Whenever a page is split, we need to insert the downlinks into the parent.
 * We need to somehow find the parent page to do that. In normal insertions,
 * we keep a stack of nodes visited when we descend the tree. However, in
 * buffering build, we can start descending the tree from any internal node,
 * when we empty a buffer by cascading tuples to its children. So we don't
 * have a full stack up to the root available at that time.
 *
 * So instead, we maintain a hash table to track the parent of every internal
 * page. We don't need to track the parents of leaf nodes, however. Whenever
 * we insert to a leaf, we've just descended down from its parent, so we know
 * its immediate parent already. This helps a lot to limit the memory used
 * by this hash table.
 *
 * Whenever an internal node is split, the parent map needs to be updated:
 * the parent of the new child page needs to be recorded, and the entries
 * for all pages whose downlinks are moved to a new page by the split need
 * to be updated as well.
 *
 * We also update the parent map whenever we descend the tree. That might seem
 * unnecessary, because we maintain the map whenever a downlink is moved or
 * created, but it is needed because we switch to buffering mode after
 * creating a tree with regular index inserts. Any pages created before
 * switching to buffering mode will not be present in the parent map initially,
 * but will be added there the first time we visit them.
 */
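
/*
 * For example (block numbers hypothetical): if internal page 7 splits and
 * some of its downlinks move to a new page 9, with both pages under parent
 * 3, the split handling in gistbufferinginserttuples() effectively does
 *
 *   gistMemorizeParent(buildstate, 9, 3);           -- record new child's parent
 *   gistMemorizeAllDownlinks(buildstate, buf_of_9); -- re-point moved downlinks
 *
 * so that a later gistGetParent(buildstate, 9) correctly returns block 3.
 */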

typedef struct
{
  BlockNumber childblkno;   /* hash key */
  BlockNumber parentblkno;
} ParentMapEntry;

static void
gistInitParentMap(GISTBuildState *buildstate)
{
  HASHCTL   hashCtl;

  hashCtl.keysize = sizeof(BlockNumber);
  hashCtl.entrysize = sizeof(ParentMapEntry);
  hashCtl.hcxt = CurrentMemoryContext;
  buildstate->parentMap = hash_create("gistbuild parent map",
                    1024,
                    &hashCtl,
                    HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
}

static void
gistMemorizeParent(GISTBuildState *buildstate, BlockNumber child, BlockNumber parent)
{
  ParentMapEntry *entry;
  bool    found;

  entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
                       &child,
                       HASH_ENTER,
                       &found);
  entry->parentblkno = parent;
}

/*
 * Scan all downlinks on a page, and memorize their parent.
 */
static void
gistMemorizeAllDownlinks(GISTBuildState *buildstate, Buffer parentbuf)
{
  OffsetNumber maxoff;
  OffsetNumber off;
  BlockNumber parentblkno = BufferGetBlockNumber(parentbuf);
  Page    page = BufferGetPage(parentbuf);

  Assert(!GistPageIsLeaf(page));

  maxoff = PageGetMaxOffsetNumber(page);
  for (off = FirstOffsetNumber; off <= maxoff; off++)
  {
    ItemId    iid = PageGetItemId(page, off);
    IndexTuple  idxtuple = (IndexTuple) PageGetItem(page, iid);
    BlockNumber childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));

    gistMemorizeParent(buildstate, childblkno, parentblkno);
  }
}

static BlockNumber
gistGetParent(GISTBuildState *buildstate, BlockNumber child)
{
  ParentMapEntry *entry;
  bool    found;

  /* Find the parent entry in the hash table */
  entry = (ParentMapEntry *) hash_search(buildstate->parentMap,
                       &child,
                       HASH_FIND,
                       &found);
  if (!found)
    elog(ERROR, "could not find parent of block %u in lookup table", child);

  return entry->parentblkno;
}