Coverage Report

Created: 2025-07-03 06:49

/src/postgres/src/backend/storage/ipc/shmem.c
Line
Count
Source (jump to first uncovered line)
1
/*-------------------------------------------------------------------------
2
 *
3
 * shmem.c
4
 *    create shared memory and initialize shared memory data structures.
5
 *
6
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7
 * Portions Copyright (c) 1994, Regents of the University of California
8
 *
9
 *
10
 * IDENTIFICATION
11
 *    src/backend/storage/ipc/shmem.c
12
 *
13
 *-------------------------------------------------------------------------
14
 */
15
/*
16
 * POSTGRES processes share one or more regions of shared memory.
17
 * The shared memory is created by a postmaster and is inherited
18
 * by each backend via fork() (or, in some ports, via other OS-specific
19
 * methods).  The routines in this file are used for allocating and
20
 * binding to shared memory data structures.
21
 *
22
 * NOTES:
23
 *    (a) There are three kinds of shared memory data structures
24
 *  available to POSTGRES: fixed-size structures, queues and hash
25
 *  tables.  Fixed-size structures contain things like global variables
26
 *  for a module and should never be allocated after the shared memory
27
 *  initialization phase.  Hash tables have a fixed maximum size, but
28
 *  their actual size can vary dynamically.  When entries are added
29
 *  to the table, more space is allocated.  Queues link data structures
30
 *  that have been allocated either within fixed-size structures or as hash
31
 *  buckets.  Each shared data structure has a string name to identify
32
 *  it (assigned in the module that declares it).
33
 *
34
 *    (b) During initialization, each module looks for its
35
 *  shared data structures in a hash table called the "Shmem Index".
36
 *  If the data structure is not present, the caller can allocate
37
 *  a new one and initialize it.  If the data structure is present,
38
 *  the caller "attaches" to the structure by initializing a pointer
39
 *  in the local address space.
40
 *    The shmem index has two purposes: first, it gives us
41
 *  a simple model of how the world looks when a backend process
42
 *  initializes.  If something is present in the shmem index,
43
 *  it is initialized.  If it is not, it is uninitialized.  Second,
44
 *  the shmem index allows us to allocate shared memory on demand
45
 *  instead of trying to preallocate structures and hard-wire the
46
 *  sizes and locations in header files.  If you are using a lot
47
 *  of shared memory in a lot of different places (and changing
48
 *  things during development), this is important.
49
 *
50
 *    (c) In standard Unix-ish environments, individual backends do not
51
 *  need to re-establish their local pointers into shared memory, because
52
 *  they inherit correct values of those variables via fork() from the
53
 *  postmaster.  However, this does not work in the EXEC_BACKEND case.
54
 *  In ports using EXEC_BACKEND, new backends have to set up their local
55
 *  pointers using the method described in (b) above.
56
 *
57
 *    (d) memory allocation model: shared memory can never be
58
 *  freed, once allocated.   Each hash table has its own free list,
59
 *  so hash buckets can be reused when an item is deleted.  However,
60
 *  if one hash table grows very large and then shrinks, its space
61
 *  cannot be redistributed to other tables.  We could build a simple
62
 *  hash bucket garbage collector if need be.  Right now, it seems
63
 *  unnecessary.
64
 */
65
66
#include "postgres.h"
67
68
#include "fmgr.h"
69
#include "funcapi.h"
70
#include "miscadmin.h"
71
#include "port/pg_numa.h"
72
#include "storage/lwlock.h"
73
#include "storage/pg_shmem.h"
74
#include "storage/shmem.h"
75
#include "storage/spin.h"
76
#include "utils/builtins.h"
77
78
static void *ShmemAllocRaw(Size size, Size *allocated_size);
79
80
/* shared memory global variables */
81
82
static PGShmemHeader *ShmemSegHdr;  /* shared mem segment header */
83
84
static void *ShmemBase;     /* start address of shared memory */
85
86
static void *ShmemEnd;      /* end+1 address of shared memory */
87
88
slock_t    *ShmemLock;      /* spinlock for shared memory and LWLock
89
                 * allocation */
90
91
static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
92
93
/* To get reliable results for NUMA inquiry we need to "touch pages" once */
94
static bool firstNumaTouch = true;
95
96
Datum   pg_numa_available(PG_FUNCTION_ARGS);
97
98
/*
99
 *  InitShmemAccess() --- set up basic pointers to shared memory.
100
 */
101
void
102
InitShmemAccess(PGShmemHeader *seghdr)
103
0
{
104
0
  ShmemSegHdr = seghdr;
105
0
  ShmemBase = seghdr;
106
0
  ShmemEnd = (char *) ShmemBase + seghdr->totalsize;
107
0
}
108
109
/*
110
 *  InitShmemAllocation() --- set up shared-memory space allocation.
111
 *
112
 * This should be called only in the postmaster or a standalone backend.
113
 */
114
void
115
InitShmemAllocation(void)
116
0
{
117
0
  PGShmemHeader *shmhdr = ShmemSegHdr;
118
0
  char     *aligned;
119
120
0
  Assert(shmhdr != NULL);
121
122
  /*
123
   * Initialize the spinlock used by ShmemAlloc.  We must use
124
   * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet.
125
   */
126
0
  ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t));
127
128
0
  SpinLockInit(ShmemLock);
129
130
  /*
131
   * Allocations after this point should go through ShmemAlloc, which
132
   * expects to allocate everything on cache line boundaries.  Make sure the
133
   * first allocation begins on a cache line boundary.
134
   */
135
0
  aligned = (char *)
136
0
    (CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset)));
137
0
  shmhdr->freeoffset = aligned - (char *) shmhdr;
138
139
  /* ShmemIndex can't be set up yet (need LWLocks first) */
140
0
  shmhdr->index = NULL;
141
0
  ShmemIndex = (HTAB *) NULL;
142
0
}
143
144
/*
145
 * ShmemAlloc -- allocate max-aligned chunk from shared memory
146
 *
147
 * Throws error if request cannot be satisfied.
148
 *
149
 * Assumes ShmemLock and ShmemSegHdr are initialized.
150
 */
151
void *
152
ShmemAlloc(Size size)
153
0
{
154
0
  void     *newSpace;
155
0
  Size    allocated_size;
156
157
0
  newSpace = ShmemAllocRaw(size, &allocated_size);
158
0
  if (!newSpace)
159
0
    ereport(ERROR,
160
0
        (errcode(ERRCODE_OUT_OF_MEMORY),
161
0
         errmsg("out of shared memory (%zu bytes requested)",
162
0
            size)));
163
0
  return newSpace;
164
0
}
165
166
/*
167
 * ShmemAllocNoError -- allocate max-aligned chunk from shared memory
168
 *
169
 * As ShmemAlloc, but returns NULL if out of space, rather than erroring.
170
 */
171
void *
172
ShmemAllocNoError(Size size)
173
0
{
174
0
  Size    allocated_size;
175
176
0
  return ShmemAllocRaw(size, &allocated_size);
177
0
}
178
179
/*
180
 * ShmemAllocRaw -- allocate align chunk and return allocated size
181
 *
182
 * Also sets *allocated_size to the number of bytes allocated, which will
183
 * be equal to the number requested plus any padding we choose to add.
184
 */
185
static void *
186
ShmemAllocRaw(Size size, Size *allocated_size)
187
0
{
188
0
  Size    newStart;
189
0
  Size    newFree;
190
0
  void     *newSpace;
191
192
  /*
193
   * Ensure all space is adequately aligned.  We used to only MAXALIGN this
194
   * space but experience has proved that on modern systems that is not good
195
   * enough.  Many parts of the system are very sensitive to critical data
196
   * structures getting split across cache line boundaries.  To avoid that,
197
   * attempt to align the beginning of the allocation to a cache line
198
   * boundary.  The calling code will still need to be careful about how it
199
   * uses the allocated space - e.g. by padding each element in an array of
200
   * structures out to a power-of-two size - but without this, even that
201
   * won't be sufficient.
202
   */
203
0
  size = CACHELINEALIGN(size);
204
0
  *allocated_size = size;
205
206
0
  Assert(ShmemSegHdr != NULL);
207
208
0
  SpinLockAcquire(ShmemLock);
209
210
0
  newStart = ShmemSegHdr->freeoffset;
211
212
0
  newFree = newStart + size;
213
0
  if (newFree <= ShmemSegHdr->totalsize)
214
0
  {
215
0
    newSpace = (char *) ShmemBase + newStart;
216
0
    ShmemSegHdr->freeoffset = newFree;
217
0
  }
218
0
  else
219
0
    newSpace = NULL;
220
221
0
  SpinLockRelease(ShmemLock);
222
223
  /* note this assert is okay with newSpace == NULL */
224
0
  Assert(newSpace == (void *) CACHELINEALIGN(newSpace));
225
226
0
  return newSpace;
227
0
}
228
229
/*
230
 * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory
231
 *
232
 * Allocate space without locking ShmemLock.  This should be used for,
233
 * and only for, allocations that must happen before ShmemLock is ready.
234
 *
235
 * We consider maxalign, rather than cachealign, sufficient here.
236
 */
237
void *
238
ShmemAllocUnlocked(Size size)
239
0
{
240
0
  Size    newStart;
241
0
  Size    newFree;
242
0
  void     *newSpace;
243
244
  /*
245
   * Ensure allocated space is adequately aligned.
246
   */
247
0
  size = MAXALIGN(size);
248
249
0
  Assert(ShmemSegHdr != NULL);
250
251
0
  newStart = ShmemSegHdr->freeoffset;
252
253
0
  newFree = newStart + size;
254
0
  if (newFree > ShmemSegHdr->totalsize)
255
0
    ereport(ERROR,
256
0
        (errcode(ERRCODE_OUT_OF_MEMORY),
257
0
         errmsg("out of shared memory (%zu bytes requested)",
258
0
            size)));
259
0
  ShmemSegHdr->freeoffset = newFree;
260
261
0
  newSpace = (char *) ShmemBase + newStart;
262
263
0
  Assert(newSpace == (void *) MAXALIGN(newSpace));
264
265
0
  return newSpace;
266
0
}
267
268
/*
269
 * ShmemAddrIsValid -- test if an address refers to shared memory
270
 *
271
 * Returns true if the pointer points within the shared memory segment.
272
 */
273
bool
274
ShmemAddrIsValid(const void *addr)
275
0
{
276
0
  return (addr >= ShmemBase) && (addr < ShmemEnd);
277
0
}
278
279
/*
280
 *  InitShmemIndex() --- set up or attach to shmem index table.
281
 */
282
void
283
InitShmemIndex(void)
284
0
{
285
0
  HASHCTL   info;
286
287
  /*
288
   * Create the shared memory shmem index.
289
   *
290
   * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex
291
   * hashtable to exist already, we have a bit of a circularity problem in
292
   * initializing the ShmemIndex itself.  The special "ShmemIndex" hash
293
   * table name will tell ShmemInitStruct to fake it.
294
   */
295
0
  info.keysize = SHMEM_INDEX_KEYSIZE;
296
0
  info.entrysize = sizeof(ShmemIndexEnt);
297
298
0
  ShmemIndex = ShmemInitHash("ShmemIndex",
299
0
                 SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
300
0
                 &info,
301
0
                 HASH_ELEM | HASH_STRINGS);
302
0
}
303
304
/*
305
 * ShmemInitHash -- Create and initialize, or attach to, a
306
 *    shared memory hash table.
307
 *
308
 * We assume caller is doing some kind of synchronization
309
 * so that two processes don't try to create/initialize the same
310
 * table at once.  (In practice, all creations are done in the postmaster
311
 * process; child processes should always be attaching to existing tables.)
312
 *
313
 * max_size is the estimated maximum number of hashtable entries.  This is
314
 * not a hard limit, but the access efficiency will degrade if it is
315
 * exceeded substantially (since it's used to compute directory size and
316
 * the hash table buckets will get overfull).
317
 *
318
 * init_size is the number of hashtable entries to preallocate.  For a table
319
 * whose maximum size is certain, this should be equal to max_size; that
320
 * ensures that no run-time out-of-shared-memory failures can occur.
321
 *
322
 * *infoP and hash_flags must specify at least the entry sizes and key
323
 * comparison semantics (see hash_create()).  Flag bits and values specific
324
 * to shared-memory hash tables are added here, except that callers may
325
 * choose to specify HASH_PARTITION and/or HASH_FIXED_SIZE.
326
 *
327
 * Note: before Postgres 9.0, this function returned NULL for some failure
328
 * cases.  Now, it always throws error instead, so callers need not check
329
 * for NULL.
330
 */
331
HTAB *
332
ShmemInitHash(const char *name,   /* table string name for shmem index */
333
        long init_size, /* initial table size */
334
        long max_size,  /* max size of the table */
335
        HASHCTL *infoP, /* info about key and bucket size */
336
        int hash_flags) /* info about infoP */
337
0
{
338
0
  bool    found;
339
0
  void     *location;
340
341
  /*
342
   * Hash tables allocated in shared memory have a fixed directory; it can't
343
   * grow or other backends wouldn't be able to find it. So, make sure we
344
   * make it big enough to start with.
345
   *
346
   * The shared memory allocator must be specified too.
347
   */
348
0
  infoP->dsize = infoP->max_dsize = hash_select_dirsize(max_size);
349
0
  infoP->alloc = ShmemAllocNoError;
350
0
  hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE;
351
352
  /* look it up in the shmem index */
353
0
  location = ShmemInitStruct(name,
354
0
                 hash_get_shared_size(infoP, hash_flags),
355
0
                 &found);
356
357
  /*
358
   * if it already exists, attach to it rather than allocate and initialize
359
   * new space
360
   */
361
0
  if (found)
362
0
    hash_flags |= HASH_ATTACH;
363
364
  /* Pass location of hashtable header to hash_create */
365
0
  infoP->hctl = (HASHHDR *) location;
366
367
0
  return hash_create(name, init_size, infoP, hash_flags);
368
0
}
369
370
/*
371
 * ShmemInitStruct -- Create/attach to a structure in shared memory.
372
 *
373
 *    This is called during initialization to find or allocate
374
 *    a data structure in shared memory.  If no other process
375
 *    has created the structure, this routine allocates space
376
 *    for it.  If it exists already, a pointer to the existing
377
 *    structure is returned.
378
 *
379
 *  Returns: pointer to the object.  *foundPtr is set true if the object was
380
 *    already in the shmem index (hence, already initialized).
381
 *
382
 *  Note: before Postgres 9.0, this function returned NULL for some failure
383
 *  cases.  Now, it always throws error instead, so callers need not check
384
 *  for NULL.
385
 */
386
void *
387
ShmemInitStruct(const char *name, Size size, bool *foundPtr)
388
0
{
389
0
  ShmemIndexEnt *result;
390
0
  void     *structPtr;
391
392
0
  LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);
393
394
0
  if (!ShmemIndex)
395
0
  {
396
0
    PGShmemHeader *shmemseghdr = ShmemSegHdr;
397
398
    /* Must be trying to create/attach to ShmemIndex itself */
399
0
    Assert(strcmp(name, "ShmemIndex") == 0);
400
401
0
    if (IsUnderPostmaster)
402
0
    {
403
      /* Must be initializing a (non-standalone) backend */
404
0
      Assert(shmemseghdr->index != NULL);
405
0
      structPtr = shmemseghdr->index;
406
0
      *foundPtr = true;
407
0
    }
408
0
    else
409
0
    {
410
      /*
411
       * If the shmem index doesn't exist, we are bootstrapping: we must
412
       * be trying to init the shmem index itself.
413
       *
414
       * Notice that the ShmemIndexLock is released before the shmem
415
       * index has been initialized.  This should be OK because no other
416
       * process can be accessing shared memory yet.
417
       */
418
0
      Assert(shmemseghdr->index == NULL);
419
0
      structPtr = ShmemAlloc(size);
420
0
      shmemseghdr->index = structPtr;
421
0
      *foundPtr = false;
422
0
    }
423
0
    LWLockRelease(ShmemIndexLock);
424
0
    return structPtr;
425
0
  }
426
427
  /* look it up in the shmem index */
428
0
  result = (ShmemIndexEnt *)
429
0
    hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr);
430
431
0
  if (!result)
432
0
  {
433
0
    LWLockRelease(ShmemIndexLock);
434
0
    ereport(ERROR,
435
0
        (errcode(ERRCODE_OUT_OF_MEMORY),
436
0
         errmsg("could not create ShmemIndex entry for data structure \"%s\"",
437
0
            name)));
438
0
  }
439
440
0
  if (*foundPtr)
441
0
  {
442
    /*
443
     * Structure is in the shmem index so someone else has allocated it
444
     * already.  The size better be the same as the size we are trying to
445
     * initialize to, or there is a name conflict (or worse).
446
     */
447
0
    if (result->size != size)
448
0
    {
449
0
      LWLockRelease(ShmemIndexLock);
450
0
      ereport(ERROR,
451
0
          (errmsg("ShmemIndex entry size is wrong for data structure"
452
0
              " \"%s\": expected %zu, actual %zu",
453
0
              name, size, result->size)));
454
0
    }
455
0
    structPtr = result->location;
456
0
  }
457
0
  else
458
0
  {
459
0
    Size    allocated_size;
460
461
    /* It isn't in the table yet. allocate and initialize it */
462
0
    structPtr = ShmemAllocRaw(size, &allocated_size);
463
0
    if (structPtr == NULL)
464
0
    {
465
      /* out of memory; remove the failed ShmemIndex entry */
466
0
      hash_search(ShmemIndex, name, HASH_REMOVE, NULL);
467
0
      LWLockRelease(ShmemIndexLock);
468
0
      ereport(ERROR,
469
0
          (errcode(ERRCODE_OUT_OF_MEMORY),
470
0
           errmsg("not enough shared memory for data structure"
471
0
              " \"%s\" (%zu bytes requested)",
472
0
              name, size)));
473
0
    }
474
0
    result->size = size;
475
0
    result->allocated_size = allocated_size;
476
0
    result->location = structPtr;
477
0
  }
478
479
0
  LWLockRelease(ShmemIndexLock);
480
481
0
  Assert(ShmemAddrIsValid(structPtr));
482
483
0
  Assert(structPtr == (void *) CACHELINEALIGN(structPtr));
484
485
0
  return structPtr;
486
0
}
487
488
489
/*
490
 * Add two Size values, checking for overflow
491
 */
492
Size
493
add_size(Size s1, Size s2)
494
0
{
495
0
  Size    result;
496
497
0
  result = s1 + s2;
498
  /* We are assuming Size is an unsigned type here... */
499
0
  if (result < s1 || result < s2)
500
0
    ereport(ERROR,
501
0
        (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
502
0
         errmsg("requested shared memory size overflows size_t")));
503
0
  return result;
504
0
}
505
506
/*
507
 * Multiply two Size values, checking for overflow
508
 */
509
Size
510
mul_size(Size s1, Size s2)
511
0
{
512
0
  Size    result;
513
514
0
  if (s1 == 0 || s2 == 0)
515
0
    return 0;
516
0
  result = s1 * s2;
517
  /* We are assuming Size is an unsigned type here... */
518
0
  if (result / s2 != s1)
519
0
    ereport(ERROR,
520
0
        (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
521
0
         errmsg("requested shared memory size overflows size_t")));
522
0
  return result;
523
0
}
524
525
/* SQL SRF showing allocated shared memory */
526
Datum
527
pg_get_shmem_allocations(PG_FUNCTION_ARGS)
528
0
{
529
0
#define PG_GET_SHMEM_SIZES_COLS 4
530
0
  ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
531
0
  HASH_SEQ_STATUS hstat;
532
0
  ShmemIndexEnt *ent;
533
0
  Size    named_allocated = 0;
534
0
  Datum   values[PG_GET_SHMEM_SIZES_COLS];
535
0
  bool    nulls[PG_GET_SHMEM_SIZES_COLS];
536
537
0
  InitMaterializedSRF(fcinfo, 0);
538
539
0
  LWLockAcquire(ShmemIndexLock, LW_SHARED);
540
541
0
  hash_seq_init(&hstat, ShmemIndex);
542
543
  /* output all allocated entries */
544
0
  memset(nulls, 0, sizeof(nulls));
545
0
  while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
546
0
  {
547
0
    values[0] = CStringGetTextDatum(ent->key);
548
0
    values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr);
549
0
    values[2] = Int64GetDatum(ent->size);
550
0
    values[3] = Int64GetDatum(ent->allocated_size);
551
0
    named_allocated += ent->allocated_size;
552
553
0
    tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
554
0
               values, nulls);
555
0
  }
556
557
  /* output shared memory allocated but not counted via the shmem index */
558
0
  values[0] = CStringGetTextDatum("<anonymous>");
559
0
  nulls[1] = true;
560
0
  values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated);
561
0
  values[3] = values[2];
562
0
  tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
563
564
  /* output as-of-yet unused shared memory */
565
0
  nulls[0] = true;
566
0
  values[1] = Int64GetDatum(ShmemSegHdr->freeoffset);
567
0
  nulls[1] = false;
568
0
  values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset);
569
0
  values[3] = values[2];
570
0
  tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
571
572
0
  LWLockRelease(ShmemIndexLock);
573
574
0
  return (Datum) 0;
575
0
}
576
577
/*
578
 * SQL SRF showing NUMA memory nodes for allocated shared memory
579
 *
580
 * Compared to pg_get_shmem_allocations(), this function does not return
581
 * information about shared anonymous allocations and unused shared memory.
582
 */
583
Datum
584
pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
585
0
{
586
0
#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
587
0
  ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
588
0
  HASH_SEQ_STATUS hstat;
589
0
  ShmemIndexEnt *ent;
590
0
  Datum   values[PG_GET_SHMEM_NUMA_SIZES_COLS];
591
0
  bool    nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
592
0
  Size    os_page_size;
593
0
  void    **page_ptrs;
594
0
  int      *pages_status;
595
0
  uint64    shm_total_page_count,
596
0
        shm_ent_page_count,
597
0
        max_nodes;
598
0
  Size     *nodes;
599
600
0
  if (pg_numa_init() == -1)
601
0
    elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
602
603
0
  InitMaterializedSRF(fcinfo, 0);
604
605
0
  max_nodes = pg_numa_get_max_node();
606
0
  nodes = palloc(sizeof(Size) * (max_nodes + 1));
607
608
  /*
609
   * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, while
610
   * the OS may have different memory page sizes.
611
   *
612
   * To correctly map between them, we need to: 1. Determine the OS memory
613
   * page size 2. Calculate how many OS pages are used by all buffer blocks
614
   * 3. Calculate how many OS pages are contained within each database
615
   * block.
616
   *
617
   * This information is needed before calling move_pages() for NUMA memory
618
   * node inquiry.
619
   */
620
0
  os_page_size = pg_get_shmem_pagesize();
621
622
  /*
623
   * Allocate memory for page pointers and status based on total shared
624
   * memory size. This simplified approach allocates enough space for all
625
   * pages in shared memory rather than calculating the exact requirements
626
   * for each segment.
627
   *
628
   * Add 1, because we don't know how exactly the segments align to OS
629
   * pages, so the allocation might use one more memory page. In practice
630
   * this is not very likely, and moreover we have more entries, each of
631
   * them using only fraction of the total pages.
632
   */
633
0
  shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
634
0
  page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
635
0
  pages_status = palloc(sizeof(int) * shm_total_page_count);
636
637
0
  if (firstNumaTouch)
638
0
    elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
639
640
0
  LWLockAcquire(ShmemIndexLock, LW_SHARED);
641
642
0
  hash_seq_init(&hstat, ShmemIndex);
643
644
  /* output all allocated entries */
645
0
  memset(nulls, 0, sizeof(nulls));
646
0
  while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
647
0
  {
648
0
    int     i;
649
0
    char     *startptr,
650
0
           *endptr;
651
0
    Size    total_len;
652
653
    /*
654
     * Calculate the range of OS pages used by this segment. The segment
655
     * may start / end half-way through a page, we want to count these
656
     * pages too. So we align the start/end pointers down/up, and then
657
     * calculate the number of pages from that.
658
     */
659
0
    startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
660
0
    endptr = (char *) TYPEALIGN(os_page_size,
661
0
                  (char *) ent->location + ent->allocated_size);
662
0
    total_len = (endptr - startptr);
663
664
0
    shm_ent_page_count = total_len / os_page_size;
665
666
    /*
667
     * If we ever get 0xff (-1) back from kernel inquiry, then we probably
668
     * have a bug in mapping buffers to OS pages.
669
     */
670
0
    memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
671
672
    /*
673
     * Setup page_ptrs[] with pointers to all OS pages for this segment,
674
     * and get the NUMA status using pg_numa_query_pages.
675
     *
676
     * In order to get reliable results we also need to touch memory
677
     * pages, so that inquiry about NUMA memory node doesn't return -2
678
     * (ENOENT, which indicates unmapped/unallocated pages).
679
     */
680
0
    for (i = 0; i < shm_ent_page_count; i++)
681
0
    {
682
0
      page_ptrs[i] = startptr + (i * os_page_size);
683
684
0
      if (firstNumaTouch)
685
0
        pg_numa_touch_mem_if_required(page_ptrs[i]);
686
687
0
      CHECK_FOR_INTERRUPTS();
688
0
    }
689
690
0
    if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
691
0
      elog(ERROR, "failed NUMA pages inquiry status: %m");
692
693
    /* Count number of NUMA nodes used for this shared memory entry */
694
0
    memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
695
696
0
    for (i = 0; i < shm_ent_page_count; i++)
697
0
    {
698
0
      int     s = pages_status[i];
699
700
      /* Ensure we are adding only valid index to the array */
701
0
      if (s < 0 || s > max_nodes)
702
0
      {
703
0
        elog(ERROR, "invalid NUMA node id outside of allowed range "
704
0
           "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
705
0
      }
706
707
0
      nodes[s]++;
708
0
    }
709
710
    /*
711
     * Add one entry for each NUMA node, including those without allocated
712
     * memory for this segment.
713
     */
714
0
    for (i = 0; i <= max_nodes; i++)
715
0
    {
716
0
      values[0] = CStringGetTextDatum(ent->key);
717
0
      values[1] = i;
718
0
      values[2] = Int64GetDatum(nodes[i] * os_page_size);
719
720
0
      tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
721
0
                 values, nulls);
722
0
    }
723
0
  }
724
725
0
  LWLockRelease(ShmemIndexLock);
726
0
  firstNumaTouch = false;
727
728
0
  return (Datum) 0;
729
0
}
730
731
/*
732
 * Determine the memory page size used for the shared memory segment.
733
 *
734
 * If the shared segment was allocated using huge pages, returns the size of
735
 * a huge page. Otherwise returns the size of regular memory page.
736
 *
737
 * This should be used only after the server is started.
738
 */
739
Size
740
pg_get_shmem_pagesize(void)
741
0
{
742
0
  Size    os_page_size;
743
#ifdef WIN32
744
  SYSTEM_INFO sysinfo;
745
746
  GetSystemInfo(&sysinfo);
747
  os_page_size = sysinfo.dwPageSize;
748
#else
749
0
  os_page_size = sysconf(_SC_PAGESIZE);
750
0
#endif
751
752
0
  Assert(IsUnderPostmaster);
753
0
  Assert(huge_pages_status != HUGE_PAGES_UNKNOWN);
754
755
0
  if (huge_pages_status == HUGE_PAGES_ON)
756
0
    GetHugePageSize(&os_page_size, NULL);
757
758
0
  return os_page_size;
759
0
}
760
761
Datum
762
pg_numa_available(PG_FUNCTION_ARGS)
763
0
{
764
0
  PG_RETURN_BOOL(pg_numa_init() != -1);
765
0
}