/src/postgres/src/backend/storage/ipc/shmem.c
/*-------------------------------------------------------------------------
 *
 * shmem.c
 *    create shared memory and initialize shared memory data structures.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *    src/backend/storage/ipc/shmem.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * POSTGRES processes share one or more regions of shared memory.
 * The shared memory is created by a postmaster and is inherited
 * by each backend via fork() (or, in some ports, via other OS-specific
 * methods).  The routines in this file are used for allocating and
 * binding to shared memory data structures.
 *
 * NOTES:
 *      (a) There are three kinds of shared memory data structures
 *  available to POSTGRES: fixed-size structures, queues and hash
 *  tables.  Fixed-size structures contain things like global variables
 *  for a module and should never be allocated after the shared memory
 *  initialization phase.  Hash tables have a fixed maximum size, but
 *  their actual size can vary dynamically.  When entries are added
 *  to the table, more space is allocated.  Queues link data structures
 *  that have been allocated either within fixed-size structures or as hash
 *  buckets.  Each shared data structure has a string name to identify
 *  it (assigned in the module that declares it).
 *
 *      (b) During initialization, each module looks for its
 *  shared data structures in a hash table called the "Shmem Index".
 *  If the data structure is not present, the caller can allocate
 *  a new one and initialize it.  If the data structure is present,
 *  the caller "attaches" to the structure by initializing a pointer
 *  in the local address space.
 *      The shmem index has two purposes: first, it gives us
 *  a simple model of how the world looks when a backend process
 *  initializes.  If something is present in the shmem index,
 *  it is initialized.  If it is not, it is uninitialized.  Second,
 *  the shmem index allows us to allocate shared memory on demand
 *  instead of trying to preallocate structures and hard-wire the
 *  sizes and locations in header files.  If you are using a lot
 *  of shared memory in a lot of different places (and changing
 *  things during development), this is important.
 *
 *      (c) In standard Unix-ish environments, individual backends do not
 *  need to re-establish their local pointers into shared memory, because
 *  they inherit correct values of those variables via fork() from the
 *  postmaster.  However, this does not work in the EXEC_BACKEND case.
 *  In ports using EXEC_BACKEND, new backends have to set up their local
 *  pointers using the method described in (b) above.
 *
 *      (d) memory allocation model: shared memory can never be
 *  freed, once allocated.  Each hash table has its own free list,
 *  so hash buckets can be reused when an item is deleted.  However,
 *  if one hash table grows very large and then shrinks, its space
 *  cannot be redistributed to other tables.  We could build a simple
 *  hash bucket garbage collector if need be.  Right now, it seems
 *  unnecessary.
 */
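
/*
 * Editor's illustration (not part of the upstream file): the create-or-attach
 * protocol of note (b), in pseudo-C.  "MyModuleState" is a hypothetical
 * module structure; real callers express this via ShmemInitStruct(), defined
 * later in this file:
 *
 *      bool    found;
 *
 *      MyState = (MyModuleState *)
 *          ShmemInitStruct("My Module State", sizeof(MyModuleState), &found);
 *      if (!found)
 *          ... first process here: initialize the struct's contents ...
 *      else
 *          ... already initialized: we merely attached our local pointer ...
 */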

#include "postgres.h"

#include "fmgr.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "port/pg_numa.h"
#include "storage/lwlock.h"
#include "storage/pg_shmem.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/builtins.h"

static void *ShmemAllocRaw(Size size, Size *allocated_size);

/* shared memory global variables */

static PGShmemHeader *ShmemSegHdr;  /* shared mem segment header */

static void *ShmemBase;         /* start address of shared memory */

static void *ShmemEnd;          /* end+1 address of shared memory */

slock_t    *ShmemLock;          /* spinlock for shared memory and LWLock
                                 * allocation */

static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */

/* To get reliable results for NUMA inquiry we need to "touch pages" once */
static bool firstNumaTouch = true;

Datum       pg_numa_available(PG_FUNCTION_ARGS);

/*
 * InitShmemAccess() --- set up basic pointers to shared memory.
 */
void
InitShmemAccess(PGShmemHeader *seghdr)
{
    ShmemSegHdr = seghdr;
    ShmemBase = seghdr;
    ShmemEnd = (char *) ShmemBase + seghdr->totalsize;
}

/*
 * InitShmemAllocation() --- set up shared-memory space allocation.
 *
 * This should be called only in the postmaster or a standalone backend.
 */
void
InitShmemAllocation(void)
{
    PGShmemHeader *shmhdr = ShmemSegHdr;
    char       *aligned;

    Assert(shmhdr != NULL);

    /*
     * Initialize the spinlock used by ShmemAlloc.  We must use
     * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet.
     */
    ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t));

    SpinLockInit(ShmemLock);

    /*
     * Allocations after this point should go through ShmemAlloc, which
     * expects to allocate everything on cache line boundaries.  Make sure the
     * first allocation begins on a cache line boundary.
     */
    aligned = (char *)
        (CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset)));
    shmhdr->freeoffset = aligned - (char *) shmhdr;

    /* ShmemIndex can't be set up yet (need LWLocks first) */
    shmhdr->index = NULL;
    ShmemIndex = (HTAB *) NULL;
}
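
/*
 * Editor's note (a hedged sketch, not upstream text): startup proceeds
 * roughly as
 *
 *      InitShmemAccess(seghdr);    -- set ShmemSegHdr/ShmemBase/ShmemEnd
 *      InitShmemAllocation();      -- carve out and initialize ShmemLock
 *      ... create LWLocks ...
 *      InitShmemIndex();           -- shmem index usable from here on
 *
 * The actual call sites live elsewhere (ipci.c in current sources), so treat
 * this ordering as illustrative rather than authoritative.
 */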

/*
 * ShmemAlloc -- allocate max-aligned chunk from shared memory
 *
 * Throws error if request cannot be satisfied.
 *
 * Assumes ShmemLock and ShmemSegHdr are initialized.
 */
void *
ShmemAlloc(Size size)
{
    void       *newSpace;
    Size        allocated_size;

    newSpace = ShmemAllocRaw(size, &allocated_size);
    if (!newSpace)
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of shared memory (%zu bytes requested)",
                        size)));
    return newSpace;
}
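
#ifdef SHMEM_EXAMPLES           /* editor's example, never compiled */
/*
 * Editor's illustration: ShmemAlloc() is meant for raw, permanent
 * allocations made once at startup.  A minimal sketch with hypothetical
 * names (MySlot, MySlots, AllocateMySlots); the overflow-checked mul_size()
 * is defined later in this file.
 */
typedef struct MySlot
{
    int         owner;
} MySlot;

static MySlot *MySlots;

static void
AllocateMySlots(int nslots)
{
    /* ShmemAlloc ereport()s on failure, so no NULL check is needed */
    MySlots = (MySlot *) ShmemAlloc(mul_size(nslots, sizeof(MySlot)));
    memset(MySlots, 0, nslots * sizeof(MySlot));
}
#endif                          /* SHMEM_EXAMPLES */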

/*
 * ShmemAllocNoError -- allocate max-aligned chunk from shared memory
 *
 * As ShmemAlloc, but returns NULL if out of space, rather than erroring.
 */
void *
ShmemAllocNoError(Size size)
{
    Size        allocated_size;

    return ShmemAllocRaw(size, &allocated_size);
}

/*
 * ShmemAllocRaw -- allocate an aligned chunk and return the allocated size
 *
 * Also sets *allocated_size to the number of bytes allocated, which will
 * be equal to the number requested plus any padding we choose to add.
 */
static void *
ShmemAllocRaw(Size size, Size *allocated_size)
{
    Size        newStart;
    Size        newFree;
    void       *newSpace;

    /*
     * Ensure all space is adequately aligned.  We used to only MAXALIGN this
     * space but experience has proved that on modern systems that is not good
     * enough.  Many parts of the system are very sensitive to critical data
     * structures getting split across cache line boundaries.  To avoid that,
     * attempt to align the beginning of the allocation to a cache line
     * boundary.  The calling code will still need to be careful about how it
     * uses the allocated space - e.g. by padding each element in an array of
     * structures out to a power-of-two size - but without this, even that
     * won't be sufficient.
     */
    size = CACHELINEALIGN(size);
    *allocated_size = size;

    Assert(ShmemSegHdr != NULL);

    SpinLockAcquire(ShmemLock);

    newStart = ShmemSegHdr->freeoffset;

    newFree = newStart + size;
    if (newFree <= ShmemSegHdr->totalsize)
    {
        newSpace = (char *) ShmemBase + newStart;
        ShmemSegHdr->freeoffset = newFree;
    }
    else
        newSpace = NULL;

    SpinLockRelease(ShmemLock);

    /* note this assert is okay with newSpace == NULL */
    Assert(newSpace == (void *) CACHELINEALIGN(newSpace));

    return newSpace;
}
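
#ifdef SHMEM_EXAMPLES           /* editor's example, never compiled */
/*
 * Editor's illustration of the padding advice above: keep array elements
 * from straddling cache lines by padding each one to a full line.  Assumes
 * port/atomics.h for pg_atomic_uint64; MyCounter/MyCounterPadded are
 * hypothetical names, but the union-with-pad shape mirrors a pattern used
 * elsewhere in the tree.
 */
typedef struct MyCounter
{
    pg_atomic_uint64 value;
} MyCounter;

typedef union MyCounterPadded
{
    MyCounter   counter;
    char        pad[PG_CACHE_LINE_SIZE];    /* one element per cache line */
} MyCounterPadded;
#endif                          /* SHMEM_EXAMPLES */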

/*
 * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory
 *
 * Allocate space without locking ShmemLock.  This should be used for,
 * and only for, allocations that must happen before ShmemLock is ready.
 *
 * We consider maxalign, rather than cachealign, sufficient here.
 */
void *
ShmemAllocUnlocked(Size size)
{
    Size        newStart;
    Size        newFree;
    void       *newSpace;

    /*
     * Ensure allocated space is adequately aligned.
     */
    size = MAXALIGN(size);

    Assert(ShmemSegHdr != NULL);

    newStart = ShmemSegHdr->freeoffset;

    newFree = newStart + size;
    if (newFree > ShmemSegHdr->totalsize)
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of shared memory (%zu bytes requested)",
                        size)));
    ShmemSegHdr->freeoffset = newFree;

    newSpace = (char *) ShmemBase + newStart;

    Assert(newSpace == (void *) MAXALIGN(newSpace));

    return newSpace;
}

/*
 * ShmemAddrIsValid -- test if an address refers to shared memory
 *
 * Returns true if the pointer points within the shared memory segment.
 */
bool
ShmemAddrIsValid(const void *addr)
{
    return (addr >= ShmemBase) && (addr < ShmemEnd);
}
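
/*
 * Editor's note: this is mostly assertion fodder, e.g. before publishing a
 * pointer for other backends to chase:
 *
 *      Assert(ShmemAddrIsValid(ptr));
 */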

/*
 * InitShmemIndex() --- set up or attach to shmem index table.
 */
void
InitShmemIndex(void)
{
    HASHCTL     info;

    /*
     * Create the shared memory shmem index.
     *
     * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex
     * hashtable to exist already, we have a bit of a circularity problem in
     * initializing the ShmemIndex itself.  The special "ShmemIndex" hash
     * table name will tell ShmemInitStruct to fake it.
     */
    info.keysize = SHMEM_INDEX_KEYSIZE;
    info.entrysize = sizeof(ShmemIndexEnt);

    ShmemIndex = ShmemInitHash("ShmemIndex",
                               SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
                               &info,
                               HASH_ELEM | HASH_STRINGS);
}

/*
 * ShmemInitHash -- Create and initialize, or attach to, a
 *      shared memory hash table.
 *
 * We assume caller is doing some kind of synchronization
 * so that two processes don't try to create/initialize the same
 * table at once.  (In practice, all creations are done in the postmaster
 * process; child processes should always be attaching to existing tables.)
 *
 * max_size is the estimated maximum number of hashtable entries.  This is
 * not a hard limit, but the access efficiency will degrade if it is
 * exceeded substantially (since it's used to compute directory size and
 * the hash table buckets will get overfull).
 *
 * init_size is the number of hashtable entries to preallocate.  For a table
 * whose maximum size is certain, this should be equal to max_size; that
 * ensures that no run-time out-of-shared-memory failures can occur.
 *
 * *infoP and hash_flags must specify at least the entry sizes and key
 * comparison semantics (see hash_create()).  Flag bits and values specific
 * to shared-memory hash tables are added here, except that callers may
 * choose to specify HASH_PARTITION and/or HASH_FIXED_SIZE.
 *
 * Note: before Postgres 9.0, this function returned NULL for some failure
 * cases.  Now, it always throws error instead, so callers need not check
 * for NULL.
 */
HTAB *
ShmemInitHash(const char *name, /* table string name for shmem index */
              long init_size,   /* initial table size */
              long max_size,    /* max size of the table */
              HASHCTL *infoP,   /* info about key and bucket size */
              int hash_flags)   /* info about infoP */
{
    bool        found;
    void       *location;

    /*
     * Hash tables allocated in shared memory have a fixed directory; it can't
     * grow or other backends wouldn't be able to find it.  So, make sure we
     * make it big enough to start with.
     *
     * The shared memory allocator must be specified too.
     */
    infoP->dsize = infoP->max_dsize = hash_select_dirsize(max_size);
    infoP->alloc = ShmemAllocNoError;
    hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE;

    /* look it up in the shmem index */
    location = ShmemInitStruct(name,
                               hash_get_shared_size(infoP, hash_flags),
                               &found);

    /*
     * if it already exists, attach to it rather than allocate and initialize
     * new space
     */
    if (found)
        hash_flags |= HASH_ATTACH;

    /* Pass location of hashtable header to hash_create */
    infoP->hctl = (HASHHDR *) location;

    return hash_create(name, init_size, infoP, hash_flags);
}
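
#ifdef SHMEM_EXAMPLES           /* editor's example, never compiled */
/*
 * Editor's illustration: creating, or attaching to, a shared hash table
 * keyed by a plain uint32 (hence HASH_BLOBS rather than HASH_STRINGS).
 * MyHashEntry and the table name are hypothetical; init_size == max_size
 * keeps inserts from failing at run time, per the comment above.
 */
typedef struct MyHashEntry
{
    uint32      key;            /* hash key; must be first field */
    int         value;
} MyHashEntry;

static HTAB *MyShmemHash;

static void
MyHashShmemInit(void)
{
    HASHCTL     info;

    info.keysize = sizeof(uint32);
    info.entrysize = sizeof(MyHashEntry);

    MyShmemHash = ShmemInitHash("My Hash Table",
                                128, 128,
                                &info,
                                HASH_ELEM | HASH_BLOBS);
}
#endif                          /* SHMEM_EXAMPLES */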

/*
 * ShmemInitStruct -- Create/attach to a structure in shared memory.
 *
 *      This is called during initialization to find or allocate
 *      a data structure in shared memory.  If no other process
 *      has created the structure, this routine allocates space
 *      for it.  If it exists already, a pointer to the existing
 *      structure is returned.
 *
 *  Returns: pointer to the object.  *foundPtr is set true if the object was
 *      already in the shmem index (hence, already initialized).
 *
 *  Note: before Postgres 9.0, this function returned NULL for some failure
 *  cases.  Now, it always throws error instead, so callers need not check
 *  for NULL.
 */
void *
ShmemInitStruct(const char *name, Size size, bool *foundPtr)
{
    ShmemIndexEnt *result;
    void       *structPtr;

    LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);

    if (!ShmemIndex)
    {
        PGShmemHeader *shmemseghdr = ShmemSegHdr;

        /* Must be trying to create/attach to ShmemIndex itself */
        Assert(strcmp(name, "ShmemIndex") == 0);

        if (IsUnderPostmaster)
        {
            /* Must be initializing a (non-standalone) backend */
            Assert(shmemseghdr->index != NULL);
            structPtr = shmemseghdr->index;
            *foundPtr = true;
        }
        else
        {
            /*
             * If the shmem index doesn't exist, we are bootstrapping: we must
             * be trying to init the shmem index itself.
             *
             * Notice that the ShmemIndexLock is released before the shmem
             * index has been initialized.  This should be OK because no other
             * process can be accessing shared memory yet.
             */
            Assert(shmemseghdr->index == NULL);
            structPtr = ShmemAlloc(size);
            shmemseghdr->index = structPtr;
            *foundPtr = false;
        }
        LWLockRelease(ShmemIndexLock);
        return structPtr;
    }

    /* look it up in the shmem index */
    result = (ShmemIndexEnt *)
        hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr);

    if (!result)
    {
        LWLockRelease(ShmemIndexLock);
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("could not create ShmemIndex entry for data structure \"%s\"",
                        name)));
    }

    if (*foundPtr)
    {
        /*
         * Structure is in the shmem index so someone else has allocated it
         * already.  The size better be the same as the size we are trying to
         * initialize to, or there is a name conflict (or worse).
         */
        if (result->size != size)
        {
            LWLockRelease(ShmemIndexLock);
            ereport(ERROR,
                    (errmsg("ShmemIndex entry size is wrong for data structure"
                            " \"%s\": expected %zu, actual %zu",
                            name, size, result->size)));
        }
        structPtr = result->location;
    }
    else
    {
        Size        allocated_size;

        /* It isn't in the table yet; allocate and initialize it */
        structPtr = ShmemAllocRaw(size, &allocated_size);
        if (structPtr == NULL)
        {
            /* out of memory; remove the failed ShmemIndex entry */
            hash_search(ShmemIndex, name, HASH_REMOVE, NULL);
            LWLockRelease(ShmemIndexLock);
            ereport(ERROR,
                    (errcode(ERRCODE_OUT_OF_MEMORY),
                     errmsg("not enough shared memory for data structure"
                            " \"%s\" (%zu bytes requested)",
                            name, size)));
        }
        result->size = size;
        result->allocated_size = allocated_size;
        result->location = structPtr;
    }

    LWLockRelease(ShmemIndexLock);

    Assert(ShmemAddrIsValid(structPtr));

    Assert(structPtr == (void *) CACHELINEALIGN(structPtr));

    return structPtr;
}
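
#ifdef SHMEM_EXAMPLES           /* editor's example, never compiled */
/*
 * Editor's illustration: loadable modules use the same protocol through the
 * shmem_request_hook / shmem_startup_hook pair, serialized by
 * AddinShmemInitLock.  All "my_"/"My" names are hypothetical, and the hook
 * wiring in _PG_init() is omitted; treat this as a hedged sketch of the
 * usual pattern rather than a definitive recipe.
 */
typedef struct MyModuleState
{
    int         counter;
} MyModuleState;

static MyModuleState *MyState = NULL;
static shmem_request_hook_type prev_shmem_request_hook = NULL;
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;

static void
my_shmem_request(void)
{
    if (prev_shmem_request_hook)
        prev_shmem_request_hook();
    RequestAddinShmemSpace(sizeof(MyModuleState));
}

static void
my_shmem_startup(void)
{
    bool        found;

    if (prev_shmem_startup_hook)
        prev_shmem_startup_hook();

    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
    MyState = (MyModuleState *)
        ShmemInitStruct("My Module State", sizeof(MyModuleState), &found);
    if (!found)
        memset(MyState, 0, sizeof(MyModuleState));
    LWLockRelease(AddinShmemInitLock);
}
#endif                          /* SHMEM_EXAMPLES */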


/*
 * Add two Size values, checking for overflow
 */
Size
add_size(Size s1, Size s2)
{
    Size        result;

    result = s1 + s2;
    /* We are assuming Size is an unsigned type here... */
    if (result < s1 || result < s2)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("requested shared memory size overflows size_t")));
    return result;
}

/*
 * Multiply two Size values, checking for overflow
 */
Size
mul_size(Size s1, Size s2)
{
    Size        result;

    if (s1 == 0 || s2 == 0)
        return 0;
    result = s1 * s2;
    /* We are assuming Size is an unsigned type here... */
    if (result / s2 != s1)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("requested shared memory size overflows size_t")));
    return result;
}
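
/*
 * Editor's note: shared memory size requests are composed from these checked
 * helpers instead of bare arithmetic, e.g. (hypothetical types)
 *
 *      Size    size = MAXALIGN(sizeof(MyHeader));
 *
 *      size = add_size(size, mul_size(nelems, sizeof(MyElement)));
 *
 * so any overflow raises an error rather than silently wrapping.
 */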

/* SQL SRF showing allocated shared memory */
Datum
pg_get_shmem_allocations(PG_FUNCTION_ARGS)
{
#define PG_GET_SHMEM_SIZES_COLS 4
    ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
    HASH_SEQ_STATUS hstat;
    ShmemIndexEnt *ent;
    Size        named_allocated = 0;
    Datum       values[PG_GET_SHMEM_SIZES_COLS];
    bool        nulls[PG_GET_SHMEM_SIZES_COLS];

    InitMaterializedSRF(fcinfo, 0);

    LWLockAcquire(ShmemIndexLock, LW_SHARED);

    hash_seq_init(&hstat, ShmemIndex);

    /* output all allocated entries */
    memset(nulls, 0, sizeof(nulls));
    while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
    {
        values[0] = CStringGetTextDatum(ent->key);
        values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr);
        values[2] = Int64GetDatum(ent->size);
        values[3] = Int64GetDatum(ent->allocated_size);
        named_allocated += ent->allocated_size;

        tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
                             values, nulls);
    }

    /* output shared memory allocated but not counted via the shmem index */
    values[0] = CStringGetTextDatum("<anonymous>");
    nulls[1] = true;
    values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated);
    values[3] = values[2];
    tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);

    /* output as-of-yet unused shared memory */
    nulls[0] = true;
    values[1] = Int64GetDatum(ShmemSegHdr->freeoffset);
    nulls[1] = false;
    values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset);
    values[3] = values[2];
    tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);

    LWLockRelease(ShmemIndexLock);

    return (Datum) 0;
}
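
/*
 * Editor's note: this function backs a system view (pg_shmem_allocations in
 * recent releases), so the usual way to reach it is SQL such as
 *
 *      SELECT name, off, size, allocated_size
 *      FROM pg_shmem_allocations
 *      ORDER BY allocated_size DESC;
 *
 * View and column names should be checked against the server version at hand.
 */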

/*
 * SQL SRF showing NUMA memory nodes for allocated shared memory
 *
 * Compared to pg_get_shmem_allocations(), this function does not return
 * information about shared anonymous allocations and unused shared memory.
 */
Datum
pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
{
#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
    ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
    HASH_SEQ_STATUS hstat;
    ShmemIndexEnt *ent;
    Datum       values[PG_GET_SHMEM_NUMA_SIZES_COLS];
    bool        nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
    Size        os_page_size;
    void      **page_ptrs;
    int        *pages_status;
    uint64      shm_total_page_count,
                shm_ent_page_count,
                max_nodes;
    Size       *nodes;

    if (pg_numa_init() == -1)
        elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");

    InitMaterializedSRF(fcinfo, 0);

    max_nodes = pg_numa_get_max_node();
    nodes = palloc(sizeof(Size) * (max_nodes + 1));

    /*
     * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, while
     * the OS may have different memory page sizes.
     *
     * To correctly map between them, we need to: 1. Determine the OS memory
     * page size 2. Calculate how many OS pages are used by all buffer blocks
     * 3. Calculate how many OS pages are contained within each database
     * block.
     *
     * This information is needed before calling move_pages() for NUMA memory
     * node inquiry.
     */
    os_page_size = pg_get_shmem_pagesize();

    /*
     * Allocate memory for page pointers and status based on total shared
     * memory size.  This simplified approach allocates enough space for all
     * pages in shared memory rather than calculating the exact requirements
     * for each segment.
     *
     * Add 1, because we don't know how exactly the segments align to OS
     * pages, so the allocation might use one more memory page.  In practice
     * this is not very likely, and moreover we have more entries, each of
     * them using only a fraction of the total pages.
     */
    shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
    page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
    pages_status = palloc(sizeof(int) * shm_total_page_count);

    if (firstNumaTouch)
        elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");

    LWLockAcquire(ShmemIndexLock, LW_SHARED);

    hash_seq_init(&hstat, ShmemIndex);

    /* output all allocated entries */
    memset(nulls, 0, sizeof(nulls));
    while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
    {
        int         i;
        char       *startptr,
                   *endptr;
        Size        total_len;

        /*
         * Calculate the range of OS pages used by this segment.  The segment
         * may start / end half-way through a page; we want to count these
         * pages too.  So we align the start/end pointers down/up, and then
         * calculate the number of pages from that.
         */
        startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
        endptr = (char *) TYPEALIGN(os_page_size,
                                    (char *) ent->location + ent->allocated_size);
        total_len = (endptr - startptr);

        shm_ent_page_count = total_len / os_page_size;

        /*
         * If we ever get 0xff (-1) back from kernel inquiry, then we probably
         * have a bug in mapping buffers to OS pages.
         */
        memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);

        /*
         * Set up page_ptrs[] with pointers to all OS pages for this segment,
         * and get the NUMA status using pg_numa_query_pages.
         *
         * In order to get reliable results we also need to touch memory
         * pages, so that inquiry about NUMA memory node doesn't return -2
         * (ENOENT, which indicates unmapped/unallocated pages).
         */
        for (i = 0; i < shm_ent_page_count; i++)
        {
            page_ptrs[i] = startptr + (i * os_page_size);

            if (firstNumaTouch)
                pg_numa_touch_mem_if_required(page_ptrs[i]);

            CHECK_FOR_INTERRUPTS();
        }

        if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
            elog(ERROR, "failed NUMA pages inquiry status: %m");

        /* Count the number of NUMA nodes used for this shared memory entry */
        memset(nodes, 0, sizeof(Size) * (max_nodes + 1));

        for (i = 0; i < shm_ent_page_count; i++)
        {
            int         s = pages_status[i];

            /* Ensure we are adding only a valid index to the array */
            if (s < 0 || s > max_nodes)
            {
                elog(ERROR, "invalid NUMA node id outside of allowed range "
                     "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
            }

            nodes[s]++;
        }

        /*
         * Add one entry for each NUMA node, including those without allocated
         * memory for this segment.
         */
        for (i = 0; i <= max_nodes; i++)
        {
            values[0] = CStringGetTextDatum(ent->key);
            values[1] = i;
            values[2] = Int64GetDatum(nodes[i] * os_page_size);

            tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
                                 values, nulls);
        }
    }

    LWLockRelease(ShmemIndexLock);
    firstNumaTouch = false;

    return (Datum) 0;
}
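
/*
 * Editor's note: likewise surfaced at the SQL level (as the
 * pg_shmem_allocations_numa view in current sources), e.g.
 *
 *      SELECT * FROM pg_shmem_allocations_numa;
 *
 * The view name is hedged; check the installed catalog version.
 */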
730 | | |
731 | | /* |
732 | | * Determine the memory page size used for the shared memory segment. |
733 | | * |
734 | | * If the shared segment was allocated using huge pages, returns the size of |
735 | | * a huge page. Otherwise returns the size of regular memory page. |
736 | | * |
737 | | * This should be used only after the server is started. |
738 | | */ |
739 | | Size |
740 | | pg_get_shmem_pagesize(void) |
741 | 0 | { |
742 | 0 | Size os_page_size; |
743 | | #ifdef WIN32 |
744 | | SYSTEM_INFO sysinfo; |
745 | | |
746 | | GetSystemInfo(&sysinfo); |
747 | | os_page_size = sysinfo.dwPageSize; |
748 | | #else |
749 | 0 | os_page_size = sysconf(_SC_PAGESIZE); |
750 | 0 | #endif |
751 | |
|
752 | 0 | Assert(IsUnderPostmaster); |
753 | 0 | Assert(huge_pages_status != HUGE_PAGES_UNKNOWN); |
754 | |
|
755 | 0 | if (huge_pages_status == HUGE_PAGES_ON) |
756 | 0 | GetHugePageSize(&os_page_size, NULL); |
757 | |
|
758 | 0 | return os_page_size; |
759 | 0 | } |
760 | | |
761 | | Datum |
762 | | pg_numa_available(PG_FUNCTION_ARGS) |
763 | 0 | { |
764 | 0 | PG_RETURN_BOOL(pg_numa_init() != -1); |
765 | 0 | } |