Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * shmem.c
4 : : * create shared memory and initialize shared memory data structures.
5 : : *
6 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
7 : : * Portions Copyright (c) 1994, Regents of the University of California
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/storage/ipc/shmem.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : : /*
16 : : * POSTGRES processes share one or more regions of shared memory.
17 : : * The shared memory is created by a postmaster and is inherited
18 : : * by each backend via fork() (or, in some ports, via other OS-specific
19 : : * methods). The routines in this file are used for allocating and
20 : : * binding to shared memory data structures.
21 : : *
22 : : * NOTES:
23 : : * (a) There are three kinds of shared memory data structures
24 : : * available to POSTGRES: fixed-size structures, queues and hash
25 : : * tables. Fixed-size structures contain things like global variables
26 : : * for a module and should never be allocated after the shared memory
27 : : * initialization phase. Hash tables have a fixed maximum size, but
28 : : * their actual size can vary dynamically. When entries are added
29 : : * to the table, more space is allocated. Queues link data structures
30 : : * that have been allocated either within fixed-size structures or as hash
31 : : * buckets. Each shared data structure has a string name to identify
32 : : * it (assigned in the module that declares it).
33 : : *
34 : : * (b) During initialization, each module looks for its
35 : : * shared data structures in a hash table called the "Shmem Index".
36 : : * If the data structure is not present, the caller can allocate
37 : : * a new one and initialize it. If the data structure is present,
38 : : * the caller "attaches" to the structure by initializing a pointer
39 : : * in the local address space.
40 : : * The shmem index has two purposes: first, it gives us
41 : : * a simple model of how the world looks when a backend process
42 : : * initializes. If something is present in the shmem index,
43 : : * it is initialized. If it is not, it is uninitialized. Second,
44 : : * the shmem index allows us to allocate shared memory on demand
45 : : * instead of trying to preallocate structures and hard-wire the
46 : : * sizes and locations in header files. If you are using a lot
47 : : * of shared memory in a lot of different places (and changing
48 : : * things during development), this is important.
49 : : *
50 : : * (c) In standard Unix-ish environments, individual backends do not
51 : : * need to re-establish their local pointers into shared memory, because
52 : : * they inherit correct values of those variables via fork() from the
53 : : * postmaster. However, this does not work in the EXEC_BACKEND case.
54 : : * In ports using EXEC_BACKEND, new backends have to set up their local
55 : : * pointers using the method described in (b) above.
56 : : *
57 : : * (d) memory allocation model: shared memory can never be
58 : : * freed, once allocated. Each hash table has its own free list,
59 : : * so hash buckets can be reused when an item is deleted. However,
60 : : * if one hash table grows very large and then shrinks, its space
61 : : * cannot be redistributed to other tables. We could build a simple
62 : : * hash bucket garbage collector if need be. Right now, it seems
63 : : * unnecessary.
64 : : */
65 : :
66 : : #include "postgres.h"
67 : :
68 : : #include "common/int.h"
69 : : #include "fmgr.h"
70 : : #include "funcapi.h"
71 : : #include "miscadmin.h"
72 : : #include "port/pg_numa.h"
73 : : #include "storage/lwlock.h"
74 : : #include "storage/pg_shmem.h"
75 : : #include "storage/shmem.h"
76 : : #include "storage/spin.h"
77 : : #include "utils/builtins.h"
78 : :
79 : : /*
80 : : * This is the first data structure stored in the shared memory segment, at
81 : : * the offset that PGShmemHeader->content_offset points to. Allocations by
82 : : * ShmemAlloc() are carved out of the space after this.
83 : : *
84 : : * For the base pointer and the total size of the shmem segment, we rely on
85 : : * the PGShmemHeader.
86 : : */
87 : : typedef struct ShmemAllocatorData
88 : : {
89 : : Size free_offset; /* offset to first free space from ShmemBase */
90 : : HASHHDR *index; /* location of ShmemIndex */
91 : :
92 : : /* protects shared memory and LWLock allocation */
93 : : slock_t shmem_lock;
94 : : } ShmemAllocatorData;
95 : :
96 : : static void *ShmemAllocRaw(Size size, Size *allocated_size);
97 : :
98 : : /* shared memory global variables */
99 : :
100 : : static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */
101 : : static void *ShmemBase; /* start address of shared memory */
102 : : static void *ShmemEnd; /* end+1 address of shared memory */
103 : :
104 : : static ShmemAllocatorData *ShmemAllocator;
105 : : slock_t *ShmemLock; /* points to ShmemAllocator->shmem_lock */
106 : : static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
107 : :
108 : : /* To get reliable results for NUMA inquiry we need to "touch pages" once */
109 : : static bool firstNumaTouch = true;
110 : :
111 : : Datum pg_numa_available(PG_FUNCTION_ARGS);
112 : :
113 : : /*
114 : : * InitShmemAllocator() --- set up basic pointers to shared memory.
115 : : *
116 : : * Called at postmaster or stand-alone backend startup, to initialize the
117 : : * allocator's data structure in the shared memory segment. In EXEC_BACKEND,
118 : : * this is also called at backend startup, to set up pointers to the shared
119 : : * memory areas.
120 : : */
121 : : void
44 heikki.linnakangas@i 122 :GNC 1150 : InitShmemAllocator(PGShmemHeader *seghdr)
123 : : {
124 [ - + ]: 1150 : Assert(seghdr != NULL);
125 : :
126 : : /*
127 : : * We assume the pointer and offset are MAXALIGN. Not a hard requirement,
128 : : * but it's true today and keeps the math below simpler.
129 : : */
130 [ - + ]: 1150 : Assert(seghdr == (void *) MAXALIGN(seghdr));
131 [ - + ]: 1150 : Assert(seghdr->content_offset == MAXALIGN(seghdr->content_offset));
132 : :
474 peter@eisentraut.org 133 :CBC 1150 : ShmemSegHdr = seghdr;
134 : 1150 : ShmemBase = seghdr;
135 : 1150 : ShmemEnd = (char *) ShmemBase + seghdr->totalsize;
136 : :
137 : : #ifndef EXEC_BACKEND
44 heikki.linnakangas@i 138 [ - + ]:GNC 1150 : Assert(!IsUnderPostmaster);
139 : : #endif
140 [ - + ]: 1150 : if (IsUnderPostmaster)
141 : : {
44 heikki.linnakangas@i 142 :UNC 0 : PGShmemHeader *shmhdr = ShmemSegHdr;
143 : :
144 : 0 : ShmemAllocator = (ShmemAllocatorData *) ((char *) shmhdr + shmhdr->content_offset);
145 : 0 : ShmemLock = &ShmemAllocator->shmem_lock;
146 : : }
147 : : else
148 : : {
149 : : Size offset;
150 : :
151 : : /*
152 : : * Allocations after this point should go through ShmemAlloc, which
153 : : * expects to allocate everything on cache line boundaries. Make sure
154 : : * the first allocation begins on a cache line boundary.
155 : : */
44 heikki.linnakangas@i 156 :GNC 1150 : offset = CACHELINEALIGN(seghdr->content_offset + sizeof(ShmemAllocatorData));
157 [ - + ]: 1150 : if (offset > seghdr->totalsize)
44 heikki.linnakangas@i 158 [ # # ]:UNC 0 : ereport(ERROR,
159 : : (errcode(ERRCODE_OUT_OF_MEMORY),
160 : : errmsg("out of shared memory (%zu bytes requested)",
161 : : offset)));
162 : :
44 heikki.linnakangas@i 163 :GNC 1150 : ShmemAllocator = (ShmemAllocatorData *) ((char *) seghdr + seghdr->content_offset);
164 : :
165 : 1150 : SpinLockInit(&ShmemAllocator->shmem_lock);
166 : 1150 : ShmemLock = &ShmemAllocator->shmem_lock;
167 : 1150 : ShmemAllocator->free_offset = offset;
168 : : /* ShmemIndex can't be set up yet (need LWLocks first) */
169 : 1150 : ShmemAllocator->index = NULL;
170 : 1150 : ShmemIndex = (HTAB *) NULL;
171 : : }
10841 scrappy@hub.org 172 :CBC 1150 : }
173 : :
174 : : /*
175 : : * ShmemAlloc -- allocate max-aligned chunk from shared memory
176 : : *
177 : : * Throws error if request cannot be satisfied.
178 : : *
179 : : * Assumes ShmemLock and ShmemSegHdr are initialized.
180 : : */
181 : : void *
7091 tgl@sss.pgh.pa.us 182 : 3453 : ShmemAlloc(Size size)
183 : : {
184 : : void *newSpace;
185 : : Size allocated_size;
186 : :
2257 rhaas@postgresql.org 187 : 3453 : newSpace = ShmemAllocRaw(size, &allocated_size);
3482 tgl@sss.pgh.pa.us 188 [ - + ]: 3453 : if (!newSpace)
3482 tgl@sss.pgh.pa.us 189 [ # # ]:UBC 0 : ereport(ERROR,
190 : : (errcode(ERRCODE_OUT_OF_MEMORY),
191 : : errmsg("out of shared memory (%zu bytes requested)",
192 : : size)));
3482 tgl@sss.pgh.pa.us 193 :CBC 3453 : return newSpace;
194 : : }
195 : :
196 : : /*
197 : : * ShmemAllocNoError -- allocate max-aligned chunk from shared memory
198 : : *
199 : : * As ShmemAlloc, but returns NULL if out of space, rather than erroring.
200 : : */
201 : : void *
202 : 464377 : ShmemAllocNoError(Size size)
203 : : {
204 : : Size allocated_size;
205 : :
2257 rhaas@postgresql.org 206 : 464377 : return ShmemAllocRaw(size, &allocated_size);
207 : : }
208 : :
209 : : /*
210 : : * ShmemAllocRaw -- allocate align chunk and return allocated size
211 : : *
212 : : * Also sets *allocated_size to the number of bytes allocated, which will
213 : : * be equal to the number requested plus any padding we choose to add.
214 : : */
215 : : static void *
216 : 554075 : ShmemAllocRaw(Size size, Size *allocated_size)
217 : : {
218 : : Size newStart;
219 : : Size newFree;
220 : : void *newSpace;
221 : :
222 : : /*
223 : : * Ensure all space is adequately aligned. We used to only MAXALIGN this
224 : : * space but experience has proved that on modern systems that is not good
225 : : * enough. Many parts of the system are very sensitive to critical data
226 : : * structures getting split across cache line boundaries. To avoid that,
227 : : * attempt to align the beginning of the allocation to a cache line
228 : : * boundary. The calling code will still need to be careful about how it
229 : : * uses the allocated space - e.g. by padding each element in an array of
230 : : * structures out to a power-of-two size - but without this, even that
231 : : * won't be sufficient.
232 : : */
3631 233 : 554075 : size = CACHELINEALIGN(size);
2257 234 : 554075 : *allocated_size = size;
235 : :
3803 236 [ - + ]: 554075 : Assert(ShmemSegHdr != NULL);
237 : :
8933 tgl@sss.pgh.pa.us 238 [ - + ]: 554075 : SpinLockAcquire(ShmemLock);
239 : :
44 heikki.linnakangas@i 240 :GNC 554075 : newStart = ShmemAllocator->free_offset;
241 : :
8211 tgl@sss.pgh.pa.us 242 :CBC 554075 : newFree = newStart + size;
3803 rhaas@postgresql.org 243 [ + - ]: 554075 : if (newFree <= ShmemSegHdr->totalsize)
244 : : {
472 peter@eisentraut.org 245 : 554075 : newSpace = (char *) ShmemBase + newStart;
44 heikki.linnakangas@i 246 :GNC 554075 : ShmemAllocator->free_offset = newFree;
247 : : }
248 : : else
10416 bruce@momjian.us 249 :UBC 0 : newSpace = NULL;
250 : :
8933 tgl@sss.pgh.pa.us 251 :CBC 554075 : SpinLockRelease(ShmemLock);
252 : :
253 : : /* note this assert is okay with newSpace == NULL */
3615 rhaas@postgresql.org 254 [ - + ]: 554075 : Assert(newSpace == (void *) CACHELINEALIGN(newSpace));
255 : :
10057 bruce@momjian.us 256 : 554075 : return newSpace;
257 : : }
258 : :
259 : : /*
260 : : * ShmemAddrIsValid -- test if an address refers to shared memory
261 : : *
262 : : * Returns true if the pointer points within the shared memory segment.
263 : : */
264 : : bool
5515 heikki.linnakangas@i 265 : 87912 : ShmemAddrIsValid(const void *addr)
266 : : {
6342 tgl@sss.pgh.pa.us 267 [ + - + - ]: 87912 : return (addr >= ShmemBase) && (addr < ShmemEnd);
268 : : }
269 : :
270 : : /*
271 : : * InitShmemIndex() --- set up or attach to shmem index table.
272 : : */
273 : : void
8933 274 : 1150 : InitShmemIndex(void)
275 : : {
276 : : HASHCTL info;
277 : :
278 : : /*
279 : : * Create the shared memory shmem index.
280 : : *
281 : : * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex
282 : : * hashtable to exist already, we have a bit of a circularity problem in
283 : : * initializing the ShmemIndex itself. The special "ShmemIndex" hash
284 : : * table name will tell ShmemInitStruct to fake it.
285 : : */
286 : 1150 : info.keysize = SHMEM_INDEX_KEYSIZE;
8931 287 : 1150 : info.entrysize = sizeof(ShmemIndexEnt);
288 : :
8933 289 : 1150 : ShmemIndex = ShmemInitHash("ShmemIndex",
290 : : SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
291 : : &info,
292 : : HASH_ELEM | HASH_STRINGS);
293 : 1150 : }
294 : :
295 : : /*
296 : : * ShmemInitHash -- Create and initialize, or attach to, a
297 : : * shared memory hash table.
298 : : *
299 : : * We assume caller is doing some kind of synchronization
300 : : * so that two processes don't try to create/initialize the same
301 : : * table at once. (In practice, all creations are done in the postmaster
302 : : * process; child processes should always be attaching to existing tables.)
303 : : *
304 : : * max_size is the estimated maximum number of hashtable entries. This is
305 : : * not a hard limit, but the access efficiency will degrade if it is
306 : : * exceeded substantially (since it's used to compute directory size and
307 : : * the hash table buckets will get overfull).
308 : : *
309 : : * init_size is the number of hashtable entries to preallocate. For a table
310 : : * whose maximum size is certain, this should be equal to max_size; that
311 : : * ensures that no run-time out-of-shared-memory failures can occur.
312 : : *
313 : : * *infoP and hash_flags must specify at least the entry sizes and key
314 : : * comparison semantics (see hash_create()). Flag bits and values specific
315 : : * to shared-memory hash tables are added here, except that callers may
316 : : * choose to specify HASH_PARTITION and/or HASH_FIXED_SIZE.
317 : : *
318 : : * Note: before Postgres 9.0, this function returned NULL for some failure
319 : : * cases. Now, it always throws error instead, so callers need not check
320 : : * for NULL.
321 : : */
322 : : HTAB *
3189 323 : 10357 : ShmemInitHash(const char *name, /* table string name for shmem index */
324 : : int64 init_size, /* initial table size */
325 : : int64 max_size, /* max size of the table */
326 : : HASHCTL *infoP, /* info about key and bucket size */
327 : : int hash_flags) /* info about infoP */
328 : : {
329 : : bool found;
330 : : void *location;
331 : :
332 : : /*
333 : : * Hash tables allocated in shared memory have a fixed directory; it can't
334 : : * grow or other backends wouldn't be able to find it. So, make sure we
335 : : * make it big enough to start with.
336 : : *
337 : : * The shared memory allocator must be specified too.
338 : : */
9514 339 : 10357 : infoP->dsize = infoP->max_dsize = hash_select_dirsize(max_size);
3482 340 : 10357 : infoP->alloc = ShmemAllocNoError;
8111 341 : 10357 : hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE;
342 : :
343 : : /* look it up in the shmem index */
9883 344 : 10357 : location = ShmemInitStruct(name,
345 : : hash_get_shared_size(infoP, hash_flags),
346 : : &found);
347 : :
348 : : /*
349 : : * if it already exists, attach to it rather than allocate and initialize
350 : : * new space
351 : : */
10416 bruce@momjian.us 352 [ - + ]: 10357 : if (found)
10416 bruce@momjian.us 353 :UBC 0 : hash_flags |= HASH_ATTACH;
354 : :
355 : : /* Pass location of hashtable header to hash_create */
8931 tgl@sss.pgh.pa.us 356 :CBC 10357 : infoP->hctl = (HASHHDR *) location;
357 : :
8927 358 : 10357 : return hash_create(name, init_size, infoP, hash_flags);
359 : : }
360 : :
361 : : /*
362 : : * ShmemInitStruct -- Create/attach to a structure in shared memory.
363 : : *
364 : : * This is called during initialization to find or allocate
365 : : * a data structure in shared memory. If no other process
366 : : * has created the structure, this routine allocates space
367 : : * for it. If it exists already, a pointer to the existing
368 : : * structure is returned.
369 : : *
370 : : * Returns: pointer to the object. *foundPtr is set true if the object was
371 : : * already in the shmem index (hence, already initialized).
372 : : *
373 : : * Note: before Postgres 9.0, this function returned NULL for some failure
374 : : * cases. Now, it always throws error instead, so callers need not check
375 : : * for NULL.
376 : : */
377 : : void *
378 : 87395 : ShmemInitStruct(const char *name, Size size, bool *foundPtr)
379 : : {
380 : : ShmemIndexEnt *result;
381 : : void *structPtr;
382 : :
7375 383 : 87395 : LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);
384 : :
10123 bruce@momjian.us 385 [ + + ]: 87395 : if (!ShmemIndex)
386 : : {
387 : : /* Must be trying to create/attach to ShmemIndex itself */
7650 neilc@samurai.com 388 [ - + ]: 1150 : Assert(strcmp(name, "ShmemIndex") == 0);
389 : :
8121 bruce@momjian.us 390 [ - + ]: 1150 : if (IsUnderPostmaster)
391 : : {
392 : : /* Must be initializing a (non-standalone) backend */
44 heikki.linnakangas@i 393 [ # # ]:UNC 0 : Assert(ShmemAllocator->index != NULL);
394 : 0 : structPtr = ShmemAllocator->index;
3133 peter_e@gmx.net 395 :UBC 0 : *foundPtr = true;
396 : : }
397 : : else
398 : : {
399 : : /*
400 : : * If the shmem index doesn't exist, we are bootstrapping: we must
401 : : * be trying to init the shmem index itself.
402 : : *
403 : : * Notice that the ShmemIndexLock is released before the shmem
404 : : * index has been initialized. This should be OK because no other
405 : : * process can be accessing shared memory yet.
406 : : */
44 heikki.linnakangas@i 407 [ - + ]:GNC 1150 : Assert(ShmemAllocator->index == NULL);
7375 tgl@sss.pgh.pa.us 408 :CBC 1150 : structPtr = ShmemAlloc(size);
44 heikki.linnakangas@i 409 :GNC 1150 : ShmemAllocator->index = structPtr;
3133 peter_e@gmx.net 410 :CBC 1150 : *foundPtr = false;
411 : : }
7176 tgl@sss.pgh.pa.us 412 : 1150 : LWLockRelease(ShmemIndexLock);
7375 413 : 1150 : return structPtr;
414 : : }
415 : :
416 : : /* look it up in the shmem index */
417 : : result = (ShmemIndexEnt *)
7109 418 : 86245 : hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr);
419 : :
10416 bruce@momjian.us 420 [ - + ]: 86245 : if (!result)
421 : : {
7375 tgl@sss.pgh.pa.us 422 :UBC 0 : LWLockRelease(ShmemIndexLock);
8270 423 [ # # ]: 0 : ereport(ERROR,
424 : : (errcode(ERRCODE_OUT_OF_MEMORY),
425 : : errmsg("could not create ShmemIndex entry for data structure \"%s\"",
426 : : name)));
427 : : }
428 : :
9238 tgl@sss.pgh.pa.us 429 [ - + ]:CBC 86245 : if (*foundPtr)
430 : : {
431 : : /*
432 : : * Structure is in the shmem index so someone else has allocated it
433 : : * already. The size better be the same as the size we are trying to
434 : : * initialize to, or there is a name conflict (or worse).
435 : : */
10416 bruce@momjian.us 436 [ # # ]:UBC 0 : if (result->size != size)
437 : : {
7375 tgl@sss.pgh.pa.us 438 : 0 : LWLockRelease(ShmemIndexLock);
5800 439 [ # # ]: 0 : ereport(ERROR,
440 : : (errmsg("ShmemIndex entry size is wrong for data structure"
441 : : " \"%s\": expected %zu, actual %zu",
442 : : name, size, result->size)));
443 : : }
6342 444 : 0 : structPtr = result->location;
445 : : }
446 : : else
447 : : {
448 : : Size allocated_size;
449 : :
450 : : /* It isn't in the table yet. allocate and initialize it */
2257 rhaas@postgresql.org 451 :CBC 86245 : structPtr = ShmemAllocRaw(size, &allocated_size);
5800 tgl@sss.pgh.pa.us 452 [ - + ]: 86245 : if (structPtr == NULL)
453 : : {
454 : : /* out of memory; remove the failed ShmemIndex entry */
7109 tgl@sss.pgh.pa.us 455 :UBC 0 : hash_search(ShmemIndex, name, HASH_REMOVE, NULL);
7375 456 : 0 : LWLockRelease(ShmemIndexLock);
5800 457 [ # # ]: 0 : ereport(ERROR,
458 : : (errcode(ERRCODE_OUT_OF_MEMORY),
459 : : errmsg("not enough shared memory for data structure"
460 : : " \"%s\" (%zu bytes requested)",
461 : : name, size)));
462 : : }
10416 bruce@momjian.us 463 :CBC 86245 : result->size = size;
2257 rhaas@postgresql.org 464 : 86245 : result->allocated_size = allocated_size;
6342 tgl@sss.pgh.pa.us 465 : 86245 : result->location = structPtr;
466 : : }
467 : :
7375 468 : 86245 : LWLockRelease(ShmemIndexLock);
469 : :
5800 470 [ - + ]: 86245 : Assert(ShmemAddrIsValid(structPtr));
471 : :
3615 rhaas@postgresql.org 472 [ - + ]: 86245 : Assert(structPtr == (void *) CACHELINEALIGN(structPtr));
473 : :
10057 bruce@momjian.us 474 : 86245 : return structPtr;
475 : : }
476 : :
477 : :
478 : : /*
479 : : * Add two Size values, checking for overflow
480 : : */
481 : : Size
7512 tgl@sss.pgh.pa.us 482 : 581389 : add_size(Size s1, Size s2)
483 : : {
484 : : Size result;
485 : :
111 jchampion@postgresql 486 [ - + ]:GNC 581389 : if (pg_add_size_overflow(s1, s2, &result))
7512 tgl@sss.pgh.pa.us 487 [ # # ]:UBC 0 : ereport(ERROR,
488 : : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
489 : : errmsg("requested shared memory size overflows size_t")));
7512 tgl@sss.pgh.pa.us 490 :CBC 581389 : return result;
491 : : }
492 : :
493 : : /*
494 : : * Multiply two Size values, checking for overflow
495 : : */
496 : : Size
497 : 277339 : mul_size(Size s1, Size s2)
498 : : {
499 : : Size result;
500 : :
111 jchampion@postgresql 501 [ - + ]:GNC 277339 : if (pg_mul_size_overflow(s1, s2, &result))
7512 tgl@sss.pgh.pa.us 502 [ # # ]:UBC 0 : ereport(ERROR,
503 : : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
504 : : errmsg("requested shared memory size overflows size_t")));
7512 tgl@sss.pgh.pa.us 505 :CBC 277339 : return result;
506 : : }
507 : :
508 : : /* SQL SRF showing allocated shared memory */
509 : : Datum
2257 rhaas@postgresql.org 510 : 3 : pg_get_shmem_allocations(PG_FUNCTION_ARGS)
511 : : {
512 : : #define PG_GET_SHMEM_SIZES_COLS 4
513 : 3 : ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
514 : : HASH_SEQ_STATUS hstat;
515 : : ShmemIndexEnt *ent;
2131 tgl@sss.pgh.pa.us 516 : 3 : Size named_allocated = 0;
517 : : Datum values[PG_GET_SHMEM_SIZES_COLS];
518 : : bool nulls[PG_GET_SHMEM_SIZES_COLS];
519 : :
1244 michael@paquier.xyz 520 : 3 : InitMaterializedSRF(fcinfo, 0);
521 : :
2257 rhaas@postgresql.org 522 : 3 : LWLockAcquire(ShmemIndexLock, LW_SHARED);
523 : :
524 : 3 : hash_seq_init(&hstat, ShmemIndex);
525 : :
526 : : /* output all allocated entries */
527 : 3 : memset(nulls, 0, sizeof(nulls));
528 [ + + ]: 230 : while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
529 : : {
530 : 227 : values[0] = CStringGetTextDatum(ent->key);
531 : 227 : values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr);
532 : 227 : values[2] = Int64GetDatum(ent->size);
533 : 227 : values[3] = Int64GetDatum(ent->allocated_size);
534 : 227 : named_allocated += ent->allocated_size;
535 : :
1469 michael@paquier.xyz 536 : 227 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
537 : : values, nulls);
538 : : }
539 : :
540 : : /* output shared memory allocated but not counted via the shmem index */
2257 rhaas@postgresql.org 541 : 3 : values[0] = CStringGetTextDatum("<anonymous>");
542 : 3 : nulls[1] = true;
44 heikki.linnakangas@i 543 :GNC 3 : values[2] = Int64GetDatum(ShmemAllocator->free_offset - named_allocated);
2257 rhaas@postgresql.org 544 :CBC 3 : values[3] = values[2];
1469 michael@paquier.xyz 545 : 3 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
546 : :
547 : : /* output as-of-yet unused shared memory */
2257 rhaas@postgresql.org 548 : 3 : nulls[0] = true;
44 heikki.linnakangas@i 549 :GNC 3 : values[1] = Int64GetDatum(ShmemAllocator->free_offset);
2257 rhaas@postgresql.org 550 :CBC 3 : nulls[1] = false;
44 heikki.linnakangas@i 551 :GNC 3 : values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemAllocator->free_offset);
2257 rhaas@postgresql.org 552 :CBC 3 : values[3] = values[2];
1469 michael@paquier.xyz 553 : 3 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
554 : :
2257 rhaas@postgresql.org 555 : 3 : LWLockRelease(ShmemIndexLock);
556 : :
557 : 3 : return (Datum) 0;
558 : : }
559 : :
560 : : /*
561 : : * SQL SRF showing NUMA memory nodes for allocated shared memory
562 : : *
563 : : * Compared to pg_get_shmem_allocations(), this function does not return
564 : : * information about shared anonymous allocations and unused shared memory.
565 : : */
566 : : Datum
342 tomas.vondra@postgre 567 : 3 : pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
568 : : {
569 : : #define PG_GET_SHMEM_NUMA_SIZES_COLS 3
570 : 3 : ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
571 : : HASH_SEQ_STATUS hstat;
572 : : ShmemIndexEnt *ent;
573 : : Datum values[PG_GET_SHMEM_NUMA_SIZES_COLS];
574 : : bool nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
575 : : Size os_page_size;
576 : : void **page_ptrs;
577 : : int *pages_status;
578 : : uint64 shm_total_page_count,
579 : : shm_ent_page_count,
580 : : max_nodes;
581 : : Size *nodes;
582 : :
583 [ - + ]: 3 : if (pg_numa_init() == -1)
342 tomas.vondra@postgre 584 [ # # ]:UBC 0 : elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
585 : :
342 tomas.vondra@postgre 586 :CBC 3 : InitMaterializedSRF(fcinfo, 0);
587 : :
588 : 3 : max_nodes = pg_numa_get_max_node();
48 tomas.vondra@postgre 589 :GNC 3 : nodes = palloc_array(Size, max_nodes + 2);
590 : :
591 : : /*
592 : : * Shared memory allocations can vary in size and may not align with OS
593 : : * memory page boundaries, while NUMA queries work on pages.
594 : : *
595 : : * To correctly map each allocation to NUMA nodes, we need to: 1.
596 : : * Determine the OS memory page size. 2. Align each allocation's start/end
597 : : * addresses to page boundaries. 3. Query NUMA node information for all
598 : : * pages spanning the allocation.
599 : : */
340 tomas.vondra@postgre 600 :CBC 3 : os_page_size = pg_get_shmem_pagesize();
601 : :
602 : : /*
603 : : * Allocate memory for page pointers and status based on total shared
604 : : * memory size. This simplified approach allocates enough space for all
605 : : * pages in shared memory rather than calculating the exact requirements
606 : : * for each segment.
607 : : *
608 : : * Add 1, because we don't know how exactly the segments align to OS
609 : : * pages, so the allocation might use one more memory page. In practice
610 : : * this is not very likely, and moreover we have more entries, each of
611 : : * them using only fraction of the total pages.
612 : : */
342 613 : 3 : shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
95 michael@paquier.xyz 614 :GNC 3 : page_ptrs = palloc0_array(void *, shm_total_page_count);
615 : 3 : pages_status = palloc_array(int, shm_total_page_count);
616 : :
342 tomas.vondra@postgre 617 [ + - ]:CBC 3 : if (firstNumaTouch)
618 [ - + ]: 3 : elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
619 : :
620 : 3 : LWLockAcquire(ShmemIndexLock, LW_SHARED);
621 : :
622 : 3 : hash_seq_init(&hstat, ShmemIndex);
623 : :
624 : : /* output all allocated entries */
625 [ + + ]: 230 : while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
626 : : {
627 : : int i;
628 : : char *startptr,
629 : : *endptr;
630 : : Size total_len;
631 : :
632 : : /*
633 : : * Calculate the range of OS pages used by this segment. The segment
634 : : * may start / end half-way through a page, we want to count these
635 : : * pages too. So we align the start/end pointers down/up, and then
636 : : * calculate the number of pages from that.
637 : : */
638 : 227 : startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
639 : 227 : endptr = (char *) TYPEALIGN(os_page_size,
640 : : (char *) ent->location + ent->allocated_size);
641 : 227 : total_len = (endptr - startptr);
642 : :
643 : 227 : shm_ent_page_count = total_len / os_page_size;
644 : :
645 : : /*
646 : : * If we ever get 0xff (-1) back from kernel inquiry, then we probably
647 : : * have a bug in mapping buffers to OS pages.
648 : : */
649 : 227 : memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
650 : :
651 : : /*
652 : : * Setup page_ptrs[] with pointers to all OS pages for this segment,
653 : : * and get the NUMA status using pg_numa_query_pages.
654 : : *
655 : : * In order to get reliable results we also need to touch memory
656 : : * pages, so that inquiry about NUMA memory node doesn't return -2
657 : : * (ENOENT, which indicates unmapped/unallocated pages).
658 : : */
659 [ + + ]: 73831 : for (i = 0; i < shm_ent_page_count; i++)
660 : : {
661 : 73604 : page_ptrs[i] = startptr + (i * os_page_size);
662 : :
663 [ + - ]: 73604 : if (firstNumaTouch)
257 664 : 73604 : pg_numa_touch_mem_if_required(page_ptrs[i]);
665 : :
342 666 [ - + ]: 73604 : CHECK_FOR_INTERRUPTS();
667 : : }
668 : :
669 [ - + ]: 227 : if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
342 tomas.vondra@postgre 670 [ # # ]:UBC 0 : elog(ERROR, "failed NUMA pages inquiry status: %m");
671 : :
672 : : /* Count number of NUMA nodes used for this shared memory entry */
48 tomas.vondra@postgre 673 :CBC 227 : memset(nodes, 0, sizeof(Size) * (max_nodes + 2));
674 : :
342 675 [ + + ]: 73831 : for (i = 0; i < shm_ent_page_count; i++)
676 : : {
677 : 73604 : int s = pages_status[i];
678 : :
679 : : /* Ensure we are adding only valid index to the array */
48 680 [ + - + - ]: 73604 : if (s >= 0 && s <= max_nodes)
681 : : {
682 : : /* valid NUMA node */
683 : 73604 : nodes[s]++;
684 : 73604 : continue;
685 : : }
48 tomas.vondra@postgre 686 [ # # ]:UBC 0 : else if (s == -2)
687 : : {
688 : : /* -2 means ENOENT (e.g. page was moved to swap) */
689 : 0 : nodes[max_nodes + 1]++;
690 : 0 : continue;
691 : : }
692 : :
693 [ # # ]: 0 : elog(ERROR, "invalid NUMA node id outside of allowed range "
694 : : "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
695 : : }
696 : :
697 : : /* no NULLs for regular nodes */
48 tomas.vondra@postgre 698 :CBC 227 : memset(nulls, 0, sizeof(nulls));
699 : :
700 : : /*
701 : : * Add one entry for each NUMA node, including those without allocated
702 : : * memory for this segment.
703 : : */
342 704 [ + + ]: 454 : for (i = 0; i <= max_nodes; i++)
705 : : {
706 : 227 : values[0] = CStringGetTextDatum(ent->key);
219 peter@eisentraut.org 707 :GNC 227 : values[1] = Int32GetDatum(i);
342 tomas.vondra@postgre 708 :CBC 227 : values[2] = Int64GetDatum(nodes[i] * os_page_size);
709 : :
710 : 227 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
711 : : values, nulls);
712 : : }
713 : :
714 : : /* The last entry is used for pages without a NUMA node. */
48 715 : 227 : nulls[1] = true;
716 : 227 : values[0] = CStringGetTextDatum(ent->key);
717 : 227 : values[2] = Int64GetDatum(nodes[max_nodes + 1] * os_page_size);
718 : :
719 : 227 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
720 : : values, nulls);
721 : : }
722 : :
342 723 : 3 : LWLockRelease(ShmemIndexLock);
724 : 3 : firstNumaTouch = false;
725 : :
726 : 3 : return (Datum) 0;
727 : : }
728 : :
729 : : /*
730 : : * Determine the memory page size used for the shared memory segment.
731 : : *
732 : : * If the shared segment was allocated using huge pages, returns the size of
733 : : * a huge page. Otherwise returns the size of regular memory page.
734 : : *
735 : : * This should be used only after the server is started.
736 : : */
737 : : Size
340 738 : 7 : pg_get_shmem_pagesize(void)
739 : : {
740 : : Size os_page_size;
741 : : #ifdef WIN32
742 : : SYSTEM_INFO sysinfo;
743 : :
744 : : GetSystemInfo(&sysinfo);
745 : : os_page_size = sysinfo.dwPageSize;
746 : : #else
747 : 7 : os_page_size = sysconf(_SC_PAGESIZE);
748 : : #endif
749 : :
750 [ - + ]: 7 : Assert(IsUnderPostmaster);
751 [ - + ]: 7 : Assert(huge_pages_status != HUGE_PAGES_UNKNOWN);
752 : :
753 [ - + ]: 7 : if (huge_pages_status == HUGE_PAGES_ON)
340 tomas.vondra@postgre 754 :UBC 0 : GetHugePageSize(&os_page_size, NULL);
755 : :
340 tomas.vondra@postgre 756 :CBC 7 : return os_page_size;
757 : : }
758 : :
759 : : Datum
760 : 4 : pg_numa_available(PG_FUNCTION_ARGS)
761 : : {
762 : 4 : PG_RETURN_BOOL(pg_numa_init() != -1);
763 : : }
|