Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * shmem.c
4 : : * create shared memory and initialize shared memory data structures.
5 : : *
6 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 : : * Portions Copyright (c) 1994, Regents of the University of California
8 : : *
9 : : *
10 : : * IDENTIFICATION
11 : : * src/backend/storage/ipc/shmem.c
12 : : *
13 : : *-------------------------------------------------------------------------
14 : : */
15 : : /*
16 : : * POSTGRES processes share one or more regions of shared memory.
17 : : * The shared memory is created by a postmaster and is inherited
18 : : * by each backend via fork() (or, in some ports, via other OS-specific
19 : : * methods). The routines in this file are used for allocating and
20 : : * binding to shared memory data structures.
21 : : *
22 : : * NOTES:
23 : : * (a) There are three kinds of shared memory data structures
24 : : * available to POSTGRES: fixed-size structures, queues and hash
25 : : * tables. Fixed-size structures contain things like global variables
26 : : * for a module and should never be allocated after the shared memory
27 : : * initialization phase. Hash tables have a fixed maximum size, but
28 : : * their actual size can vary dynamically. When entries are added
29 : : * to the table, more space is allocated. Queues link data structures
30 : : * that have been allocated either within fixed-size structures or as hash
31 : : * buckets. Each shared data structure has a string name to identify
32 : : * it (assigned in the module that declares it).
33 : : *
34 : : * (b) During initialization, each module looks for its
35 : : * shared data structures in a hash table called the "Shmem Index".
36 : : * If the data structure is not present, the caller can allocate
37 : : * a new one and initialize it. If the data structure is present,
38 : : * the caller "attaches" to the structure by initializing a pointer
39 : : * in the local address space.
40 : : * The shmem index has two purposes: first, it gives us
41 : : * a simple model of how the world looks when a backend process
42 : : * initializes. If something is present in the shmem index,
43 : : * it is initialized. If it is not, it is uninitialized. Second,
44 : : * the shmem index allows us to allocate shared memory on demand
45 : : * instead of trying to preallocate structures and hard-wire the
46 : : * sizes and locations in header files. If you are using a lot
47 : : * of shared memory in a lot of different places (and changing
48 : : * things during development), this is important.
49 : : *
50 : : * (c) In standard Unix-ish environments, individual backends do not
51 : : * need to re-establish their local pointers into shared memory, because
52 : : * they inherit correct values of those variables via fork() from the
53 : : * postmaster. However, this does not work in the EXEC_BACKEND case.
54 : : * In ports using EXEC_BACKEND, new backends have to set up their local
55 : : * pointers using the method described in (b) above.
56 : : *
57 : : * (d) memory allocation model: shared memory can never be
58 : : * freed, once allocated. Each hash table has its own free list,
59 : : * so hash buckets can be reused when an item is deleted. However,
60 : : * if one hash table grows very large and then shrinks, its space
61 : : * cannot be redistributed to other tables. We could build a simple
62 : : * hash bucket garbage collector if need be. Right now, it seems
63 : : * unnecessary.
64 : : */
65 : :
66 : : #include "postgres.h"
67 : :
68 : : #include "fmgr.h"
69 : : #include "funcapi.h"
70 : : #include "miscadmin.h"
71 : : #include "port/pg_numa.h"
72 : : #include "storage/lwlock.h"
73 : : #include "storage/pg_shmem.h"
74 : : #include "storage/shmem.h"
75 : : #include "storage/spin.h"
76 : : #include "utils/builtins.h"
77 : :
78 : : static void *ShmemAllocRaw(Size size, Size *allocated_size);
79 : :
80 : : /* shared memory global variables */
81 : :
82 : : static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */
83 : :
84 : : static void *ShmemBase; /* start address of shared memory */
85 : :
86 : : static void *ShmemEnd; /* end+1 address of shared memory */
87 : :
88 : : slock_t *ShmemLock; /* spinlock for shared memory and LWLock
89 : : * allocation */
90 : :
91 : : static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
92 : :
93 : : /* To get reliable results for NUMA inquiry we need to "touch pages" once */
94 : : static bool firstNumaTouch = true;
95 : :
96 : : Datum pg_numa_available(PG_FUNCTION_ARGS);
97 : :
98 : : /*
99 : : * InitShmemAccess() --- set up basic pointers to shared memory.
100 : : */
101 : : void
284 peter@eisentraut.org 102 :CBC 1029 : InitShmemAccess(PGShmemHeader *seghdr)
103 : : {
104 : 1029 : ShmemSegHdr = seghdr;
105 : 1029 : ShmemBase = seghdr;
106 : 1029 : ShmemEnd = (char *) ShmemBase + seghdr->totalsize;
7185 tgl@sss.pgh.pa.us 107 : 1029 : }
108 : :
109 : : /*
110 : : * InitShmemAllocation() --- set up shared-memory space allocation.
111 : : *
112 : : * This should be called only in the postmaster or a standalone backend.
113 : : */
114 : : void
115 : 1029 : InitShmemAllocation(void)
116 : : {
117 : 1029 : PGShmemHeader *shmhdr = ShmemSegHdr;
118 : : char *aligned;
119 : :
120 [ - + ]: 1029 : Assert(shmhdr != NULL);
121 : :
122 : : /*
123 : : * Initialize the spinlock used by ShmemAlloc. We must use
124 : : * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet.
125 : : */
3190 126 : 1029 : ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t));
127 : :
128 : 1029 : SpinLockInit(ShmemLock);
129 : :
130 : : /*
131 : : * Allocations after this point should go through ShmemAlloc, which
132 : : * expects to allocate everything on cache line boundaries. Make sure the
133 : : * first allocation begins on a cache line boundary.
134 : : */
3425 rhaas@postgresql.org 135 : 1029 : aligned = (char *)
136 : 1029 : (CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset)));
137 : 1029 : shmhdr->freeoffset = aligned - (char *) shmhdr;
138 : :
139 : : /* ShmemIndex can't be set up yet (need LWLocks first) */
6152 tgl@sss.pgh.pa.us 140 : 1029 : shmhdr->index = NULL;
7185 141 : 1029 : ShmemIndex = (HTAB *) NULL;
10651 scrappy@hub.org 142 : 1029 : }
143 : :
144 : : /*
145 : : * ShmemAlloc -- allocate max-aligned chunk from shared memory
146 : : *
147 : : * Throws error if request cannot be satisfied.
148 : : *
149 : : * Assumes ShmemLock and ShmemSegHdr are initialized.
150 : : */
151 : : void *
6901 tgl@sss.pgh.pa.us 152 : 2058 : ShmemAlloc(Size size)
153 : : {
154 : : void *newSpace;
155 : : Size allocated_size;
156 : :
2067 rhaas@postgresql.org 157 : 2058 : newSpace = ShmemAllocRaw(size, &allocated_size);
3292 tgl@sss.pgh.pa.us 158 [ - + ]: 2058 : if (!newSpace)
3292 tgl@sss.pgh.pa.us 159 [ # # ]:UBC 0 : ereport(ERROR,
160 : : (errcode(ERRCODE_OUT_OF_MEMORY),
161 : : errmsg("out of shared memory (%zu bytes requested)",
162 : : size)));
3292 tgl@sss.pgh.pa.us 163 :CBC 2058 : return newSpace;
164 : : }
165 : :
166 : : /*
167 : : * ShmemAllocNoError -- allocate max-aligned chunk from shared memory
168 : : *
169 : : * As ShmemAlloc, but returns NULL if out of space, rather than erroring.
170 : : */
171 : : void *
172 : 418054 : ShmemAllocNoError(Size size)
173 : : {
174 : : Size allocated_size;
175 : :
2067 rhaas@postgresql.org 176 : 418054 : return ShmemAllocRaw(size, &allocated_size);
177 : : }
178 : :
179 : : /*
180 : : * ShmemAllocRaw -- allocate align chunk and return allocated size
181 : : *
182 : : * Also sets *allocated_size to the number of bytes allocated, which will
183 : : * be equal to the number requested plus any padding we choose to add.
184 : : */
185 : : static void *
186 : 495220 : ShmemAllocRaw(Size size, Size *allocated_size)
187 : : {
188 : : Size newStart;
189 : : Size newFree;
190 : : void *newSpace;
191 : :
192 : : /*
193 : : * Ensure all space is adequately aligned. We used to only MAXALIGN this
194 : : * space but experience has proved that on modern systems that is not good
195 : : * enough. Many parts of the system are very sensitive to critical data
196 : : * structures getting split across cache line boundaries. To avoid that,
197 : : * attempt to align the beginning of the allocation to a cache line
198 : : * boundary. The calling code will still need to be careful about how it
199 : : * uses the allocated space - e.g. by padding each element in an array of
200 : : * structures out to a power-of-two size - but without this, even that
201 : : * won't be sufficient.
202 : : */
3441 203 : 495220 : size = CACHELINEALIGN(size);
2067 204 : 495220 : *allocated_size = size;
205 : :
3613 206 [ - + ]: 495220 : Assert(ShmemSegHdr != NULL);
207 : :
8743 tgl@sss.pgh.pa.us 208 [ - + ]: 495220 : SpinLockAcquire(ShmemLock);
209 : :
3613 rhaas@postgresql.org 210 : 495220 : newStart = ShmemSegHdr->freeoffset;
211 : :
8021 tgl@sss.pgh.pa.us 212 : 495220 : newFree = newStart + size;
3613 rhaas@postgresql.org 213 [ + - ]: 495220 : if (newFree <= ShmemSegHdr->totalsize)
214 : : {
282 peter@eisentraut.org 215 : 495220 : newSpace = (char *) ShmemBase + newStart;
3613 rhaas@postgresql.org 216 : 495220 : ShmemSegHdr->freeoffset = newFree;
217 : : }
218 : : else
10226 bruce@momjian.us 219 :UBC 0 : newSpace = NULL;
220 : :
8743 tgl@sss.pgh.pa.us 221 :CBC 495220 : SpinLockRelease(ShmemLock);
222 : :
223 : : /* note this assert is okay with newSpace == NULL */
3425 rhaas@postgresql.org 224 [ - + ]: 495220 : Assert(newSpace == (void *) CACHELINEALIGN(newSpace));
225 : :
9867 bruce@momjian.us 226 : 495220 : return newSpace;
227 : : }
228 : :
229 : : /*
230 : : * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory
231 : : *
232 : : * Allocate space without locking ShmemLock. This should be used for,
233 : : * and only for, allocations that must happen before ShmemLock is ready.
234 : : *
235 : : * We consider maxalign, rather than cachealign, sufficient here.
236 : : */
237 : : void *
3190 tgl@sss.pgh.pa.us 238 : 2058 : ShmemAllocUnlocked(Size size)
239 : : {
240 : : Size newStart;
241 : : Size newFree;
242 : : void *newSpace;
243 : :
244 : : /*
245 : : * Ensure allocated space is adequately aligned.
246 : : */
247 : 2058 : size = MAXALIGN(size);
248 : :
249 [ - + ]: 2058 : Assert(ShmemSegHdr != NULL);
250 : :
251 : 2058 : newStart = ShmemSegHdr->freeoffset;
252 : :
253 : 2058 : newFree = newStart + size;
254 [ - + ]: 2058 : if (newFree > ShmemSegHdr->totalsize)
3190 tgl@sss.pgh.pa.us 255 [ # # ]:UBC 0 : ereport(ERROR,
256 : : (errcode(ERRCODE_OUT_OF_MEMORY),
257 : : errmsg("out of shared memory (%zu bytes requested)",
258 : : size)));
3190 tgl@sss.pgh.pa.us 259 :CBC 2058 : ShmemSegHdr->freeoffset = newFree;
260 : :
282 peter@eisentraut.org 261 : 2058 : newSpace = (char *) ShmemBase + newStart;
262 : :
3190 tgl@sss.pgh.pa.us 263 [ - + ]: 2058 : Assert(newSpace == (void *) MAXALIGN(newSpace));
264 : :
265 : 2058 : return newSpace;
266 : : }
267 : :
268 : : /*
269 : : * ShmemAddrIsValid -- test if an address refers to shared memory
270 : : *
271 : : * Returns true if the pointer points within the shared memory segment.
272 : : */
273 : : bool
5325 heikki.linnakangas@i 274 : 76749 : ShmemAddrIsValid(const void *addr)
275 : : {
6152 tgl@sss.pgh.pa.us 276 [ + - + - ]: 76749 : return (addr >= ShmemBase) && (addr < ShmemEnd);
277 : : }
278 : :
279 : : /*
280 : : * InitShmemIndex() --- set up or attach to shmem index table.
281 : : */
282 : : void
8743 283 : 1029 : InitShmemIndex(void)
284 : : {
285 : : HASHCTL info;
286 : :
287 : : /*
288 : : * Create the shared memory shmem index.
289 : : *
290 : : * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex
291 : : * hashtable to exist already, we have a bit of a circularity problem in
292 : : * initializing the ShmemIndex itself. The special "ShmemIndex" hash
293 : : * table name will tell ShmemInitStruct to fake it.
294 : : */
295 : 1029 : info.keysize = SHMEM_INDEX_KEYSIZE;
8741 296 : 1029 : info.entrysize = sizeof(ShmemIndexEnt);
297 : :
8743 298 : 1029 : ShmemIndex = ShmemInitHash("ShmemIndex",
299 : : SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
300 : : &info,
301 : : HASH_ELEM | HASH_STRINGS);
302 : 1029 : }
303 : :
304 : : /*
305 : : * ShmemInitHash -- Create and initialize, or attach to, a
306 : : * shared memory hash table.
307 : : *
308 : : * We assume caller is doing some kind of synchronization
309 : : * so that two processes don't try to create/initialize the same
310 : : * table at once. (In practice, all creations are done in the postmaster
311 : : * process; child processes should always be attaching to existing tables.)
312 : : *
313 : : * max_size is the estimated maximum number of hashtable entries. This is
314 : : * not a hard limit, but the access efficiency will degrade if it is
315 : : * exceeded substantially (since it's used to compute directory size and
316 : : * the hash table buckets will get overfull).
317 : : *
318 : : * init_size is the number of hashtable entries to preallocate. For a table
319 : : * whose maximum size is certain, this should be equal to max_size; that
320 : : * ensures that no run-time out-of-shared-memory failures can occur.
321 : : *
322 : : * *infoP and hash_flags must specify at least the entry sizes and key
323 : : * comparison semantics (see hash_create()). Flag bits and values specific
324 : : * to shared-memory hash tables are added here, except that callers may
325 : : * choose to specify HASH_PARTITION and/or HASH_FIXED_SIZE.
326 : : *
327 : : * Note: before Postgres 9.0, this function returned NULL for some failure
328 : : * cases. Now, it always throws error instead, so callers need not check
329 : : * for NULL.
330 : : */
331 : : HTAB *
2999 332 : 9268 : ShmemInitHash(const char *name, /* table string name for shmem index */
333 : : int64 init_size, /* initial table size */
334 : : int64 max_size, /* max size of the table */
335 : : HASHCTL *infoP, /* info about key and bucket size */
336 : : int hash_flags) /* info about infoP */
337 : : {
338 : : bool found;
339 : : void *location;
340 : :
341 : : /*
342 : : * Hash tables allocated in shared memory have a fixed directory; it can't
343 : : * grow or other backends wouldn't be able to find it. So, make sure we
344 : : * make it big enough to start with.
345 : : *
346 : : * The shared memory allocator must be specified too.
347 : : */
9324 348 : 9268 : infoP->dsize = infoP->max_dsize = hash_select_dirsize(max_size);
3292 349 : 9268 : infoP->alloc = ShmemAllocNoError;
7921 350 : 9268 : hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE;
351 : :
352 : : /* look it up in the shmem index */
9693 353 : 9268 : location = ShmemInitStruct(name,
354 : : hash_get_shared_size(infoP, hash_flags),
355 : : &found);
356 : :
357 : : /*
358 : : * if it already exists, attach to it rather than allocate and initialize
359 : : * new space
360 : : */
10226 bruce@momjian.us 361 [ - + ]: 9268 : if (found)
10226 bruce@momjian.us 362 :UBC 0 : hash_flags |= HASH_ATTACH;
363 : :
364 : : /* Pass location of hashtable header to hash_create */
8741 tgl@sss.pgh.pa.us 365 :CBC 9268 : infoP->hctl = (HASHHDR *) location;
366 : :
8737 367 : 9268 : return hash_create(name, init_size, infoP, hash_flags);
368 : : }
369 : :
370 : : /*
371 : : * ShmemInitStruct -- Create/attach to a structure in shared memory.
372 : : *
373 : : * This is called during initialization to find or allocate
374 : : * a data structure in shared memory. If no other process
375 : : * has created the structure, this routine allocates space
376 : : * for it. If it exists already, a pointer to the existing
377 : : * structure is returned.
378 : : *
379 : : * Returns: pointer to the object. *foundPtr is set true if the object was
380 : : * already in the shmem index (hence, already initialized).
381 : : *
382 : : * Note: before Postgres 9.0, this function returned NULL for some failure
383 : : * cases. Now, it always throws error instead, so callers need not check
384 : : * for NULL.
385 : : */
386 : : void *
387 : 76137 : ShmemInitStruct(const char *name, Size size, bool *foundPtr)
388 : : {
389 : : ShmemIndexEnt *result;
390 : : void *structPtr;
391 : :
7185 392 : 76137 : LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);
393 : :
9933 bruce@momjian.us 394 [ + + ]: 76137 : if (!ShmemIndex)
395 : : {
7185 tgl@sss.pgh.pa.us 396 : 1029 : PGShmemHeader *shmemseghdr = ShmemSegHdr;
397 : :
398 : : /* Must be trying to create/attach to ShmemIndex itself */
7460 neilc@samurai.com 399 [ - + ]: 1029 : Assert(strcmp(name, "ShmemIndex") == 0);
400 : :
7931 bruce@momjian.us 401 [ - + ]: 1029 : if (IsUnderPostmaster)
402 : : {
403 : : /* Must be initializing a (non-standalone) backend */
6152 tgl@sss.pgh.pa.us 404 [ # # ]:UBC 0 : Assert(shmemseghdr->index != NULL);
405 : 0 : structPtr = shmemseghdr->index;
2943 peter_e@gmx.net 406 : 0 : *foundPtr = true;
407 : : }
408 : : else
409 : : {
410 : : /*
411 : : * If the shmem index doesn't exist, we are bootstrapping: we must
412 : : * be trying to init the shmem index itself.
413 : : *
414 : : * Notice that the ShmemIndexLock is released before the shmem
415 : : * index has been initialized. This should be OK because no other
416 : : * process can be accessing shared memory yet.
417 : : */
6152 tgl@sss.pgh.pa.us 418 [ - + ]:CBC 1029 : Assert(shmemseghdr->index == NULL);
7185 419 : 1029 : structPtr = ShmemAlloc(size);
6152 420 : 1029 : shmemseghdr->index = structPtr;
2943 peter_e@gmx.net 421 : 1029 : *foundPtr = false;
422 : : }
6986 tgl@sss.pgh.pa.us 423 : 1029 : LWLockRelease(ShmemIndexLock);
7185 424 : 1029 : return structPtr;
425 : : }
426 : :
427 : : /* look it up in the shmem index */
428 : : result = (ShmemIndexEnt *)
6919 429 : 75108 : hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr);
430 : :
10226 bruce@momjian.us 431 [ - + ]: 75108 : if (!result)
432 : : {
7185 tgl@sss.pgh.pa.us 433 :UBC 0 : LWLockRelease(ShmemIndexLock);
8080 434 [ # # ]: 0 : ereport(ERROR,
435 : : (errcode(ERRCODE_OUT_OF_MEMORY),
436 : : errmsg("could not create ShmemIndex entry for data structure \"%s\"",
437 : : name)));
438 : : }
439 : :
9048 tgl@sss.pgh.pa.us 440 [ - + ]:CBC 75108 : if (*foundPtr)
441 : : {
442 : : /*
443 : : * Structure is in the shmem index so someone else has allocated it
444 : : * already. The size better be the same as the size we are trying to
445 : : * initialize to, or there is a name conflict (or worse).
446 : : */
10226 bruce@momjian.us 447 [ # # ]:UBC 0 : if (result->size != size)
448 : : {
7185 tgl@sss.pgh.pa.us 449 : 0 : LWLockRelease(ShmemIndexLock);
5610 450 [ # # ]: 0 : ereport(ERROR,
451 : : (errmsg("ShmemIndex entry size is wrong for data structure"
452 : : " \"%s\": expected %zu, actual %zu",
453 : : name, size, result->size)));
454 : : }
6152 455 : 0 : structPtr = result->location;
456 : : }
457 : : else
458 : : {
459 : : Size allocated_size;
460 : :
461 : : /* It isn't in the table yet. allocate and initialize it */
2067 rhaas@postgresql.org 462 :CBC 75108 : structPtr = ShmemAllocRaw(size, &allocated_size);
5610 tgl@sss.pgh.pa.us 463 [ - + ]: 75108 : if (structPtr == NULL)
464 : : {
465 : : /* out of memory; remove the failed ShmemIndex entry */
6919 tgl@sss.pgh.pa.us 466 :UBC 0 : hash_search(ShmemIndex, name, HASH_REMOVE, NULL);
7185 467 : 0 : LWLockRelease(ShmemIndexLock);
5610 468 [ # # ]: 0 : ereport(ERROR,
469 : : (errcode(ERRCODE_OUT_OF_MEMORY),
470 : : errmsg("not enough shared memory for data structure"
471 : : " \"%s\" (%zu bytes requested)",
472 : : name, size)));
473 : : }
10226 bruce@momjian.us 474 :CBC 75108 : result->size = size;
2067 rhaas@postgresql.org 475 : 75108 : result->allocated_size = allocated_size;
6152 tgl@sss.pgh.pa.us 476 : 75108 : result->location = structPtr;
477 : : }
478 : :
7185 479 : 75108 : LWLockRelease(ShmemIndexLock);
480 : :
5610 481 [ - + ]: 75108 : Assert(ShmemAddrIsValid(structPtr));
482 : :
3425 rhaas@postgresql.org 483 [ - + ]: 75108 : Assert(structPtr == (void *) CACHELINEALIGN(structPtr));
484 : :
9867 bruce@momjian.us 485 : 75108 : return structPtr;
486 : : }
487 : :
488 : :
489 : : /*
490 : : * Add two Size values, checking for overflow
491 : : */
492 : : Size
7322 tgl@sss.pgh.pa.us 493 : 503743 : add_size(Size s1, Size s2)
494 : : {
495 : : Size result;
496 : :
497 : 503743 : result = s1 + s2;
498 : : /* We are assuming Size is an unsigned type here... */
499 [ + - - + ]: 503743 : if (result < s1 || result < s2)
7322 tgl@sss.pgh.pa.us 500 [ # # ]:UBC 0 : ereport(ERROR,
501 : : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
502 : : errmsg("requested shared memory size overflows size_t")));
7322 tgl@sss.pgh.pa.us 503 :CBC 503743 : return result;
504 : : }
505 : :
506 : : /*
507 : : * Multiply two Size values, checking for overflow
508 : : */
509 : : Size
510 : 237449 : mul_size(Size s1, Size s2)
511 : : {
512 : : Size result;
513 : :
514 [ + + + + ]: 237449 : if (s1 == 0 || s2 == 0)
515 : 7422 : return 0;
516 : 230027 : result = s1 * s2;
517 : : /* We are assuming Size is an unsigned type here... */
518 [ - + ]: 230027 : if (result / s2 != s1)
7322 tgl@sss.pgh.pa.us 519 [ # # ]:UBC 0 : ereport(ERROR,
520 : : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
521 : : errmsg("requested shared memory size overflows size_t")));
7322 tgl@sss.pgh.pa.us 522 :CBC 230027 : return result;
523 : : }
524 : :
525 : : /* SQL SRF showing allocated shared memory */
526 : : Datum
2067 rhaas@postgresql.org 527 : 3 : pg_get_shmem_allocations(PG_FUNCTION_ARGS)
528 : : {
529 : : #define PG_GET_SHMEM_SIZES_COLS 4
530 : 3 : ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
531 : : HASH_SEQ_STATUS hstat;
532 : : ShmemIndexEnt *ent;
1941 tgl@sss.pgh.pa.us 533 : 3 : Size named_allocated = 0;
534 : : Datum values[PG_GET_SHMEM_SIZES_COLS];
535 : : bool nulls[PG_GET_SHMEM_SIZES_COLS];
536 : :
1054 michael@paquier.xyz 537 : 3 : InitMaterializedSRF(fcinfo, 0);
538 : :
2067 rhaas@postgresql.org 539 : 3 : LWLockAcquire(ShmemIndexLock, LW_SHARED);
540 : :
541 : 3 : hash_seq_init(&hstat, ShmemIndex);
542 : :
543 : : /* output all allocated entries */
544 : 3 : memset(nulls, 0, sizeof(nulls));
545 [ + + ]: 224 : while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
546 : : {
547 : 221 : values[0] = CStringGetTextDatum(ent->key);
548 : 221 : values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr);
549 : 221 : values[2] = Int64GetDatum(ent->size);
550 : 221 : values[3] = Int64GetDatum(ent->allocated_size);
551 : 221 : named_allocated += ent->allocated_size;
552 : :
1279 michael@paquier.xyz 553 : 221 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
554 : : values, nulls);
555 : : }
556 : :
557 : : /* output shared memory allocated but not counted via the shmem index */
2067 rhaas@postgresql.org 558 : 3 : values[0] = CStringGetTextDatum("<anonymous>");
559 : 3 : nulls[1] = true;
560 : 3 : values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated);
561 : 3 : values[3] = values[2];
1279 michael@paquier.xyz 562 : 3 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
563 : :
564 : : /* output as-of-yet unused shared memory */
2067 rhaas@postgresql.org 565 : 3 : nulls[0] = true;
566 : 3 : values[1] = Int64GetDatum(ShmemSegHdr->freeoffset);
567 : 3 : nulls[1] = false;
568 : 3 : values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset);
569 : 3 : values[3] = values[2];
1279 michael@paquier.xyz 570 : 3 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
571 : :
2067 rhaas@postgresql.org 572 : 3 : LWLockRelease(ShmemIndexLock);
573 : :
574 : 3 : return (Datum) 0;
575 : : }
576 : :
577 : : /*
578 : : * SQL SRF showing NUMA memory nodes for allocated shared memory
579 : : *
580 : : * Compared to pg_get_shmem_allocations(), this function does not return
581 : : * information about shared anonymous allocations and unused shared memory.
582 : : */
583 : : Datum
152 tomas.vondra@postgre 584 : 3 : pg_get_shmem_allocations_numa(PG_FUNCTION_ARGS)
585 : : {
586 : : #define PG_GET_SHMEM_NUMA_SIZES_COLS 3
587 : 3 : ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
588 : : HASH_SEQ_STATUS hstat;
589 : : ShmemIndexEnt *ent;
590 : : Datum values[PG_GET_SHMEM_NUMA_SIZES_COLS];
591 : : bool nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
592 : : Size os_page_size;
593 : : void **page_ptrs;
594 : : int *pages_status;
595 : : uint64 shm_total_page_count,
596 : : shm_ent_page_count,
597 : : max_nodes;
598 : : Size *nodes;
599 : :
600 [ - + ]: 3 : if (pg_numa_init() == -1)
152 tomas.vondra@postgre 601 [ # # ]:UBC 0 : elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
602 : :
152 tomas.vondra@postgre 603 :CBC 3 : InitMaterializedSRF(fcinfo, 0);
604 : :
605 : 3 : max_nodes = pg_numa_get_max_node();
606 : 3 : nodes = palloc(sizeof(Size) * (max_nodes + 1));
607 : :
608 : : /*
609 : : * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, while
610 : : * the OS may have different memory page sizes.
611 : : *
612 : : * To correctly map between them, we need to: 1. Determine the OS memory
613 : : * page size 2. Calculate how many OS pages are used by all buffer blocks
614 : : * 3. Calculate how many OS pages are contained within each database
615 : : * block.
616 : : *
617 : : * This information is needed before calling move_pages() for NUMA memory
618 : : * node inquiry.
619 : : */
150 620 : 3 : os_page_size = pg_get_shmem_pagesize();
621 : :
622 : : /*
623 : : * Allocate memory for page pointers and status based on total shared
624 : : * memory size. This simplified approach allocates enough space for all
625 : : * pages in shared memory rather than calculating the exact requirements
626 : : * for each segment.
627 : : *
628 : : * Add 1, because we don't know how exactly the segments align to OS
629 : : * pages, so the allocation might use one more memory page. In practice
630 : : * this is not very likely, and moreover we have more entries, each of
631 : : * them using only fraction of the total pages.
632 : : */
152 633 : 3 : shm_total_page_count = (ShmemSegHdr->totalsize / os_page_size) + 1;
634 : 3 : page_ptrs = palloc0(sizeof(void *) * shm_total_page_count);
635 : 3 : pages_status = palloc(sizeof(int) * shm_total_page_count);
636 : :
637 [ + - ]: 3 : if (firstNumaTouch)
638 [ - + ]: 3 : elog(DEBUG1, "NUMA: page-faulting shared memory segments for proper NUMA readouts");
639 : :
640 : 3 : LWLockAcquire(ShmemIndexLock, LW_SHARED);
641 : :
642 : 3 : hash_seq_init(&hstat, ShmemIndex);
643 : :
644 : : /* output all allocated entries */
645 : 3 : memset(nulls, 0, sizeof(nulls));
646 [ + + ]: 224 : while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
647 : : {
648 : : int i;
649 : : char *startptr,
650 : : *endptr;
651 : : Size total_len;
652 : :
653 : : /*
654 : : * Calculate the range of OS pages used by this segment. The segment
655 : : * may start / end half-way through a page, we want to count these
656 : : * pages too. So we align the start/end pointers down/up, and then
657 : : * calculate the number of pages from that.
658 : : */
659 : 221 : startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
660 : 221 : endptr = (char *) TYPEALIGN(os_page_size,
661 : : (char *) ent->location + ent->allocated_size);
662 : 221 : total_len = (endptr - startptr);
663 : :
664 : 221 : shm_ent_page_count = total_len / os_page_size;
665 : :
666 : : /*
667 : : * If we ever get 0xff (-1) back from kernel inquiry, then we probably
668 : : * have a bug in mapping buffers to OS pages.
669 : : */
670 : 221 : memset(pages_status, 0xff, sizeof(int) * shm_ent_page_count);
671 : :
672 : : /*
673 : : * Setup page_ptrs[] with pointers to all OS pages for this segment,
674 : : * and get the NUMA status using pg_numa_query_pages.
675 : : *
676 : : * In order to get reliable results we also need to touch memory
677 : : * pages, so that inquiry about NUMA memory node doesn't return -2
678 : : * (ENOENT, which indicates unmapped/unallocated pages).
679 : : */
680 [ + + ]: 73686 : for (i = 0; i < shm_ent_page_count; i++)
681 : : {
682 : 73465 : page_ptrs[i] = startptr + (i * os_page_size);
683 : :
684 [ + - ]: 73465 : if (firstNumaTouch)
67 685 : 73465 : pg_numa_touch_mem_if_required(page_ptrs[i]);
686 : :
152 687 [ - + ]: 73465 : CHECK_FOR_INTERRUPTS();
688 : : }
689 : :
690 [ - + ]: 221 : if (pg_numa_query_pages(0, shm_ent_page_count, page_ptrs, pages_status) == -1)
152 tomas.vondra@postgre 691 [ # # ]:UBC 0 : elog(ERROR, "failed NUMA pages inquiry status: %m");
692 : :
693 : : /* Count number of NUMA nodes used for this shared memory entry */
152 tomas.vondra@postgre 694 :CBC 221 : memset(nodes, 0, sizeof(Size) * (max_nodes + 1));
695 : :
696 [ + + ]: 73686 : for (i = 0; i < shm_ent_page_count; i++)
697 : : {
698 : 73465 : int s = pages_status[i];
699 : :
700 : : /* Ensure we are adding only valid index to the array */
701 [ + - - + ]: 73465 : if (s < 0 || s > max_nodes)
702 : : {
152 tomas.vondra@postgre 703 [ # # ]:UBC 0 : elog(ERROR, "invalid NUMA node id outside of allowed range "
704 : : "[0, " UINT64_FORMAT "]: %d", max_nodes, s);
705 : : }
706 : :
152 tomas.vondra@postgre 707 :CBC 73465 : nodes[s]++;
708 : : }
709 : :
710 : : /*
711 : : * Add one entry for each NUMA node, including those without allocated
712 : : * memory for this segment.
713 : : */
714 [ + + ]: 442 : for (i = 0; i <= max_nodes; i++)
715 : : {
716 : 221 : values[0] = CStringGetTextDatum(ent->key);
29 peter@eisentraut.org 717 :GNC 221 : values[1] = Int32GetDatum(i);
152 tomas.vondra@postgre 718 :CBC 221 : values[2] = Int64GetDatum(nodes[i] * os_page_size);
719 : :
720 : 221 : tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
721 : : values, nulls);
722 : : }
723 : : }
724 : :
725 : 3 : LWLockRelease(ShmemIndexLock);
726 : 3 : firstNumaTouch = false;
727 : :
728 : 3 : return (Datum) 0;
729 : : }
730 : :
731 : : /*
732 : : * Determine the memory page size used for the shared memory segment.
733 : : *
734 : : * If the shared segment was allocated using huge pages, returns the size of
735 : : * a huge page. Otherwise returns the size of regular memory page.
736 : : *
737 : : * This should be used only after the server is started.
738 : : */
739 : : Size
150 740 : 5 : pg_get_shmem_pagesize(void)
741 : : {
742 : : Size os_page_size;
743 : : #ifdef WIN32
744 : : SYSTEM_INFO sysinfo;
745 : :
746 : : GetSystemInfo(&sysinfo);
747 : : os_page_size = sysinfo.dwPageSize;
748 : : #else
749 : 5 : os_page_size = sysconf(_SC_PAGESIZE);
750 : : #endif
751 : :
752 [ - + ]: 5 : Assert(IsUnderPostmaster);
753 [ - + ]: 5 : Assert(huge_pages_status != HUGE_PAGES_UNKNOWN);
754 : :
755 [ - + ]: 5 : if (huge_pages_status == HUGE_PAGES_ON)
150 tomas.vondra@postgre 756 :UBC 0 : GetHugePageSize(&os_page_size, NULL);
757 : :
150 tomas.vondra@postgre 758 :CBC 5 : return os_page_size;
759 : : }
760 : :
761 : : Datum
762 : 4 : pg_numa_available(PG_FUNCTION_ARGS)
763 : : {
764 : 4 : PG_RETURN_BOOL(pg_numa_init() != -1);
765 : : }
|