Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * dynahash.c
4 : : * dynamic chained hash tables
5 : : *
6 : : * dynahash.c supports both local-to-a-backend hash tables and hash tables in
7 : : * shared memory. For shared hash tables, it is the caller's responsibility
8 : : * to provide appropriate access interlocking. The simplest convention is
9 : : * that a single LWLock protects the whole hash table. Searches (HASH_FIND or
10 : : * hash_seq_search) need only shared lock, but any update requires exclusive
11 : : * lock. For heavily-used shared tables, the single-lock approach creates a
12 : : * concurrency bottleneck, so we also support "partitioned" locking wherein
13 : : * there are multiple LWLocks guarding distinct subsets of the table. To use
14 : : * a hash table in partitioned mode, the HASH_PARTITION flag must be given
15 : : * to hash_create. This prevents any attempt to split buckets on-the-fly.
16 : : * Therefore, each hash bucket chain operates independently, and no fields
17 : : * of the hash header change after init except nentries and freeList.
18 : : * (A partitioned table uses multiple copies of those fields, guarded by
19 : : * spinlocks, for additional concurrency.)
20 : : * This lets any subset of the hash buckets be treated as a separately
21 : : * lockable partition. We expect callers to use the low-order bits of a
22 : : * lookup key's hash value as a partition number --- this will work because
23 : : * of the way calc_bucket() maps hash values to bucket numbers.
24 : : *
25 : : * The memory allocator function should match malloc's semantics of returning
26 : : * NULL on failure. (This is essential for hash tables in shared memory.
27 : : * For hash tables in local memory, we used to use palloc() which will throw
28 : : * error on failure; but we no longer do, so it's untested whether this
29 : : * module will still cope with that behavior.)
30 : : *
31 : : * dynahash.c provides support for these types of lookup keys:
32 : : *
33 : : * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
34 : : * compared as though by strcmp(). This is selected by specifying the
35 : : * HASH_STRINGS flag to hash_create.
36 : : *
37 : : * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
38 : : * (Caller must ensure there are no undefined padding bits in the keys!)
39 : : * This is selected by specifying the HASH_BLOBS flag to hash_create.
40 : : *
41 : : * 3. More complex key behavior can be selected by specifying user-supplied
42 : : * hashing, comparison, and/or key-copying functions. At least a hashing
43 : : * function must be supplied; comparison defaults to memcmp() and key copying
44 : : * to memcpy() when a user-defined hashing function is selected.
45 : : *
46 : : * Compared to simplehash, dynahash has the following benefits:
47 : : *
48 : : * - It supports partitioning, which is useful for shared memory access using
49 : : * locks.
50 : : * - Shared memory hashes are allocated in a fixed size area at startup and
51 : : * are discoverable by name from other processes.
52 : : * - Because entries don't need to be moved in the case of hash conflicts,
53 : : * dynahash has better performance for large entries.
54 : : * - Guarantees stable pointers to entries.
55 : : *
56 : : * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
57 : : * Portions Copyright (c) 1994, Regents of the University of California
58 : : *
59 : : *
60 : : * IDENTIFICATION
61 : : * src/backend/utils/hash/dynahash.c
62 : : *
63 : : *-------------------------------------------------------------------------
64 : : */
65 : :
66 : : /*
67 : : * Original comments:
68 : : *
69 : : * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
70 : : * Coded into C, with minor code improvements, and with hsearch(3) interface,
71 : : * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
72 : : * also, hcreate/hdestroy routines added to simulate hsearch(3).
73 : : *
74 : : * These routines simulate hsearch(3) and family, with the important
75 : : * difference that the hash table is dynamic - can grow indefinitely
76 : : * beyond its original size (as supplied to hcreate()).
77 : : *
78 : : * Performance appears to be comparable to that of hsearch(3).
79 : : * The 'source-code' options referred to in hsearch(3)'s 'man' page
80 : : * are not implemented; otherwise functionality is identical.
81 : : *
82 : : * Compilation controls:
83 : : * HASH_STATISTICS causes some usage statistics to be maintained, which can be
84 : : * logged by calling hash_stats().
85 : : *
86 : : * Problems & fixes to ejp@ausmelb.oz. WARNING: relies on pre-processor
87 : : * concatenation property, in probably unnecessary code 'optimization'.
88 : : *
89 : : * Modified margo@postgres.berkeley.edu February 1990
90 : : * added multiple table interface
91 : : * Modified by sullivan@postgres.berkeley.edu April 1990
92 : : * changed ctl structure for shared memory
93 : : */
94 : :
95 : : #include "postgres.h"
96 : :
97 : : #include <limits.h>
98 : :
99 : : #include "access/xact.h"
100 : : #include "common/hashfn.h"
101 : : #include "lib/ilist.h"
102 : : #include "port/pg_bitutils.h"
103 : : #include "storage/shmem.h"
104 : : #include "storage/spin.h"
105 : : #include "utils/memutils.h"
106 : :
107 : :
108 : : /*
109 : : * Constants
110 : : *
111 : : * A hash table has a top-level "directory", each of whose entries points to a
112 : : * "segment" of HASH_SEGSIZE bucket headers. The maximum number of hash
113 : : * buckets is thus dsize * HASH_SEGSIZE (but dsize may be expansible). Of
114 : : * course, the number of records in the table can be larger, but we don't want
115 : : * a whole lot of records per bucket or performance goes down.
116 : : *
117 : : * In a hash table allocated in shared memory, the directory cannot be
118 : : * expanded because it must stay at a fixed address. The directory size is
119 : : * chosen at creation based on the initial number of elements, so even though
120 : : * we support allocating more elements later, performance will suffer if the
121 : : * table grows much beyond the initial size. (Currently, shared memory hash
122 : : * tables are only created by ShmemRequestHash()/ShmemInitHash() though, which
123 : : * don't support growing at all.)
124 : : */
125 : : #define HASH_SEGSIZE 256 /* number of bucket headers per segment */
126 : : #define HASH_SEGSIZE_SHIFT 8 /* must be log2(HASH_SEGSIZE) */
127 : : #define DEF_DIRSIZE 256 /* lower bound applied to the directory size in init_htab() */
128 : :
129 : : /* Number of freelists to be used for a partitioned hash table. */
130 : : #define NUM_FREELISTS 32
131 : :
132 : : /* A hash bucket is a linked list of HASHELEMENTs; the bucket header is the pointer to the first one */
133 : : typedef HASHELEMENT *HASHBUCKET;
134 : :
135 : : /* A hash segment is an array of bucket headers (HASH_SEGSIZE of them) */
136 : : typedef HASHBUCKET *HASHSEGMENT;
137 : :
138 : : /*
139 : : * Per-freelist data.
140 : : *
141 : : * In a partitioned hash table, each freelist is associated with a specific
142 : : * set of hashcodes, as determined by the FREELIST_IDX() macro below.
143 : : * nentries tracks the number of live hashtable entries having those hashcodes
144 : : * (NOT the number of entries in the freelist, as you might expect).
145 : : *
146 : : * The coverage of a freelist might be more or less than one partition, so it
147 : : * needs its own lock rather than relying on caller locking. Relying on that
148 : : * wouldn't work even if the coverage was the same, because of the occasional
149 : : * need to "borrow" entries from another freelist; see get_hash_entry().
150 : : *
151 : : * Using an array of FreeListData instead of separate arrays of mutexes,
152 : : * nentries and freeLists helps to reduce sharing of cache lines between
153 : : * different mutexes.
154 : : */
155 : : typedef struct
156 : : {
157 : : slock_t mutex; /* spinlock for this freelist; not used at all in an unpartitioned table */
158 : : int64 nentries; /* number of live entries in associated buckets (not the freelist length) */
159 : : HASHELEMENT *freeList; /* chain of free elements */
160 : : } FreeListData;
161 : :
162 : : /*
163 : : * Header structure for a hash table --- contains all changeable info
164 : : *
165 : : * In a shared-memory hash table, the HASHHDR is in shared memory, while
166 : : * each backend has a local HTAB struct. For a non-shared table, there isn't
167 : : * any functional difference between HASHHDR and HTAB, but we separate them
168 : : * anyway to share code between shared and non-shared tables.
169 : : */
170 : : struct HASHHDR
171 : : {
172 : : /*
173 : : * The freelist can become a point of contention in high-concurrency hash
174 : : * tables, so we use an array of freelists, each with its own mutex and
175 : : * nentries count, instead of just a single one. Although the freelists
176 : : * normally operate independently, we will scavenge entries from freelists
177 : : * other than a hashcode's default freelist when necessary.
178 : : *
179 : : * If the hash table is not partitioned, only freeList[0] is used and its
180 : : * spinlock is not used at all; callers' locking is assumed sufficient.
181 : : */
182 : : FreeListData freeList[NUM_FREELISTS];
183 : :
184 : : /* These fields can change, but not in a partitioned table */
185 : : /* Also, dsize can't change in a shared table, even if unpartitioned */
186 : : int64 dsize; /* directory size, counted in segment pointers */
187 : : int64 nsegs; /* number of allocated segments (<= dsize) */
188 : : uint32 max_bucket; /* ID of maximum bucket in use; starts at nbuckets - 1 */
189 : : uint32 high_mask; /* mask to modulo into entire table */
190 : : uint32 low_mask; /* mask to modulo into lower half of table */
191 : :
192 : : /* These fields are fixed at hashtable creation */
193 : : Size keysize; /* hash key length in bytes */
194 : : Size entrysize; /* total user element size in bytes (key included, so >= keysize) */
195 : : int64 num_partitions; /* # partitions (must be power of 2), or 0 if not partitioned */
196 : : int64 max_dsize; /* 'dsize' limit if directory is fixed size, else NO_MAX_DSIZE */
197 : : int nelem_alloc; /* number of entries to allocate at once; see choose_nelem_alloc() */
198 : : bool isfixed; /* if true, don't enlarge (set when HASH_FIXED_SIZE is given) */
199 : :
200 : : /* Current directory. In shared tables, this doesn't change */
201 : : HASHSEGMENT *dir;
202 : :
203 : : #ifdef HASH_STATISTICS
204 : :
205 : : /*
206 : : * Count statistics here. NB: stats code doesn't bother with mutex, so
207 : : * counts could be corrupted a bit in a partitioned table.
208 : : */
209 : : uint64 accesses;
210 : : uint64 collisions;
211 : : uint64 expansions;
212 : : #endif
213 : : };
214 : :
/* True if the table has a nonzero partition count, i.e. is partitioned. */
215 : : #define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
216 : :
/*
 * Index of the freelist serving a given hashcode: hashcode modulo
 * NUM_FREELISTS in a partitioned table, always 0 otherwise.
 */
217 : : #define FREELIST_IDX(hctl, hashcode) \
218 : : (IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
219 : :
220 : : /*
221 : : * Top control structure for a hashtable --- in a shared table, each backend
222 : : * has its own copy (OK since no fields change at runtime)
223 : : */
224 : : struct HTAB
225 : : {
226 : : HASHHDR *hctl; /* => shared control information */
227 : : HASHSEGMENT *dir; /* directory of segment starts (local copy of hctl->dir) */
228 : : HashValueFunc hash; /* hash function */
229 : : HashCompareFunc match; /* key comparison function */
230 : : HashCopyFunc keycopy; /* key copying function */
231 : : HashAllocFunc alloc; /* memory allocator */
232 : : void *alloc_arg; /* opaque argument passed to allocator (the memory context, for the default allocator) */
233 : : MemoryContext hcxt; /* private memory context of a non-shared table; NULL for a shared table */
234 : : char *tabname; /* table name (for error messages); stored directly after this struct */
235 : : bool isshared; /* true if table is in shared memory */
236 : :
237 : : /* freezing a shared table isn't allowed, so we can keep state here */
238 : : bool frozen; /* true = no more inserts allowed */
239 : :
240 : : /* We keep local copies of these fixed values to reduce contention */
241 : : Size keysize; /* hash key length in bytes (copy of hctl->keysize) */
242 : :
243 : : /*
244 : : * In a USE_VALGRIND build, non-shared hashtables keep an slist chain of
245 : : * all the element blocks they have allocated. This pacifies Valgrind,
246 : : * which would otherwise often claim that the element blocks are "possibly
247 : : * lost" for lack of any non-interior pointers to their starts.
248 : : */
249 : : #ifdef USE_VALGRIND
250 : : slist_head element_blocks;
251 : : #endif
252 : : };
253 : :
254 : : /*
255 : : * Key (also entry) part of a HASHELEMENT: the user data begins at the first
 * MAXALIGN'd offset past the HASHELEMENT header.
256 : : */
257 : : #define ELEMENTKEY(helem) (((char *)(helem)) + MAXALIGN(sizeof(HASHELEMENT)))
258 : :
259 : : /*
260 : : * Obtain element pointer given pointer to key (the inverse of ELEMENTKEY)
261 : : */
262 : : #define ELEMENT_FROM_KEY(key) \
263 : : ((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
264 : :
265 : : /*
266 : : * Fast MOD arithmetic, assuming that y is a power of 2 !
 * (It is a plain bit-mask with y-1, so for any other y it is not x % y.)
267 : : */
268 : : #define MOD(x,y) ((x) & ((y)-1))
269 : :
270 : : /*
271 : : * Prototypes for this file's private (static) functions
272 : : */
273 : : static void *DynaHashAlloc(Size size, void *alloc_arg);
274 : : static HASHSEGMENT seg_alloc(HTAB *hashp);
275 : : static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
276 : : static bool dir_realloc(HTAB *hashp);
277 : : static bool expand_table(HTAB *hashp);
278 : : static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
279 : : static void hdefault(HTAB *hashp);
280 : : static int choose_nelem_alloc(Size entrysize);
281 : : static bool init_htab(HTAB *hashp, int64 nelem);
282 : : pg_noreturn static void hash_corrupted(HTAB *hashp);
283 : : static uint32 hash_initial_lookup(HTAB *hashp, uint32 hashvalue,
284 : : HASHBUCKET **bucketptr);
285 : : static int my_log2(int64 num);
286 : : static int64 next_pow2_int64(int64 num);
287 : : static int next_pow2_int(int64 num);
288 : : static void register_seq_scan(HTAB *hashp);
289 : : static void deregister_seq_scan(HTAB *hashp);
290 : : static bool has_seq_scans(HTAB *hashp);
291 : :
292 : :
293 : : /*
294 : : * memory allocation support
 *
 * DynaHashAlloc is the allocator hash_create() installs when the caller
 * does not supply one via HASH_ALLOC. alloc_arg must be the MemoryContext
 * to allocate from; hash_create() passes the context in which it allocated
 * the HTAB. MCXT_ALLOC_NO_OOM is requested so that running out of memory
 * yields a NULL result instead of an error, matching the malloc-like
 * contract described at the top of this file; callers check for NULL.
295 : : */
296 : : static void *
35 heikki.linnakangas@i 297 :GNC 1799265 : DynaHashAlloc(Size size, void *alloc_arg)
298 : : {
299 : 1799265 : MemoryContext hcxt = (MemoryContext) alloc_arg;
300 : :
301 [ + - - + : 1799265 : Assert(MemoryContextIsValid(hcxt));
- - - - -
- ]
302 : 1799265 : return MemoryContextAllocExtended(hcxt, size, MCXT_ALLOC_NO_OOM);
303 : : }
304 : :
305 : :
306 : : /*
307 : : * HashCompareFunc for string keys
308 : : *
309 : : * Because we copy keys with strlcpy(), they will be truncated at keysize-1
310 : : * bytes, so we can only compare that many ... hence strncmp is almost but
311 : : * not quite the right thing.
 *
 * Returns 0 when the keys agree over at most keysize-1 bytes (comparison
 * stops early at a terminating NUL), nonzero otherwise. hash_create()
 * installs this as the default match function when the hash function is
 * string_hash.
312 : : */
313 : : static int
7160 tgl@sss.pgh.pa.us 314 :CBC 606568 : string_compare(const char *key1, const char *key2, Size keysize)
315 : : {
316 : 606568 : return strncmp(key1, key2, keysize - 1);
317 : : }
318 : :
319 : :
320 : : /************************** CREATE ROUTINES **********************/
321 : :
322 : : /*
323 : : * hash_create -- create a new dynamic hash table
324 : : *
325 : : * tabname: a name for the table (for debugging purposes)
326 : : * nelem: maximum number of elements expected
327 : : * *info: additional table parameters, as indicated by flags
328 : : * flags: bitmask indicating which parameters to take from *info
329 : : *
330 : : * The flags value *must* include HASH_ELEM. (Formerly, this was nominally
331 : : * optional, but the default keysize and entrysize values were useless.)
332 : : * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
333 : : * or HASH_FUNCTION, to define the key hashing semantics (C strings,
334 : : * binary blobs, or custom, respectively). Callers specifying a custom
335 : : * hash function will likely also want to use HASH_COMPARE, and perhaps
336 : : * also HASH_KEYCOPY, to control key comparison and copying.
337 : : * Another often-used flag is HASH_CONTEXT, to allocate the hash table
338 : : * under info->hcxt rather than under TopMemoryContext; the default
339 : : * behavior is only suitable for session-lifespan hash tables.
340 : : * Other flags bits are special-purpose and seldom used, except for those
341 : : * associated with shared-memory hash tables, for which see
342 : : * ShmemRequestHash().
343 : : *
344 : : * Fields in *info are read only when the associated flags bit is set.
345 : : * It is not necessary to initialize other fields of *info.
346 : : * Neither tabname nor *info need persist after the hash_create() call.
347 : : *
348 : : * Note: It is deprecated for callers of hash_create() to explicitly specify
349 : : * string_hash, tag_hash, uint32_hash, or oid_hash. Just set HASH_STRINGS or
350 : : * HASH_BLOBS. Use HASH_FUNCTION only when you want something other than
351 : : * one of these.
352 : : *
353 : : * Note: for a shared-memory hashtable, nelem needs to be a pretty good
354 : : * estimate, since we can't expand the table on the fly. But an unshared
355 : : * hashtable can be expanded on-the-fly, so it's better for nelem to be
356 : : * on the small side and let the table grow if it's exceeded. An overly
357 : : * large nelem will penalize hash_seq_search speed without buying much.
358 : : */
359 : : HTAB *
256 michael@paquier.xyz 360 :GNC 418027 : hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
361 : : {
362 : : HTAB *hashp;
363 : : HASHHDR *hctl;
364 : : MemoryContext hcxt;
365 : :
366 : : /*
367 : : * Hash tables now allocate space for key and data, but you have to say
368 : : * how much space to allocate.
369 : : */
1967 tgl@sss.pgh.pa.us 370 [ - + ]:CBC 418027 : Assert(flags & HASH_ELEM);
371 [ - + ]: 418027 : Assert(info->keysize > 0);
372 [ - + ]: 418027 : Assert(info->entrysize >= info->keysize);
373 : :
374 : : /*
375 : : * For shared hash tables, we have a local hash header (HTAB struct) that
376 : : * we allocate in TopMemoryContext; all else is in shared memory.
377 : : *
378 : : * For non-shared hash tables, everything including the hash header is in
379 : : * a memory context created specially for the hash table --- this makes
380 : : * hash_destroy very simple. The memory context is made a child of either
381 : : * a context specified by the caller, or TopMemoryContext if nothing is
382 : : * specified.
383 : : *
384 : : * Note that for a shared table HASH_ALLOC had better be set as well;
 * otherwise the default allocator selected below would take the table's
 * storage from local memory (TopMemoryContext).
385 : : */
7669 386 [ + + ]: 418027 : if (flags & HASH_SHARED_MEM)
387 : : {
388 : : /* Set up to allocate the hash header */
35 heikki.linnakangas@i 389 :GNC 11180 : hcxt = TopMemoryContext;
390 : : }
391 : : else
392 : : {
393 : : /* Create the hash table's private memory context */
7669 tgl@sss.pgh.pa.us 394 [ + + ]:CBC 406847 : if (flags & HASH_CONTEXT)
35 heikki.linnakangas@i 395 :GNC 266766 : hcxt = info->hcxt;
396 : : else
397 : 140081 : hcxt = TopMemoryContext;
398 : 406847 : hcxt = AllocSetContextCreate(hcxt, "dynahash",
399 : : ALLOCSET_DEFAULT_SIZES);
400 : : }
401 : :
402 : : /* Initialize the hash header, plus a copy of the table name stored right after it */
403 : 418027 : hashp = (HTAB *) MemoryContextAlloc(hcxt,
377 tgl@sss.pgh.pa.us 404 :CBC 418027 : sizeof(HTAB) + strlen(tabname) + 1);
10456 bruce@momjian.us 405 [ + - + - : 5016324 : MemSet(hashp, 0, sizeof(HTAB));
+ - + - +
+ ]
406 : :
7669 tgl@sss.pgh.pa.us 407 : 418027 : hashp->tabname = (char *) (hashp + 1);
8978 408 : 418027 : strcpy(hashp->tabname, tabname);
409 : :
410 : : /* If we have a private context, label it with hashtable's name */
2961 411 [ + + ]: 418027 : if (!(flags & HASH_SHARED_MEM))
35 heikki.linnakangas@i 412 :GNC 406847 : MemoryContextSetIdentifier(hcxt, hashp->tabname);
413 : :
414 : : /*
415 : : * Select the appropriate hash function (see comments at head of file).
416 : : */
10467 bruce@momjian.us 417 [ + + ]:CBC 418027 : if (flags & HASH_FUNCTION)
418 : : {
1967 tgl@sss.pgh.pa.us 419 [ - + ]: 16220 : Assert(!(flags & (HASH_BLOBS | HASH_STRINGS)));
10467 bruce@momjian.us 420 : 16220 : hashp->hash = info->hash;
421 : : }
4156 tgl@sss.pgh.pa.us 422 [ + + ]: 401807 : else if (flags & HASH_BLOBS)
423 : : {
1967 424 [ - + ]: 338620 : Assert(!(flags & HASH_STRINGS));
425 : : /* We can optimize hashing for common key sizes */
4156 426 [ + + ]: 338620 : if (info->keysize == sizeof(uint32))
427 : 243338 : hashp->hash = uint32_hash;
428 : : else
429 : 95282 : hashp->hash = tag_hash;
430 : : }
431 : : else
432 : : {
433 : : /*
434 : : * string_hash used to be considered the default hash method, and in a
435 : : * non-assert build it effectively still is. But we now consider it
436 : : * an assertion error to not say HASH_STRINGS explicitly. To help
437 : : * catch mistaken usage of HASH_STRINGS, we also insist on a
438 : : * reasonably long string length: if the keysize is only 4 or 8 bytes,
439 : : * it's almost certainly an integer or pointer not a string.
440 : : */
1967 441 [ - + ]: 63187 : Assert(flags & HASH_STRINGS);
442 [ - + ]: 63187 : Assert(info->keysize > 8);
443 : :
444 : 63187 : hashp->hash = string_hash;
445 : : }
446 : :
447 : : /*
448 : : * If you don't specify a match function, it defaults to string_compare if
449 : : * you used string_hash, and to memcmp otherwise.
450 : : *
451 : : * Note: explicitly specifying string_hash is deprecated, because this
452 : : * might not work for callers in loadable modules on some platforms due to
453 : : * referencing a trampoline instead of the string_hash function proper.
454 : : * Specify HASH_STRINGS instead.
455 : : */
8295 456 [ + + ]: 418027 : if (flags & HASH_COMPARE)
457 : 8769 : hashp->match = info->match;
458 [ + + ]: 409258 : else if (hashp->hash == string_hash)
7160 459 : 63187 : hashp->match = (HashCompareFunc) string_compare;
460 : : else
8295 461 : 346071 : hashp->match = memcmp;
462 : :
463 : : /*
464 : : * Similarly, the key-copying function defaults to strlcpy or memcpy.
465 : : */
7626 466 [ - + ]: 418027 : if (flags & HASH_KEYCOPY)
7626 tgl@sss.pgh.pa.us 467 :UBC 0 : hashp->keycopy = info->keycopy;
7626 tgl@sss.pgh.pa.us 468 [ + + ]:CBC 418027 : else if (hashp->hash == string_hash)
469 : : {
470 : : /*
471 : : * The signature of keycopy is meant for memcpy(), which returns
472 : : * void*, but strlcpy() returns size_t. Since we never use the return
473 : : * value of keycopy, and size_t is pretty much always the same size as
474 : : * void *, this should be safe. The extra cast in the middle is to
475 : : * avoid warnings from -Wcast-function-type.
476 : : */
2121 peter@eisentraut.org 477 : 63187 : hashp->keycopy = (HashCopyFunc) (pg_funcptr_t) strlcpy;
478 : : }
479 : : else
7626 tgl@sss.pgh.pa.us 480 : 354840 : hashp->keycopy = memcpy;
481 : :
482 : : /* And select the entry allocation function, too. */
7669 483 [ + + ]: 418027 : if (flags & HASH_ALLOC)
484 : : {
485 : 11180 : hashp->alloc = info->alloc;
35 heikki.linnakangas@i 486 :GNC 11180 : hashp->alloc_arg = info->alloc_arg;
487 : : }
488 : : else
489 : : {
7669 tgl@sss.pgh.pa.us 490 :CBC 406847 : hashp->alloc = DynaHashAlloc;
35 heikki.linnakangas@i 491 :GNC 406847 : hashp->alloc_arg = hcxt;
492 : : }
493 : :
10467 bruce@momjian.us 494 [ + + ]:CBC 418027 : if (flags & HASH_SHARED_MEM)
495 : : {
9083 JanWieck@Yahoo.com 496 : 11180 : hashp->hcxt = NULL;
8978 tgl@sss.pgh.pa.us 497 : 11180 : hashp->isshared = true;
498 : :
499 : : /* hash table already exists, we're just attaching to it; nothing in shared memory is (re)initialized */
10467 bruce@momjian.us 500 [ - + ]: 11180 : if (flags & HASH_ATTACH)
501 : : {
502 : : /* Caller must pass the pointer to the shared header */
31 heikki.linnakangas@i 503 [ # # ]:UNC 0 : Assert(info->hctl);
504 : 0 : hashp->hctl = info->hctl;
505 : :
506 : : /* make local copies of some heavily-used values */
507 : 0 : hashp->dir = info->hctl->dir;
508 : 0 : hashp->keysize = info->hctl->keysize;
509 : :
10108 bruce@momjian.us 510 :UBC 0 : return hashp;
511 : : }
512 : : }
513 : : else
514 : : {
515 : : /* setup hash table defaults */
9934 tgl@sss.pgh.pa.us 516 :CBC 406847 : hashp->hctl = NULL;
10467 bruce@momjian.us 517 : 406847 : hashp->dir = NULL;
35 heikki.linnakangas@i 518 :GNC 406847 : hashp->hcxt = hcxt;
8978 tgl@sss.pgh.pa.us 519 :CBC 406847 : hashp->isshared = false;
520 : : }
521 : :
522 : : /*
523 : : * Allocate the header structure.
524 : : *
525 : : * XXX: In case of a shared memory hash table, other processes need the
526 : : * pointer to the header to re-find the hash table. There is currently no
527 : : * explicit way to pass it back from here, the caller relies on the fact
528 : : * that this is the first allocation made with the alloc function. That's
529 : : * a little ugly, but works for now.
530 : : */
31 heikki.linnakangas@i 531 :GNC 418027 : hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR), hashp->alloc_arg);
10467 bruce@momjian.us 532 [ - + ]:CBC 418027 : if (!hashp->hctl)
31 heikki.linnakangas@i 533 [ # # ]:UNC 0 : ereport(ERROR,
534 : : (errcode(ERRCODE_OUT_OF_MEMORY),
535 : : errmsg("out of memory")));
536 : :
6949 tgl@sss.pgh.pa.us 537 :CBC 418027 : hashp->frozen = false;
538 : :
7865 neilc@samurai.com 539 : 418027 : hdefault(hashp);
540 : :
10467 bruce@momjian.us 541 : 418027 : hctl = hashp->hctl;
542 : :
7227 tgl@sss.pgh.pa.us 543 [ + + ]: 418027 : if (flags & HASH_PARTITION)
544 : : {
545 : : /* Doesn't make sense to partition a local hash table */
546 [ - + ]: 6205 : Assert(flags & HASH_SHARED_MEM);
547 : :
548 : : /*
549 : : * The number of partitions had better be a power of 2. Also, it must
550 : : * be less than INT_MAX (see init_htab()), so call the int version of
551 : : * next_pow2.
552 : : */
4893 553 [ - + ]: 6205 : Assert(info->num_partitions == next_pow2_int(info->num_partitions));
554 : :
7227 555 : 6205 : hctl->num_partitions = info->num_partitions;
556 : : }
557 : :
558 : : /* remember the entry sizes, too */
1967 559 : 418027 : hctl->keysize = info->keysize;
560 : 418027 : hctl->entrysize = info->entrysize;
561 : :
562 : : /* make local copies of heavily-used constant fields */
7227 563 : 418027 : hashp->keysize = hctl->keysize;
564 : :
565 : : /* Build the hash directory structure */
8982 566 [ - + ]: 418027 : if (!init_htab(hashp, nelem))
6811 tgl@sss.pgh.pa.us 567 [ # # ]:UBC 0 : elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
568 : :
569 : : /*
570 : : * For a shared hash table, preallocate the requested number of elements.
571 : : * This reduces problems with run-time out-of-shared-memory conditions.
572 : : *
573 : : * For a non-shared hash table, preallocate the requested number of
574 : : * elements if it's less than our chosen nelem_alloc. This avoids wasting
575 : : * space if the caller correctly estimates a small table size.
576 : : */
7254 tgl@sss.pgh.pa.us 577 [ + + ]:CBC 418027 : if ((flags & HASH_SHARED_MEM) ||
578 [ + + ]: 406847 : nelem < hctl->nelem_alloc)
579 : : {
580 : : int i,
581 : : freelist_partitions,
582 : : nelem_alloc,
583 : : nelem_alloc_first;
584 : :
585 : : /*
586 : : * If hash table is partitioned, give each freelist an equal share of
587 : : * the initial allocation. Otherwise only freeList[0] is used.
588 : : */
3695 rhaas@postgresql.org 589 [ + + ]: 146203 : if (IS_PARTITIONED(hashp->hctl))
590 : 6205 : freelist_partitions = NUM_FREELISTS;
591 : : else
592 : 139998 : freelist_partitions = 1;
593 : :
/*
 * NOTE(review): nelem is int64 while nelem_alloc and nelem_alloc_first are
 * int, so this presumes each freelist's share fits in an int -- confirm
 * against the largest nelem that shared-table callers can request.
 */
594 : 146203 : nelem_alloc = nelem / freelist_partitions;
3209 tgl@sss.pgh.pa.us 595 [ - + ]: 146203 : if (nelem_alloc <= 0)
3695 rhaas@postgresql.org 596 :UBC 0 : nelem_alloc = 1;
597 : :
598 : : /*
599 : : * Make sure we'll allocate all the requested elements; freeList[0]
600 : : * gets the excess if the request isn't divisible by NUM_FREELISTS.
601 : : */
3695 rhaas@postgresql.org 602 [ + + ]:CBC 146203 : if (nelem_alloc * freelist_partitions < nelem)
603 : 59 : nelem_alloc_first =
604 : 59 : nelem - nelem_alloc * (freelist_partitions - 1);
605 : : else
606 : 146144 : nelem_alloc_first = nelem_alloc;
607 : :
608 [ + + ]: 484761 : for (i = 0; i < freelist_partitions; i++)
609 : : {
610 [ + + ]: 338558 : int temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
611 : :
396 tomas.vondra@postgre 612 [ - + ]: 338558 : if (!element_alloc(hashp, temp, i))
396 tomas.vondra@postgre 613 [ # # ]:UBC 0 : ereport(ERROR,
614 : : (errcode(ERRCODE_OUT_OF_MEMORY),
615 : : errmsg("out of memory")));
616 : : }
617 : : }
618 : :
619 : : /* Set isfixed if requested, but not till after we build initial entries */
5503 heikki.linnakangas@i 620 [ + + ]:CBC 418027 : if (flags & HASH_FIXED_SIZE)
284 tgl@sss.pgh.pa.us 621 :GNC 11180 : hctl->isfixed = true;
622 : :
10108 bruce@momjian.us 623 :CBC 418027 : return hashp;
624 : : }
625 : :
626 : : /*
627 : : * Set default HASHHDR parameters.
 *
 * The whole header is zeroed first, so every field not assigned below
 * (keysize, entrysize, dir, the freelists, ...) starts out as zero/NULL.
 * hash_create() calls this right after allocating the header and fills in
 * the caller-specified fields afterwards.
628 : : */
629 : : static void
10466 630 : 418027 : hdefault(HTAB *hashp)
631 : : {
8958 632 : 418027 : HASHHDR *hctl = hashp->hctl;
633 : :
8982 tgl@sss.pgh.pa.us 634 [ + - + - : 44728889 : MemSet(hctl, 0, sizeof(HASHHDR));
+ - + - +
+ ]
635 : :
7227 636 : 418027 : hctl->num_partitions = 0; /* not partitioned */
637 : :
638 : : /* table has no fixed maximum size */
10467 bruce@momjian.us 639 : 418027 : hctl->max_dsize = NO_MAX_DSIZE;
640 : :
284 tgl@sss.pgh.pa.us 641 :GNC 418027 : hctl->isfixed = false; /* can be enlarged */
642 : :
643 : : #ifdef HASH_STATISTICS
644 : : hctl->accesses = hctl->collisions = hctl->expansions = 0;
645 : : #endif
10892 scrappy@hub.org 646 :CBC 418027 : }
647 : :
648 : : /*
649 : : * Given the user-specified entry size, choose nelem_alloc, ie, how many
650 : : * elements to add to the hash table when we need more.
 *
 * The result is allocSize / elementSize for the smallest power-of-2
 * allocSize (starting at 256 bytes) that holds at least 32 elements.
651 : : */
652 : : static int
7254 tgl@sss.pgh.pa.us 653 : 431542 : choose_nelem_alloc(Size entrysize)
654 : : {
655 : : int nelem_alloc;
656 : : Size elementSize;
657 : : Size allocSize;
658 : :
659 : : /* Each element has a HASHELEMENT header plus user data. */
660 : : /* NB: this had better match element_alloc() */
661 : 431542 : elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
662 : :
663 : : /*
664 : : * The idea here is to choose nelem_alloc at least 32, but round up so
665 : : * that the allocation request will be a power of 2 or just less. This
666 : : * makes little difference for hash tables in shared memory, but for hash
667 : : * tables managed by palloc, the allocation request will be rounded up to
668 : : * a power of 2 anyway. If we fail to take this into account, we'll waste
669 : : * as much as half the allocated space.
670 : : */
671 : 431542 : allocSize = 32 * 4; /* doubled before first use, so the first candidate is 32 * 8 bytes; assume elementSize at least 8 */
672 : : do
673 : : {
674 : 1654843 : allocSize <<= 1;
675 : 1654843 : nelem_alloc = allocSize / elementSize;
676 [ + + ]: 1654843 : } while (nelem_alloc < 32);
677 : :
678 : 431542 : return nelem_alloc;
679 : : }
680 : :
681 : : /*
682 : : * Compute derived fields of hctl and build the initial directory/segment
683 : : * arrays
 *
 * nelem is the expected number of elements; the bucket count is
 * next_pow2_int(nelem), raised if necessary to at least num_partitions.
 * Returns false if the directory or any segment cannot be allocated,
 * true otherwise. hctl->nsegs counts the segments allocated so far, so
 * it stays accurate even when we bail out partway through.
684 : : */
685 : : static bool
256 michael@paquier.xyz 686 :GNC 418027 : init_htab(HTAB *hashp, int64 nelem)
687 : : {
8958 bruce@momjian.us 688 :CBC 418027 : HASHHDR *hctl = hashp->hctl;
689 : : HASHSEGMENT *segp;
690 : : int nbuckets;
691 : : int nsegs;
692 : : int i;
693 : :
694 : : /*
695 : : * initialize mutexes if it's a partitioned table
696 : : */
7227 tgl@sss.pgh.pa.us 697 [ + + ]: 418027 : if (IS_PARTITIONED(hctl))
3695 rhaas@postgresql.org 698 [ + + ]: 204765 : for (i = 0; i < NUM_FREELISTS; i++)
699 : 198560 : SpinLockInit(&(hctl->freeList[i].mutex));
700 : :
701 : : /*
702 : : * Allocate space for the next greater power of two number of buckets,
703 : : * assuming a desired maximum load factor of 1.
704 : : */
396 tomas.vondra@postgre 705 : 418027 : nbuckets = next_pow2_int(nelem);
706 : :
707 : : /*
708 : : * In a partitioned table, nbuckets must be at least equal to
709 : : * num_partitions; were it less, keys with apparently different partition
710 : : * numbers would map to the same bucket, breaking partition independence.
711 : : * (Normally nbuckets will be much bigger; this is just a safety check.)
712 : : */
713 [ - + ]: 418027 : while (nbuckets < hctl->num_partitions)
396 tomas.vondra@postgre 714 :UBC 0 : nbuckets <<= 1;
715 : :
/*
 * NOTE(review): nbuckets is a signed int, so (nbuckets << 1) below only
 * stays in range if nbuckets is at most 2^29; presumably next_pow2_int()
 * guarantees that -- TODO confirm, or do the shift in uint32.
 */
10467 bruce@momjian.us 716 :CBC 418027 : hctl->max_bucket = hctl->low_mask = nbuckets - 1;
717 : 418027 : hctl->high_mask = (nbuckets << 1) - 1;
718 : :
719 : : /*
720 : : * Figure number of directory segments needed, round up to a power of 2
721 : : */
35 heikki.linnakangas@i 722 :GNC 418027 : nsegs = (nbuckets - 1) / HASH_SEGSIZE + 1;
396 tomas.vondra@postgre 723 :CBC 418027 : nsegs = next_pow2_int(nsegs);
724 : :
725 : : /*
726 : : * Make sure directory is big enough: at least DEF_DIRSIZE entries, and
 * never fewer than the number of initial segments.
727 : : */
31 heikki.linnakangas@i 728 :GNC 418027 : hctl->dsize = Max(DEF_DIRSIZE, nsegs);
729 : :
730 : : /* SHM hash tables have a fixed directory. */
731 [ + + ]: 418027 : if (hashp->isshared)
732 : 11180 : hctl->max_dsize = hctl->dsize;
733 : :
734 : : /* Allocate a directory */
735 : 418027 : hctl->dir = (HASHSEGMENT *)
736 : 418027 : hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT), hashp->alloc_arg);
737 [ - + ]: 418027 : if (!hctl->dir)
31 heikki.linnakangas@i 738 :UNC 0 : return false;
31 heikki.linnakangas@i 739 :GNC 418027 : hashp->dir = hctl->dir;
740 : :
741 : : /* Allocate initial segments; hctl->nsegs starts at 0, as zeroed by hdefault() */
10467 bruce@momjian.us 742 [ + + ]:CBC 1377708 : for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
743 : : {
396 tomas.vondra@postgre 744 : 959681 : *segp = seg_alloc(hashp);
745 [ - + ]: 959681 : if (*segp == NULL)
396 tomas.vondra@postgre 746 :UBC 0 : return false;
747 : : }
748 : :
749 : : /* Choose number of entries to allocate at a time */
396 tomas.vondra@postgre 750 :CBC 418027 : hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
751 : :
8982 tgl@sss.pgh.pa.us 752 : 418027 : return true;
753 : : }
754 : :
755 : : /*
756 : : * Estimate the space needed for a hashtable containing the given number
757 : : * of entries of given size.
758 : : * NOTE: this is used to estimate the footprint of hashtables in shared
759 : : * memory; therefore it does not count HTAB which is in local memory.
760 : : * NB: assumes that all hash structure parameters have default values!
761 : : */
762 : : Size
256 michael@paquier.xyz 763 :GNC 13515 : hash_estimate_size(int64 num_entries, Size entrysize)
764 : : {
765 : : Size size;
766 : : int64 nBuckets,
767 : : nSegments,
768 : : nDirEntries,
769 : : nElementAllocs,
770 : : elementSize,
771 : : elementAllocCnt;
772 : :
773 : : /* estimate number of buckets wanted */
774 : 13515 : nBuckets = next_pow2_int64(num_entries);
775 : : /* # of segments needed for nBuckets */
35 heikki.linnakangas@i 776 : 13515 : nSegments = next_pow2_int64((nBuckets - 1) / HASH_SEGSIZE + 1);
777 : : /* directory entries */
31 778 : 13515 : nDirEntries = Max(DEF_DIRSIZE, nSegments);
779 : :
780 : : /* fixed control info */
7563 tgl@sss.pgh.pa.us 781 :CBC 13515 : size = MAXALIGN(sizeof(HASHHDR)); /* but not HTAB, per above */
782 : : /* directory */
783 : 13515 : size = add_size(size, mul_size(nDirEntries, sizeof(HASHSEGMENT)));
784 : : /* segments */
785 : 13515 : size = add_size(size, mul_size(nSegments,
786 : : MAXALIGN(HASH_SEGSIZE * sizeof(HASHBUCKET))));
787 : : /* elements --- allocated in groups of choose_nelem_alloc() entries */
7254 788 : 13515 : elementAllocCnt = choose_nelem_alloc(entrysize);
7618 789 : 13515 : nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
7254 790 : 13515 : elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(entrysize);
7563 791 : 13515 : size = add_size(size,
792 : : mul_size(nElementAllocs,
793 : : mul_size(elementAllocCnt, elementSize)));
794 : :
9934 795 : 13515 : return size;
796 : : }
797 : :
798 : :
799 : : /********************** DESTROY ROUTINES ************************/
800 : :
801 : : void
10466 bruce@momjian.us 802 : 79516 : hash_destroy(HTAB *hashp)
803 : : {
10467 804 [ + - ]: 79516 : if (hashp != NULL)
805 : : {
806 : : /* allocation method must be one we know how to free, too */
7669 tgl@sss.pgh.pa.us 807 [ - + ]: 79516 : Assert(hashp->alloc == DynaHashAlloc);
808 : : /* so this hashtable must have its own context */
9083 JanWieck@Yahoo.com 809 [ - + ]: 79516 : Assert(hashp->hcxt != NULL);
810 : :
259 drowley@postgresql.o 811 :GNC 79516 : hash_stats(__func__, hashp);
812 : :
813 : : /*
814 : : * Free everything by destroying the hash table's memory context.
815 : : */
9083 JanWieck@Yahoo.com 816 :CBC 79516 : MemoryContextDelete(hashp->hcxt);
817 : : }
10892 scrappy@hub.org 818 : 79516 : }
819 : :
820 : : void
259 drowley@postgresql.o 821 :GNC 79516 : hash_stats(const char *caller, HTAB *hashp)
822 : : {
823 : : #ifdef HASH_STATISTICS
824 : : HASHHDR *hctl = hashp->hctl;
825 : :
826 : : elog(DEBUG4,
827 : : "hash_stats: Caller: %s Table Name: \"%s\" Accesses: " UINT64_FORMAT " Collisions: " UINT64_FORMAT " Expansions: " UINT64_FORMAT " Entries: " INT64_FORMAT " Key Size: %zu Max Bucket: %u Segment Count: " INT64_FORMAT,
828 : : caller != NULL ? caller : "(unknown)", hashp->tabname, hctl->accesses,
829 : : hctl->collisions, hctl->expansions, hash_get_num_entries(hashp),
830 : : hctl->keysize, hctl->max_bucket, hctl->nsegs);
831 : : #endif
10892 scrappy@hub.org 832 :CBC 79516 : }
833 : :
834 : : /*******************************SEARCH ROUTINES *****************************/
835 : :
836 : :
837 : : /*
838 : : * get_hash_value -- exported routine to calculate a key's hash value
839 : : *
840 : : * We export this because for partitioned tables, callers need to compute
841 : : * the partition number (from the low-order bits of the hash value) before
842 : : * searching.
843 : : */
844 : : uint32
7227 tgl@sss.pgh.pa.us 845 : 108038973 : get_hash_value(HTAB *hashp, const void *keyPtr)
846 : : {
847 : 108038973 : return hashp->hash(keyPtr, hashp->keysize);
848 : : }
849 : :
850 : : /* Convert a hash value to a bucket number */
851 : : static inline uint32
8295 852 : 244182531 : calc_bucket(HASHHDR *hctl, uint32 hash_val)
853 : : {
854 : : uint32 bucket;
855 : :
10467 bruce@momjian.us 856 : 244182531 : bucket = hash_val & hctl->high_mask;
857 [ + + ]: 244182531 : if (bucket > hctl->max_bucket)
858 : 115702323 : bucket = bucket & hctl->low_mask;
859 : :
8823 tgl@sss.pgh.pa.us 860 : 244182531 : return bucket;
861 : : }
862 : :
863 : : /*
864 : : * hash_search -- look up key in table and perform action
865 : : * hash_search_with_hash_value -- same, with key's hash value already computed
866 : : *
867 : : * action is one of:
868 : : * HASH_FIND: look up key in table
869 : : * HASH_ENTER: look up key in table, creating entry if not present
870 : : * HASH_ENTER_NULL: same, but return NULL if out of memory
871 : : * HASH_REMOVE: look up key in table, remove entry if present
872 : : *
873 : : * Return value is a pointer to the element found/entered/removed if any,
874 : : * or NULL if no match was found. (NB: in the case of the REMOVE action,
875 : : * the result is a dangling pointer that shouldn't be dereferenced!)
876 : : *
877 : : * HASH_ENTER will normally ereport a generic "out of memory" error if
878 : : * it is unable to create a new entry. The HASH_ENTER_NULL operation is
879 : : * the same except it will return NULL if out of memory.
880 : : *
881 : : * If foundPtr isn't NULL, then *foundPtr is set true if we found an
882 : : * existing entry in the table, false otherwise. This is needed in the
883 : : * HASH_ENTER case, but is redundant with the return value otherwise.
884 : : *
885 : : * For hash_search_with_hash_value, the hashvalue parameter must have been
886 : : * calculated with get_hash_value().
887 : : */
888 : : void *
10466 bruce@momjian.us 889 : 145077550 : hash_search(HTAB *hashp,
890 : : const void *keyPtr,
891 : : HASHACTION action,
892 : : bool *foundPtr)
893 : : {
7227 tgl@sss.pgh.pa.us 894 : 145077550 : return hash_search_with_hash_value(hashp,
895 : : keyPtr,
896 : 145077550 : hashp->hash(keyPtr, hashp->keysize),
897 : : action,
898 : : foundPtr);
899 : : }
900 : :
901 : : void *
902 : 242262875 : hash_search_with_hash_value(HTAB *hashp,
903 : : const void *keyPtr,
904 : : uint32 hashvalue,
905 : : HASHACTION action,
906 : : bool *foundPtr)
907 : : {
8958 bruce@momjian.us 908 : 242262875 : HASHHDR *hctl = hashp->hctl;
3209 tgl@sss.pgh.pa.us 909 [ + + ]: 242262875 : int freelist_idx = FREELIST_IDX(hctl, hashvalue);
910 : : Size keysize;
911 : : HASHBUCKET currBucket;
912 : : HASHBUCKET *prevBucketPtr;
913 : : HashCompareFunc match;
914 : :
915 : : #ifdef HASH_STATISTICS
916 : : hctl->accesses++;
917 : : #endif
918 : :
919 : : /*
920 : : * If inserting, check if it is time to split a bucket.
921 : : *
922 : : * NOTE: failure to expand table is not a fatal error, it just means we
923 : : * have to run at higher fill factor than we wanted. However, if we're
924 : : * using the palloc allocator then it will throw error anyway on
925 : : * out-of-memory, so we must do this before modifying the table.
926 : : */
4946 927 [ + + + + ]: 242262875 : if (action == HASH_ENTER || action == HASH_ENTER_NULL)
928 : : {
929 : : /*
930 : : * Can't split if running in partitioned mode, nor if frozen, nor if
931 : : * table is the subject of any active hash_seq_search scans.
932 : : */
256 michael@paquier.xyz 933 [ + + ]:GNC 58646271 : if (hctl->freeList[0].nentries > (int64) hctl->max_bucket &&
2054 tmunro@postgresql.or 934 [ + - + - ]:CBC 449957 : !IS_PARTITIONED(hctl) && !hashp->frozen &&
4946 tgl@sss.pgh.pa.us 935 [ + - ]: 449957 : !has_seq_scans(hashp))
936 : 449957 : (void) expand_table(hashp);
937 : : }
938 : :
939 : : /*
940 : : * Do the initial lookup
941 : : */
781 michael@paquier.xyz 942 : 242262875 : (void) hash_initial_lookup(hashp, hashvalue, &prevBucketPtr);
7646 tgl@sss.pgh.pa.us 943 : 242262875 : currBucket = *prevBucketPtr;
944 : :
945 : : /*
946 : : * Follow collision chain looking for matching key
947 : : */
948 : 242262875 : match = hashp->match; /* save one fetch in inner loop */
7227 949 : 242262875 : keysize = hashp->keysize; /* ditto */
950 : :
7646 951 [ + + ]: 290292035 : while (currBucket != NULL)
952 : : {
953 [ + + + + ]: 437837549 : if (currBucket->hashvalue == hashvalue &&
954 : 194907622 : match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
955 : 194900767 : break;
956 : 48029160 : prevBucketPtr = &(currBucket->link);
8982 957 : 48029160 : currBucket = *prevBucketPtr;
958 : : #ifdef HASH_STATISTICS
959 : : hctl->collisions++;
960 : : #endif
961 : : }
962 : :
8978 963 [ + + ]: 242262875 : if (foundPtr)
964 : 60000673 : *foundPtr = (bool) (currBucket != NULL);
965 : :
966 : : /*
967 : : * OK, now what?
968 : : */
10467 bruce@momjian.us 969 [ + + + - ]: 242262875 : switch (action)
970 : : {
8978 tgl@sss.pgh.pa.us 971 : 153267077 : case HASH_FIND:
8982 972 [ + + ]: 153267077 : if (currBucket != NULL)
523 peter@eisentraut.org 973 : 141992888 : return ELEMENTKEY(currBucket);
8978 tgl@sss.pgh.pa.us 974 : 11274189 : return NULL;
975 : :
10466 bruce@momjian.us 976 : 30349527 : case HASH_REMOVE:
8982 tgl@sss.pgh.pa.us 977 [ + + ]: 30349527 : if (currBucket != NULL)
978 : : {
979 : : /* if partitioned, must lock to touch nentries and freeList */
3854 rhaas@postgresql.org 980 [ + + ]: 30347559 : if (IS_PARTITIONED(hctl))
3695 981 [ + + ]: 6234512 : SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
982 : :
983 : : /* delete the record from the appropriate nentries counter. */
984 [ - + ]: 30347559 : Assert(hctl->freeList[freelist_idx].nentries > 0);
985 : 30347559 : hctl->freeList[freelist_idx].nentries--;
986 : :
987 : : /* remove record from hash bucket's chain. */
8982 tgl@sss.pgh.pa.us 988 : 30347559 : *prevBucketPtr = currBucket->link;
989 : :
990 : : /* add the record to the appropriate freelist. */
3695 rhaas@postgresql.org 991 : 30347559 : currBucket->link = hctl->freeList[freelist_idx].freeList;
992 : 30347559 : hctl->freeList[freelist_idx].freeList = currBucket;
993 : :
3854 994 [ + + ]: 30347559 : if (IS_PARTITIONED(hctl))
3695 995 : 6234512 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
996 : :
997 : : /*
998 : : * better hope the caller is synchronizing access to this
999 : : * element, because someone else is going to reuse it the next
1000 : : * time something is added to the table
1001 : : */
523 peter@eisentraut.org 1002 : 30347559 : return ELEMENTKEY(currBucket);
1003 : : }
8978 tgl@sss.pgh.pa.us 1004 : 1968 : return NULL;
1005 : :
1006 : 58646271 : case HASH_ENTER:
1007 : : case HASH_ENTER_NULL:
1008 : : /* Return existing element if found, else create one */
8982 1009 [ + + ]: 58646271 : if (currBucket != NULL)
523 peter@eisentraut.org 1010 : 22560320 : return ELEMENTKEY(currBucket);
1011 : :
1012 : : /* disallow inserts if frozen */
6949 tgl@sss.pgh.pa.us 1013 [ - + ]: 36085951 : if (hashp->frozen)
6811 tgl@sss.pgh.pa.us 1014 [ # # ]:UBC 0 : elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
1015 : : hashp->tabname);
1016 : :
3695 rhaas@postgresql.org 1017 :CBC 36085951 : currBucket = get_hash_entry(hashp, freelist_idx);
8978 tgl@sss.pgh.pa.us 1018 [ - + ]: 36085951 : if (currBucket == NULL)
1019 : : {
1020 : : /* out of memory */
7227 tgl@sss.pgh.pa.us 1021 [ # # ]:UBC 0 : if (action == HASH_ENTER_NULL)
1022 : 0 : return NULL;
1023 : : /* report a generic message */
1024 [ # # ]: 0 : if (hashp->isshared)
1025 [ # # ]: 0 : ereport(ERROR,
1026 : : (errcode(ERRCODE_OUT_OF_MEMORY),
1027 : : errmsg("out of shared memory")));
1028 : : else
1029 [ # # ]: 0 : ereport(ERROR,
1030 : : (errcode(ERRCODE_OUT_OF_MEMORY),
1031 : : errmsg("out of memory")));
1032 : : }
1033 : :
1034 : : /* link into hashbucket chain */
8978 tgl@sss.pgh.pa.us 1035 :CBC 36085951 : *prevBucketPtr = currBucket;
1036 : 36085951 : currBucket->link = NULL;
1037 : :
1038 : : /* copy key into record */
8295 1039 : 36085951 : currBucket->hashvalue = hashvalue;
7626 1040 : 36085951 : hashp->keycopy(ELEMENTKEY(currBucket), keyPtr, keysize);
1041 : :
1042 : : /*
1043 : : * Caller is expected to fill the data field on return. DO NOT
1044 : : * insert any code that could possibly throw error here, as doing
1045 : : * so would leave the table entry incomplete and hence corrupt the
1046 : : * caller's data structure.
1047 : : */
1048 : :
523 peter@eisentraut.org 1049 : 36085951 : return ELEMENTKEY(currBucket);
1050 : : }
1051 : :
8320 tgl@sss.pgh.pa.us 1052 [ # # ]:UBC 0 : elog(ERROR, "unrecognized hash action code: %d", (int) action);
1053 : :
1054 : : return NULL; /* keep compiler quiet */
1055 : : }
1056 : :
1057 : : /*
1058 : : * hash_update_hash_key -- change the hash key of an existing table entry
1059 : : *
1060 : : * This is equivalent to removing the entry, making a new entry, and copying
1061 : : * over its data, except that the entry never goes to the table's freelist.
1062 : : * Therefore this cannot suffer an out-of-memory failure, even if there are
1063 : : * other processes operating in other partitions of the hashtable.
1064 : : *
1065 : : * Returns true if successful, false if the requested new hash key is already
1066 : : * present. Throws error if the specified entry pointer isn't actually a
1067 : : * table member.
1068 : : *
1069 : : * NB: currently, there is no special case for old and new hash keys being
1070 : : * identical, which means we'll report false for that situation. This is
1071 : : * preferable for existing uses.
1072 : : *
1073 : : * NB: for a partitioned hashtable, caller must hold lock on both relevant
1074 : : * partitions, if the new hash key would belong to a different partition.
1075 : : */
1076 : : bool
4860 tgl@sss.pgh.pa.us 1077 :CBC 739 : hash_update_hash_key(HTAB *hashp,
1078 : : void *existingEntry,
1079 : : const void *newKeyPtr)
1080 : : {
1081 : 739 : HASHELEMENT *existingElement = ELEMENT_FROM_KEY(existingEntry);
1082 : : uint32 newhashvalue;
1083 : : Size keysize;
1084 : : uint32 bucket;
1085 : : uint32 newbucket;
1086 : : HASHBUCKET currBucket;
1087 : : HASHBUCKET *prevBucketPtr;
1088 : : HASHBUCKET *oldPrevPtr;
1089 : : HashCompareFunc match;
1090 : :
1091 : : #ifdef HASH_STATISTICS
1092 : : HASHHDR *hctl = hashp->hctl;
1093 : :
1094 : : hctl->accesses++;
1095 : : #endif
1096 : :
1097 : : /* disallow updates if frozen */
1098 [ - + ]: 739 : if (hashp->frozen)
4860 tgl@sss.pgh.pa.us 1099 [ # # ]:UBC 0 : elog(ERROR, "cannot update in frozen hashtable \"%s\"",
1100 : : hashp->tabname);
1101 : :
1102 : : /*
1103 : : * Lookup the existing element using its saved hash value. We need to do
1104 : : * this to be able to unlink it from its hash chain, but as a side benefit
1105 : : * we can verify the validity of the passed existingEntry pointer.
1106 : : */
781 michael@paquier.xyz 1107 :CBC 739 : bucket = hash_initial_lookup(hashp, existingElement->hashvalue,
1108 : : &prevBucketPtr);
4860 tgl@sss.pgh.pa.us 1109 : 739 : currBucket = *prevBucketPtr;
1110 : :
1111 [ + - ]: 742 : while (currBucket != NULL)
1112 : : {
1113 [ + + ]: 742 : if (currBucket == existingElement)
1114 : 739 : break;
1115 : 3 : prevBucketPtr = &(currBucket->link);
1116 : 3 : currBucket = *prevBucketPtr;
1117 : : }
1118 : :
1119 [ - + ]: 739 : if (currBucket == NULL)
4860 tgl@sss.pgh.pa.us 1120 [ # # ]:UBC 0 : elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
1121 : : hashp->tabname);
1122 : :
4860 tgl@sss.pgh.pa.us 1123 :CBC 739 : oldPrevPtr = prevBucketPtr;
1124 : :
1125 : : /*
1126 : : * Now perform the equivalent of a HASH_ENTER operation to locate the hash
1127 : : * chain we want to put the entry into.
1128 : : */
1129 : 739 : newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
781 michael@paquier.xyz 1130 : 739 : newbucket = hash_initial_lookup(hashp, newhashvalue, &prevBucketPtr);
4860 tgl@sss.pgh.pa.us 1131 : 739 : currBucket = *prevBucketPtr;
1132 : :
1133 : : /*
1134 : : * Follow collision chain looking for matching key
1135 : : */
1136 : 739 : match = hashp->match; /* save one fetch in inner loop */
1137 : 739 : keysize = hashp->keysize; /* ditto */
1138 : :
1139 [ + + ]: 788 : while (currBucket != NULL)
1140 : : {
1141 [ - + - - ]: 49 : if (currBucket->hashvalue == newhashvalue &&
4860 tgl@sss.pgh.pa.us 1142 :UBC 0 : match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
1143 : 0 : break;
4860 tgl@sss.pgh.pa.us 1144 :CBC 49 : prevBucketPtr = &(currBucket->link);
1145 : 49 : currBucket = *prevBucketPtr;
1146 : : #ifdef HASH_STATISTICS
1147 : : hctl->collisions++;
1148 : : #endif
1149 : : }
1150 : :
1151 [ - + ]: 739 : if (currBucket != NULL)
4860 tgl@sss.pgh.pa.us 1152 :UBC 0 : return false; /* collision with an existing entry */
1153 : :
4860 tgl@sss.pgh.pa.us 1154 :CBC 739 : currBucket = existingElement;
1155 : :
1156 : : /*
1157 : : * If old and new hash values belong to the same bucket, we need not
1158 : : * change any chain links, and indeed should not since this simplistic
1159 : : * update will corrupt the list if currBucket is the last element. (We
1160 : : * cannot fall out earlier, however, since we need to scan the bucket to
1161 : : * check for duplicate keys.)
1162 : : */
4859 1163 [ + + ]: 739 : if (bucket != newbucket)
1164 : : {
1165 : : /* OK to remove record from old hash bucket's chain. */
1166 : 700 : *oldPrevPtr = currBucket->link;
1167 : :
1168 : : /* link into new hashbucket chain */
1169 : 700 : *prevBucketPtr = currBucket;
1170 : 700 : currBucket->link = NULL;
1171 : : }
1172 : :
1173 : : /* copy new key into record */
4860 1174 : 739 : currBucket->hashvalue = newhashvalue;
1175 : 739 : hashp->keycopy(ELEMENTKEY(currBucket), newKeyPtr, keysize);
1176 : :
1177 : : /* rest of record is untouched */
1178 : :
1179 : 739 : return true;
1180 : : }
1181 : :
1182 : : /*
1183 : : * Allocate a new hashtable entry if possible; return NULL if out of memory.
1184 : : * (Or, if the underlying space allocator throws error for out-of-memory,
1185 : : * we won't return at all.)
1186 : : */
1187 : : static HASHBUCKET
3695 rhaas@postgresql.org 1188 : 36085951 : get_hash_entry(HTAB *hashp, int freelist_idx)
1189 : : {
1190 : 36085951 : HASHHDR *hctl = hashp->hctl;
1191 : : HASHBUCKET newElement;
1192 : :
1193 : : for (;;)
1194 : : {
1195 : : /* if partitioned, must lock to touch nentries and freeList */
3854 1196 [ + + ]: 36471448 : if (IS_PARTITIONED(hctl))
3695 1197 [ + + ]: 6942884 : SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1198 : :
1199 : : /* try to get an entry from the freelist */
1200 : 36471448 : newElement = hctl->freeList[freelist_idx].freeList;
1201 : :
7227 tgl@sss.pgh.pa.us 1202 [ + + ]: 36471448 : if (newElement != NULL)
1203 : 36048035 : break;
1204 : :
3854 rhaas@postgresql.org 1205 [ + + ]: 423413 : if (IS_PARTITIONED(hctl))
3695 1206 : 37916 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1207 : :
1208 : : /*
1209 : : * No free elements in this freelist. In a partitioned table, there
1210 : : * might be entries in other freelists, but to reduce contention we
1211 : : * prefer to first try to get another chunk of buckets from the main
1212 : : * shmem allocator. If that fails, though, we *MUST* root through all
1213 : : * the other freelists before giving up. There are multiple callers
1214 : : * that assume that they can allocate every element in the initially
1215 : : * requested table size, or that deleting an element guarantees they
1216 : : * can insert a new element, even if shared memory is entirely full.
1217 : : * Failing because the needed element is in a different freelist is
1218 : : * not acceptable.
1219 : : */
396 tomas.vondra@postgre 1220 [ + + ]: 423413 : if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
1221 : : {
1222 : : int borrow_from_idx;
1223 : :
3695 rhaas@postgresql.org 1224 [ - + ]:GBC 37916 : if (!IS_PARTITIONED(hctl))
3695 rhaas@postgresql.org 1225 :UBC 0 : return NULL; /* out of memory */
1226 : :
1227 : : /* try to borrow element from another freelist */
3695 rhaas@postgresql.org 1228 :GBC 37916 : borrow_from_idx = freelist_idx;
1229 : : for (;;)
1230 : : {
1231 : 42159 : borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
1232 [ - + ]: 42159 : if (borrow_from_idx == freelist_idx)
3209 tgl@sss.pgh.pa.us 1233 :UBC 0 : break; /* examined all freelists, fail */
1234 : :
3695 rhaas@postgresql.org 1235 [ # # ]:GBC 42159 : SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
1236 : 42159 : newElement = hctl->freeList[borrow_from_idx].freeList;
1237 : :
1238 [ + + ]: 42159 : if (newElement != NULL)
1239 : : {
1240 : 37916 : hctl->freeList[borrow_from_idx].freeList = newElement->link;
1241 : 37916 : SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
1242 : :
1243 : : /* careful: count the new element in its proper freelist */
1244 [ # # ]: 37916 : SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1245 : 37916 : hctl->freeList[freelist_idx].nentries++;
1246 : 37916 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1247 : :
3209 tgl@sss.pgh.pa.us 1248 : 37916 : return newElement;
1249 : : }
1250 : :
3695 rhaas@postgresql.org 1251 : 4243 : SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
1252 : : }
1253 : :
1254 : : /* no elements available to borrow either, so out of memory */
3209 tgl@sss.pgh.pa.us 1255 :UBC 0 : return NULL;
1256 : : }
1257 : : }
1258 : :
1259 : : /* remove entry from freelist, bump nentries */
3695 rhaas@postgresql.org 1260 :CBC 36048035 : hctl->freeList[freelist_idx].freeList = newElement->link;
1261 : 36048035 : hctl->freeList[freelist_idx].nentries++;
1262 : :
3854 1263 [ + + ]: 36048035 : if (IS_PARTITIONED(hctl))
3695 1264 : 6904968 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1265 : :
7227 tgl@sss.pgh.pa.us 1266 : 36048035 : return newElement;
1267 : : }
1268 : :
1269 : : /*
1270 : : * hash_get_num_entries -- get the number of entries in a hashtable
1271 : : */
1272 : : int64
1273 : 68807 : hash_get_num_entries(HTAB *hashp)
1274 : : {
1275 : : int i;
256 michael@paquier.xyz 1276 :GNC 68807 : int64 sum = hashp->hctl->freeList[0].nentries;
1277 : :
1278 : : /*
1279 : : * We currently don't bother with acquiring the mutexes; it's only
1280 : : * sensible to call this function if you've got lock on all partitions of
1281 : : * the table.
1282 : : */
3209 tgl@sss.pgh.pa.us 1283 [ + + ]:CBC 68807 : if (IS_PARTITIONED(hashp->hctl))
1284 : : {
1285 [ + + ]: 68544 : for (i = 1; i < NUM_FREELISTS; i++)
1286 : 66402 : sum += hashp->hctl->freeList[i].nentries;
1287 : : }
1288 : :
3695 rhaas@postgresql.org 1289 : 68807 : return sum;
1290 : : }
1291 : :
1292 : : /*
1293 : : * hash_seq_init/_search/_term
1294 : : * Sequentially search through hash table and return
1295 : : * all the elements one by one, return NULL when no more.
1296 : : *
1297 : : * hash_seq_term should be called if and only if the scan is abandoned before
1298 : : * completion; if hash_seq_search returns NULL then it has already done the
1299 : : * end-of-scan cleanup.
1300 : : *
1301 : : * NOTE: caller may delete the returned element before continuing the scan.
1302 : : * However, deleting any other element while the scan is in progress is
1303 : : * UNDEFINED (it might be the one that curIndex is pointing at!). Also,
1304 : : * if elements are added to the table while the scan is in progress, it is
1305 : : * unspecified whether they will be visited by the scan or not.
1306 : : *
1307 : : * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
1308 : : * worry about hash_seq_term cleanup, if the hashtable is first locked against
1309 : : * further insertions by calling hash_freeze.
1310 : : *
1311 : : * NOTE: to use this with a partitioned hashtable, caller had better hold
1312 : : * at least shared lock on all partitions of the table throughout the scan!
1313 : : * We can cope with insertions or deletions by our own backend, but *not*
1314 : : * with concurrent insertions or deletions by another.
1315 : : */
1316 : : void
9254 tgl@sss.pgh.pa.us 1317 : 2826413 : hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
1318 : : {
1319 : 2826413 : status->hashp = hashp;
1320 : 2826413 : status->curBucket = 0;
8982 1321 : 2826413 : status->curEntry = NULL;
636 akorotkov@postgresql 1322 : 2826413 : status->hasHashvalue = false;
6949 tgl@sss.pgh.pa.us 1323 [ + - ]: 2826413 : if (!hashp->frozen)
1324 : 2826413 : register_seq_scan(hashp);
9254 1325 : 2826413 : }
1326 : :
1327 : : /*
1328 : : * Same as above but scan by the given hash value.
1329 : : * See also hash_seq_search().
1330 : : *
1331 : : * NOTE: the default hash function doesn't match syscache hash function.
1332 : : * Thus, if you're going to use this function in syscache callback, make sure
1333 : : * you're using custom hash function. See relatt_cache_syshash()
1334 : : * for example.
1335 : : */
1336 : : void
636 akorotkov@postgresql 1337 : 1282391 : hash_seq_init_with_hash_value(HASH_SEQ_STATUS *status, HTAB *hashp,
1338 : : uint32 hashvalue)
1339 : : {
1340 : : HASHBUCKET *bucketPtr;
1341 : :
1342 : 1282391 : hash_seq_init(status, hashp);
1343 : :
1344 : 1282391 : status->hasHashvalue = true;
1345 : 1282391 : status->hashvalue = hashvalue;
1346 : :
1347 : 1282391 : status->curBucket = hash_initial_lookup(hashp, hashvalue, &bucketPtr);
1348 : 1282391 : status->curEntry = *bucketPtr;
1349 : 1282391 : }
1350 : :
1351 : : void *
9254 tgl@sss.pgh.pa.us 1352 : 37017634 : hash_seq_search(HASH_SEQ_STATUS *status)
1353 : : {
1354 : : HTAB *hashp;
1355 : : HASHHDR *hctl;
1356 : : uint32 max_bucket;
1357 : : int64 segment_num;
1358 : : int64 segment_ndx;
1359 : : HASHSEGMENT segp;
1360 : : uint32 curBucket;
1361 : : HASHELEMENT *curElem;
1362 : :
636 akorotkov@postgresql 1363 [ + + ]: 37017634 : if (status->hasHashvalue)
1364 : : {
1365 : : /*
1366 : : * Scan entries only in the current bucket because only this bucket
1367 : : * can contain entries with the given hash value.
1368 : : */
1369 [ + + ]: 1452394 : while ((curElem = status->curEntry) != NULL)
1370 : : {
1371 : 170003 : status->curEntry = curElem->link;
1372 [ + + ]: 170003 : if (status->hashvalue != curElem->hashvalue)
1373 : 163460 : continue;
1374 : 6543 : return (void *) ELEMENTKEY(curElem);
1375 : : }
1376 : :
1377 : 1282391 : hash_seq_term(status);
1378 : 1282391 : return NULL;
1379 : : }
1380 : :
7669 tgl@sss.pgh.pa.us 1381 [ + + ]: 35728700 : if ((curElem = status->curEntry) != NULL)
1382 : : {
1383 : : /* Continuing scan of curBucket... */
1384 : 9444205 : status->curEntry = curElem->link;
7507 bruce@momjian.us 1385 [ + + ]: 9444205 : if (status->curEntry == NULL) /* end of this bucket */
7669 tgl@sss.pgh.pa.us 1386 : 6847951 : ++status->curBucket;
523 peter@eisentraut.org 1387 : 9444205 : return ELEMENTKEY(curElem);
1388 : : }
1389 : :
1390 : : /*
1391 : : * Search for next nonempty bucket starting at curBucket.
1392 : : */
7669 tgl@sss.pgh.pa.us 1393 : 26284495 : curBucket = status->curBucket;
1394 : 26284495 : hashp = status->hashp;
1395 : 26284495 : hctl = hashp->hctl;
1396 : 26284495 : max_bucket = hctl->max_bucket;
1397 : :
1398 [ + + ]: 26284495 : if (curBucket > max_bucket)
1399 : : {
6949 1400 : 50376 : hash_seq_term(status);
7507 bruce@momjian.us 1401 : 50376 : return NULL; /* search is done */
1402 : : }
1403 : :
1404 : : /*
1405 : : * first find the right segment in the table directory.
1406 : : */
35 heikki.linnakangas@i 1407 :GNC 26234119 : segment_num = curBucket >> HASH_SEGSIZE_SHIFT;
1408 : 26234119 : segment_ndx = MOD(curBucket, HASH_SEGSIZE);
1409 : :
7669 tgl@sss.pgh.pa.us 1410 :CBC 26234119 : segp = hashp->dir[segment_num];
1411 : :
1412 : : /*
1413 : : * Pick up the first item in this bucket's chain. If chain is not empty
1414 : : * we can begin searching it. Otherwise we have to advance to find the
1415 : : * next nonempty bucket. We try to optimize that case since searching a
1416 : : * near-empty hashtable has to iterate this loop a lot.
1417 : : */
1418 [ + + ]: 170401907 : while ((curElem = segp[segment_ndx]) == NULL)
1419 : : {
1420 : : /* empty bucket, advance to next */
1421 [ + + ]: 145640831 : if (++curBucket > max_bucket)
1422 : : {
1423 : 1473043 : status->curBucket = curBucket;
6949 1424 : 1473043 : hash_seq_term(status);
7507 bruce@momjian.us 1425 : 1473043 : return NULL; /* search is done */
1426 : : }
35 heikki.linnakangas@i 1427 [ + + ]:GNC 144167788 : if (++segment_ndx >= HASH_SEGSIZE)
1428 : : {
7669 tgl@sss.pgh.pa.us 1429 :CBC 380364 : segment_num++;
1430 : 380364 : segment_ndx = 0;
1431 : 380364 : segp = hashp->dir[segment_num];
1432 : : }
1433 : : }
1434 : :
1435 : : /* Begin scan of curBucket... */
1436 : 24761076 : status->curEntry = curElem->link;
3240 1437 [ + + ]: 24761076 : if (status->curEntry == NULL) /* end of this bucket */
7669 1438 : 17913040 : ++curBucket;
1439 : 24761076 : status->curBucket = curBucket;
523 peter@eisentraut.org 1440 : 24761076 : return ELEMENTKEY(curElem);
1441 : : }
1442 : :
1443 : : void
6949 tgl@sss.pgh.pa.us 1444 : 2826110 : hash_seq_term(HASH_SEQ_STATUS *status)
1445 : : {
1446 [ + - ]: 2826110 : if (!status->hashp->frozen)
1447 : 2826110 : deregister_seq_scan(status->hashp);
1448 : 2826110 : }
1449 : :
1450 : : /*
1451 : : * hash_freeze
1452 : : * Freeze a hashtable against future insertions (deletions are
1453 : : * still allowed)
1454 : : *
1455 : : * The reason for doing this is that by preventing any more bucket splits,
1456 : : * we no longer need to worry about registering hash_seq_search scans,
1457 : : * and thus caller need not be careful about ensuring hash_seq_term gets
1458 : : * called at the right times.
1459 : : *
1460 : : * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
1461 : : * with active scans (since hash_seq_term would then do the wrong thing).
1462 : : */
1463 : : void
6949 tgl@sss.pgh.pa.us 1464 :UBC 0 : hash_freeze(HTAB *hashp)
1465 : : {
1466 [ # # ]: 0 : if (hashp->isshared)
6811 1467 [ # # ]: 0 : elog(ERROR, "cannot freeze shared hashtable \"%s\"", hashp->tabname);
6949 1468 [ # # # # ]: 0 : if (!hashp->frozen && has_seq_scans(hashp))
6811 1469 [ # # ]: 0 : elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
1470 : : hashp->tabname);
6949 1471 : 0 : hashp->frozen = true;
1472 : 0 : }
1473 : :
1474 : :
1475 : : /********************************* UTILITIES ************************/
1476 : :
1477 : : /*
1478 : : * Expand the table by adding one more hash bucket.
1479 : : * Returns true on success; false if the directory cannot be enlarged or a new segment cannot be allocated, in which case no bucket is added. */
1480 : : static bool
10466 bruce@momjian.us 1481 :CBC 449957 : expand_table(HTAB *hashp)
1482 : : {
8958 1483 : 449957 : HASHHDR *hctl = hashp->hctl;
1484 : : HASHSEGMENT old_seg,
1485 : : new_seg;
1486 : : int64 old_bucket,
1487 : : new_bucket;
1488 : : int64 new_segnum,
1489 : : new_segndx;
1490 : : int64 old_segnum,
1491 : : old_segndx;
1492 : : HASHBUCKET *oldlink,
1493 : : *newlink;
1494 : : HASHBUCKET currElement,
1495 : : nextElement;
1496 : : /* partitioned tables must never get here: they do not split buckets on-the-fly */
7227 tgl@sss.pgh.pa.us 1497 [ - + ]: 449957 : Assert(!IS_PARTITIONED(hctl));
1498 : :
1499 : : #ifdef HASH_STATISTICS
1500 : : hctl->expansions++;
1501 : : #endif
1502 : : /* work out where the new bucket lives: its number, its directory segment, and its slot in that segment */
9934 1503 : 449957 : new_bucket = hctl->max_bucket + 1;
35 heikki.linnakangas@i 1504 :GNC 449957 : new_segnum = new_bucket >> HASH_SEGSIZE_SHIFT;
1505 : 449957 : new_segndx = MOD(new_bucket, HASH_SEGSIZE);
1506 : :
10467 bruce@momjian.us 1507 [ + + ]:CBC 449957 : if (new_segnum >= hctl->nsegs)
1508 : : {
1509 : : /* Allocate new segment if necessary -- could fail if dir full */
1510 [ - + ]: 1503 : if (new_segnum >= hctl->dsize)
9842 bruce@momjian.us 1511 [ # # ]:UBC 0 : if (!dir_realloc(hashp))
8982 tgl@sss.pgh.pa.us 1512 : 0 : return false;
10467 bruce@momjian.us 1513 [ - + ]:CBC 1503 : if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
8982 tgl@sss.pgh.pa.us 1514 :UBC 0 : return false;
10467 bruce@momjian.us 1515 :CBC 1503 : hctl->nsegs++;
1516 : : }
1517 : :
1518 : : /* OK, we created a new bucket */
9934 tgl@sss.pgh.pa.us 1519 : 449957 : hctl->max_bucket++;
1520 : :
1521 : : /*
1522 : : * *Before* changing masks, find old bucket corresponding to same hash
1523 : : * values; values in that bucket may need to be relocated to new bucket.
1524 : : * Note that new_bucket is certainly larger than low_mask at this point,
1525 : : * so we can skip the first step of the regular hash mask calc.
1526 : : */
9836 1527 : 449957 : old_bucket = (new_bucket & hctl->low_mask);
1528 : :
1529 : : /*
1530 : : * If we crossed a power of 2, readjust masks.
1531 : : */
8823 1532 [ + + ]: 449957 : if ((uint32) new_bucket > hctl->high_mask)
1533 : : {
10467 bruce@momjian.us 1534 : 2876 : hctl->low_mask = hctl->high_mask;
8823 tgl@sss.pgh.pa.us 1535 : 2876 : hctl->high_mask = (uint32) new_bucket | hctl->low_mask;
1536 : : }
1537 : :
1538 : : /*
1539 : : * Relocate records to the new bucket. NOTE: because of the way the hash
1540 : : * masking is done in calc_bucket, only one old bucket can need to be
1541 : : * split at this point. With a different way of reducing the hash value,
1542 : : * that might not be true!
1543 : : */
35 heikki.linnakangas@i 1544 :GNC 449957 : old_segnum = old_bucket >> HASH_SEGSIZE_SHIFT;
1545 : 449957 : old_segndx = MOD(old_bucket, HASH_SEGSIZE);
1546 : :
8982 tgl@sss.pgh.pa.us 1547 :CBC 449957 : old_seg = hashp->dir[old_segnum];
1548 : 449957 : new_seg = hashp->dir[new_segnum];
1549 : : /* oldlink/newlink always address the link field where the next kept/moved element must be attached */
1550 : 449957 : oldlink = &old_seg[old_segndx];
1551 : 449957 : newlink = &new_seg[new_segndx];
1552 : : /* walk the original chain once, appending each element to the old or the new bucket's chain */
1553 : 449957 : for (currElement = *oldlink;
1554 [ + + ]: 1085744 : currElement != NULL;
1555 : 635787 : currElement = nextElement)
1556 : : {
1557 : 635787 : nextElement = currElement->link;
256 michael@paquier.xyz 1558 [ + + ]:GNC 635787 : if ((int64) calc_bucket(hctl, currElement->hashvalue) == old_bucket)
1559 : : {
8982 tgl@sss.pgh.pa.us 1560 :CBC 317053 : *oldlink = currElement;
1561 : 317053 : oldlink = &currElement->link;
1562 : : }
1563 : : else
1564 : : {
1565 : 318734 : *newlink = currElement;
1566 : 318734 : newlink = &currElement->link;
1567 : : }
1568 : : }
1569 : : /* don't forget to terminate the rebuilt hash chains... */
1570 : 449957 : *oldlink = NULL;
1571 : 449957 : *newlink = NULL;
1572 : :
1573 : 449957 : return true;
1574 : : }
1575 : :
1576 : : /* dir_realloc: double the segment directory. Returns false when the table has a directory size limit (max_dsize is not NO_MAX_DSIZE) or the allocation fails; true once the enlarged directory is installed. */
1577 : : static bool
10466 bruce@momjian.us 1578 :UBC 0 : dir_realloc(HTAB *hashp)
1579 : : {
1580 : : HASHSEGMENT *p;
1581 : : HASHSEGMENT *old_p;
1582 : : int64 new_dsize;
1583 : : int64 old_dirsize;
1584 : : int64 new_dirsize;
1585 : :
10467 1586 [ # # ]: 0 : if (hashp->hctl->max_dsize != NO_MAX_DSIZE)
8982 tgl@sss.pgh.pa.us 1587 : 0 : return false;
1588 : :
1589 : : /* Reallocate directory */
9934 1590 : 0 : new_dsize = hashp->hctl->dsize << 1;
8982 1591 : 0 : old_dirsize = hashp->hctl->dsize * sizeof(HASHSEGMENT);
1592 : 0 : new_dirsize = new_dsize * sizeof(HASHSEGMENT);
1593 : : /* remember the old array so its contents can be copied and it can be freed afterwards */
1594 : 0 : old_p = hashp->dir;
35 heikki.linnakangas@i 1595 :UNC 0 : p = (HASHSEGMENT *) hashp->alloc((Size) new_dirsize, hashp->alloc_arg);
1596 : :
10467 bruce@momjian.us 1597 [ # # ]:UBC 0 : if (p != NULL)
1598 : : { /* copy the existing entries, zero the added half, then publish the new array in both hctl and hashp */
8982 tgl@sss.pgh.pa.us 1599 : 0 : memcpy(p, old_p, old_dirsize);
1600 [ # # # # : 0 : MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
# # # # #
# ]
31 heikki.linnakangas@i 1601 :UNC 0 : hashp->hctl->dir = p;
8982 tgl@sss.pgh.pa.us 1602 :UBC 0 : hashp->dir = p;
9934 1603 : 0 : hashp->hctl->dsize = new_dsize;
1604 : :
1605 : : /* XXX assume the allocator is palloc, so we know how to free */
7669 1606 [ # # ]: 0 : Assert(hashp->alloc == DynaHashAlloc);
1607 : 0 : pfree(old_p);
1608 : :
8982 1609 : 0 : return true;
1610 : : }
1611 : : /* allocation failed: the old directory is left in place */
1612 : 0 : return false;
1613 : : }
1614 : :
1615 : : /* seg_alloc: allocate one segment holding HASH_SEGSIZE bucket headers and zero it. Returns the segment, or NULL if the table's allocator fails. */
1616 : : static HASHSEGMENT
10466 bruce@momjian.us 1617 :CBC 961184 : seg_alloc(HTAB *hashp)
1618 : : {
1619 : : HASHSEGMENT segp;
1620 : :
35 heikki.linnakangas@i 1621 :GNC 961184 : segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * HASH_SEGSIZE, hashp->alloc_arg);
1622 : :
10467 bruce@momjian.us 1623 [ - + ]:CBC 961184 : if (!segp)
8982 tgl@sss.pgh.pa.us 1624 :UBC 0 : return NULL;
1625 : : /* zero-fill the whole segment before handing it back */
35 heikki.linnakangas@i 1626 [ + - + - :GNC 961184 : MemSet(segp, 0, sizeof(HASHBUCKET) * HASH_SEGSIZE);
+ - - + -
- ]
1627 : :
8982 tgl@sss.pgh.pa.us 1628 :CBC 961184 : return segp;
1629 : : }
1630 : :
1631 : : /*
1632 : : * allocate nelem new elements and link them into free list number freelist_idx
1633 : : * Returns false if the table is fixed-size (hctl->isfixed) or the allocation fails; true once the elements are on the free list. */
1634 : : static bool
396 tomas.vondra@postgre 1635 : 761971 : element_alloc(HTAB *hashp, int nelem, int freelist_idx)
1636 : : {
3695 rhaas@postgresql.org 1637 : 761971 : HASHHDR *hctl = hashp->hctl;
1638 : : Size elementSize;
1639 : : Size requestSize;
1640 : : char *allocedBlock;
1641 : : HASHELEMENT *firstElement;
1642 : : HASHELEMENT *tmpElement;
1643 : : HASHELEMENT *prevElement;
1644 : : int i;
1645 : :
284 tgl@sss.pgh.pa.us 1646 [ + + ]:GNC 761971 : if (hctl->isfixed)
396 tomas.vondra@postgre 1647 :GBC 37916 : return false;
1648 : :
1649 : : /* Each element has a HASHELEMENT header plus user data. */
396 tomas.vondra@postgre 1650 :CBC 724055 : elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
1651 : : /* all nelem elements come from one contiguous allocation */
276 tgl@sss.pgh.pa.us 1652 :GNC 724055 : requestSize = nelem * elementSize;
1653 : :
1654 : : /* Add space for slist_node list link if we need one. */
1655 : : #ifdef USE_VALGRIND
1656 : : if (!hashp->isshared)
1657 : : requestSize += MAXALIGN(sizeof(slist_node));
1658 : : #endif
1659 : :
1660 : : /* Allocate the memory. */
35 heikki.linnakangas@i 1661 : 724055 : allocedBlock = hashp->alloc(requestSize, hashp->alloc_arg);
1662 : :
276 tgl@sss.pgh.pa.us 1663 [ - + ]: 724055 : if (!allocedBlock)
396 tomas.vondra@postgre 1664 :UBC 0 : return false;
1665 : :
1666 : : /*
1667 : : * If USE_VALGRIND, each allocated block of elements of a non-shared
1668 : : * hashtable is chained into a list, so that Valgrind won't think it's
1669 : : * been leaked.
1670 : : */
1671 : : #ifdef USE_VALGRIND
1672 : : if (hashp->isshared)
1673 : : firstElement = (HASHELEMENT *) allocedBlock;
1674 : : else
1675 : : {
1676 : : slist_push_head(&hashp->element_blocks, (slist_node *) allocedBlock);
1677 : : firstElement = (HASHELEMENT *) (allocedBlock + MAXALIGN(sizeof(slist_node)));
1678 : : }
1679 : : #else
276 tgl@sss.pgh.pa.us 1680 :GNC 724055 : firstElement = (HASHELEMENT *) allocedBlock;
1681 : : #endif
1682 : :
1683 : : /* prepare to link all the new entries into the freelist */
7227 tgl@sss.pgh.pa.us 1684 :CBC 724055 : prevElement = NULL;
1685 : 724055 : tmpElement = firstElement;
7889 1686 [ + + ]: 97585038 : for (i = 0; i < nelem; i++)
1687 : : {
7227 1688 : 96860983 : tmpElement->link = prevElement;
1689 : 96860983 : prevElement = tmpElement;
8982 1690 : 96860983 : tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
1691 : : }
1692 : : /* after the loop prevElement is the last element of the block and heads the new chain; firstElement is its tail */
1693 : : /* if partitioned, must lock to touch freeList */
3854 rhaas@postgresql.org 1694 [ + + ]: 724055 : if (IS_PARTITIONED(hctl))
3695 1695 [ - + ]: 198560 : SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
1696 : :
1697 : : /* freelist could be nonempty if two backends did this concurrently */
1698 : 724055 : firstElement->link = hctl->freeList[freelist_idx].freeList;
1699 : 724055 : hctl->freeList[freelist_idx].freeList = prevElement;
1700 : :
3854 1701 [ + + ]: 724055 : if (IS_PARTITIONED(hctl))
3695 1702 : 198560 : SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
1703 : :
396 tomas.vondra@postgre 1704 : 724055 : return true;
1705 : : }
1706 : :
1707 : : /*
1708 : : * Do initial lookup of a bucket for the given hash value, retrieving its
1709 : : * bucket number and its hash bucket.
1710 : : * The bucket number is the return value; *bucketptr receives the address of the bucket's slot in its segment. A NULL segment is reported through hash_corrupted(). */
1711 : : static inline uint32
781 michael@paquier.xyz 1712 : 243546744 : hash_initial_lookup(HTAB *hashp, uint32 hashvalue, HASHBUCKET **bucketptr)
1713 : : {
1714 : 243546744 : HASHHDR *hctl = hashp->hctl;
1715 : : HASHSEGMENT segp;
1716 : : int64 segment_num;
1717 : : int64 segment_ndx;
1718 : : uint32 bucket;
1719 : :
1720 : 243546744 : bucket = calc_bucket(hctl, hashvalue);
1721 : : /* split the bucket number into a directory index and a slot within that segment */
35 heikki.linnakangas@i 1722 :GNC 243546744 : segment_num = bucket >> HASH_SEGSIZE_SHIFT;
1723 : 243546744 : segment_ndx = MOD(bucket, HASH_SEGSIZE);
1724 : :
781 michael@paquier.xyz 1725 :CBC 243546744 : segp = hashp->dir[segment_num];
1726 : :
1727 [ - + ]: 243546744 : if (segp == NULL)
781 michael@paquier.xyz 1728 :UBC 0 : hash_corrupted(hashp);
1729 : :
781 michael@paquier.xyz 1730 :CBC 243546744 : *bucketptr = &segp[segment_ndx];
1731 : 243546744 : return bucket;
1732 : : }
1733 : :
1734 : : /* complain when we have detected a corrupted hashtable: elog at PANIC for a shared table, at FATAL otherwise */
1735 : : static void
8978 tgl@sss.pgh.pa.us 1736 :UBC 0 : hash_corrupted(HTAB *hashp)
1737 : : {
1738 : : /*
1739 : : * If the corruption is in a shared hashtable, we'd better force a
1740 : : * systemwide restart. Otherwise, just shut down this one backend.
1741 : : */
1742 [ # # ]: 0 : if (hashp->isshared)
8320 1743 [ # # ]: 0 : elog(PANIC, "hash table \"%s\" corrupted", hashp->tabname);
1744 : : else
1745 [ # # ]: 0 : elog(FATAL, "hash table \"%s\" corrupted", hashp->tabname);
1746 : : }
1747 : :
1748 : : /* calculate ceil(log base 2) of num; num above PG_INT64_MAX / 2 is first clamped to that bound */
1749 : : static int
256 michael@paquier.xyz 1750 :GNC 869289 : my_log2(int64 num)
1751 : : {
1752 : : /*
1753 : : * guard against too-large input, which would be invalid for
1754 : : * pg_ceil_log2_*()
1755 : : */
1756 [ - + ]: 869289 : if (num > PG_INT64_MAX / 2)
256 michael@paquier.xyz 1757 :UNC 0 : num = PG_INT64_MAX / 2;
1758 : :
2218 drowley@postgresql.o 1759 :CBC 869289 : return pg_ceil_log2_64(num);
1760 : : }
1761 : :
1762 : : /* calculate first power of 2 >= num, bounded to what will fit in an int64 */
1763 : : static int64
256 michael@paquier.xyz 1764 :GNC 27030 : next_pow2_int64(int64 num)
1765 : : {
1766 : : /* my_log2's internal range check is sufficient */
16 peter@eisentraut.org 1767 : 27030 : return INT64CONST(1) << my_log2(num);
1768 : : }
1769 : :
1770 : : /* calculate first power of 2 >= num, bounded to what will fit in an int */
1771 : : static int
256 michael@paquier.xyz 1772 : 842259 : next_pow2_int(int64 num)
1773 : : { /* clamp num to INT_MAX / 2 before shifting, so the result stays bounded as the header promises */
4893 tgl@sss.pgh.pa.us 1774 [ - + ]:CBC 842259 : if (num > INT_MAX / 2)
4893 tgl@sss.pgh.pa.us 1775 :UBC 0 : num = INT_MAX / 2;
4893 tgl@sss.pgh.pa.us 1776 :CBC 842259 : return 1 << my_log2(num);
1777 : : }
1778 : :
1779 : :
1780 : : /************************* SEQ SCAN TRACKING ************************/
1781 : :
1782 : : /*
1783 : : * We track active hash_seq_search scans here. The need for this mechanism
1784 : : * comes from the fact that a scan will get confused if a bucket split occurs
1785 : : * while it's in progress: it might visit entries twice, or even miss some
1786 : : * entirely (if it's partway through the same bucket that splits). Hence
1787 : : * we want to inhibit bucket splits if there are any active scans on the
1788 : : * table being inserted into. This is a fairly rare case in current usage,
1789 : : * so just postponing the split until the next insertion seems sufficient.
1790 : : *
1791 : : * Given present usages of the function, only a few scans are likely to be
1792 : : * open concurrently; so a finite-size stack of open scans seems sufficient,
1793 : : * and we don't worry that linear search is too slow. Note that we do
1794 : : * allow multiple scans of the same hashtable to be open concurrently.
1795 : : *
1796 : : * This mechanism can support concurrent scan and insertion in a shared
1797 : : * hashtable if it's the same backend doing both. It would fail otherwise,
1798 : : * but locking reasons seem to preclude any such scenario anyway, so we don't
1799 : : * worry.
1800 : : *
1801 : : * This arrangement is reasonably robust if a transient hashtable is deleted
1802 : : * without notifying us. The absolute worst case is we might inhibit splits
1803 : : * in another table created later at exactly the same address. We will give
1804 : : * a warning at transaction end for reference leaks, so any bugs leading to
1805 : : * lack of notification should be easy to catch.
1806 : : */
1807 : :
1808 : : #define MAX_SEQ_SCANS 100 /* capacity of the two tracking arrays below */
1809 : :
1810 : : static HTAB *seq_scan_tables[MAX_SEQ_SCANS]; /* tables being scanned */
1811 : : static int seq_scan_level[MAX_SEQ_SCANS]; /* transaction nest level at which each scan was registered */
1812 : : static int num_seq_scans = 0; /* number of entries currently in use in both arrays */
1813 : :
1814 : :
1815 : : /* Register a table as having an active hash_seq_search scan; raises elog(ERROR) if MAX_SEQ_SCANS scans are already open */
1816 : : static void
6949 1817 : 2826413 : register_seq_scan(HTAB *hashp)
1818 : : {
1819 [ - + ]: 2826413 : if (num_seq_scans >= MAX_SEQ_SCANS)
6811 tgl@sss.pgh.pa.us 1820 [ # # ]:UBC 0 : elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
1821 : : hashp->tabname);
6949 tgl@sss.pgh.pa.us 1822 :CBC 2826413 : seq_scan_tables[num_seq_scans] = hashp; /* push onto the top of the stack */
1823 : 2826413 : seq_scan_level[num_seq_scans] = GetCurrentTransactionNestLevel(); /* remembered for subtransaction cleanup */
1824 : 2826413 : num_seq_scans++;
1825 : 2826413 : }
1826 : :
1827 : : /* Deregister an active scan: remove one entry for hashp (the one nearest the stack top), or raise elog(ERROR) if none is registered */
1828 : : static void
1829 : 2826110 : deregister_seq_scan(HTAB *hashp)
1830 : : {
1831 : : int i;
1832 : :
1833 : : /* Search backward since it's most likely at the stack top */
1834 [ + - ]: 2826110 : for (i = num_seq_scans - 1; i >= 0; i--)
1835 : : {
1836 [ + - ]: 2826110 : if (seq_scan_tables[i] == hashp)
1837 : : { /* fill the hole with the top entry and shrink; array order is not preserved */
1838 : 2826110 : seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1];
1839 : 2826110 : seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1840 : 2826110 : num_seq_scans--;
1841 : 2826110 : return;
1842 : : }
1843 : : }
6949 tgl@sss.pgh.pa.us 1844 [ # # ]:UBC 0 : elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
1845 : : hashp->tabname);
1846 : : }
1847 : :
1848 : : /* Check if a table has any active scan (linear search of the registered scans) */
1849 : : static bool
6949 tgl@sss.pgh.pa.us 1850 :CBC 449957 : has_seq_scans(HTAB *hashp)
1851 : : {
1852 : : int i;
1853 : :
1854 [ + + ]: 450527 : for (i = 0; i < num_seq_scans; i++)
1855 : : {
1856 [ - + ]: 570 : if (seq_scan_tables[i] == hashp)
6949 tgl@sss.pgh.pa.us 1857 :UBC 0 : return true;
1858 : : }
6949 tgl@sss.pgh.pa.us 1859 :CBC 449957 : return false;
1860 : : }
1861 : :
1862 : : /* Clean up any open scans at end of transaction; isCommit selects whether leftover scans are reported as leaks */
1863 : : void
1864 : 424504 : AtEOXact_HashTables(bool isCommit)
1865 : : {
1866 : : /*
1867 : : * During abort cleanup, open scans are expected; just silently clean 'em
1868 : : * out. An open scan at commit means someone forgot a hash_seq_term()
1869 : : * call, so complain.
1870 : : *
1871 : : * Note: it's tempting to try to print the tabname here, but refrain for
1872 : : * fear of touching deallocated memory. This isn't a user-facing message
1873 : : * anyway, so it needn't be pretty.
1874 : : */
1875 [ + + ]: 424504 : if (isCommit)
1876 : : {
1877 : : int i;
1878 : :
1879 [ - + ]: 389127 : for (i = 0; i < num_seq_scans; i++)
1880 : : {
6949 tgl@sss.pgh.pa.us 1881 [ # # ]:UBC 0 : elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1882 : : seq_scan_tables[i]);
1883 : : }
1884 : : }
6949 tgl@sss.pgh.pa.us 1885 :CBC 424504 : num_seq_scans = 0; /* forget every registered scan, on commit and abort alike */
1886 : 424504 : }
1887 : :
1888 : : /* Clean up any open scans at end of subtransaction: drops entries registered at nest level nestDepth or deeper, warning about each one if isCommit */
1889 : : void
1890 : 12658 : AtEOSubXact_HashTables(bool isCommit, int nestDepth)
1891 : : {
1892 : : int i;
1893 : :
1894 : : /*
1895 : : * Search backward to make cleanup easy. Note we must check all entries,
1896 : : * not only those at the end of the array, because the deletion technique
1897 : : * doesn't keep them in order.
1898 : : */
1899 [ + + ]: 12661 : for (i = num_seq_scans - 1; i >= 0; i--)
1900 : : {
6949 tgl@sss.pgh.pa.us 1901 [ + - ]:GBC 3 : if (seq_scan_level[i] >= nestDepth)
1902 : : {
1903 [ - + ]: 3 : if (isCommit)
6949 tgl@sss.pgh.pa.us 1904 [ # # ]:UBC 0 : elog(WARNING, "leaked hash_seq_search scan for hash table %p",
1905 : : seq_scan_tables[i]);
6949 tgl@sss.pgh.pa.us 1906 :GBC 3 : seq_scan_tables[i] = seq_scan_tables[num_seq_scans - 1]; /* overwrite with the top entry, then shrink */
1907 : 3 : seq_scan_level[i] = seq_scan_level[num_seq_scans - 1];
1908 : 3 : num_seq_scans--;
1909 : : }
1910 : : }
6949 tgl@sss.pgh.pa.us 1911 :CBC 12658 : }
|