Age Owner Branch data TLA Line data Source code
1 : : /*
2 : : * hashfn_unstable.h
3 : : *
4 : : * Building blocks for creating fast inlineable hash functions. The
5 : : * functions in this file are not guaranteed to be stable between versions,
6 : : * and may differ by hardware platform. Hence they must not be used in
7 : : * indexes or other on-disk structures. See hashfn.h if you need stability.
8 : : *
9 : : *
10 : : * Portions Copyright (c) 2024-2026, PostgreSQL Global Development Group
11 : : *
12 : : * src/include/common/hashfn_unstable.h
13 : : */
14 : : #ifndef HASHFN_UNSTABLE_H
15 : : #define HASHFN_UNSTABLE_H
16 : :
17 : :
18 : : /*
19 : : * fasthash is a modification of code taken from
20 : : * https://code.google.com/archive/p/fast-hash/source/default/source
21 : : * under the terms of the MIT license. The original copyright
22 : : * notice follows:
23 : : */
24 : :
25 : : /*
26 : : * The MIT License
27 : : *
28 : : * Copyright (C) 2012 Zilong Tan (eric.zltan@gmail.com)
29 : : *
30 : : * Permission is hereby granted, free of charge, to any person
31 : : * obtaining a copy of this software and associated documentation
32 : : * files (the "Software"), to deal in the Software without
33 : : * restriction, including without limitation the rights to use, copy,
34 : : * modify, merge, publish, distribute, sublicense, and/or sell copies
35 : : * of the Software, and to permit persons to whom the Software is
36 : : * furnished to do so, subject to the following conditions:
37 : : *
38 : : * The above copyright notice and this permission notice shall be
39 : : * included in all copies or substantial portions of the Software.
40 : : *
41 : : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
42 : : * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
43 : : * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
44 : : * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
45 : : * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
46 : : * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
47 : : * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
48 : : * SOFTWARE.
49 : : */
50 : :
51 : : /*
52 : : * fasthash as implemented here has two interfaces:
53 : : *
54 : : * 1) Standalone functions that take a single input.
55 : : *
56 : : * 2) Incremental interface. This can used for incorporating multiple
57 : : * inputs. First, initialize the hash state (here with a zero seed):
58 : : *
59 : : * fasthash_state hs;
60 : : * fasthash_init(&hs, 0);
61 : : *
62 : : * Next, accumulate input into the hash state.
63 : : * If the inputs are of types that can be trivially cast to uint64, it's
64 : : * sufficient to do:
65 : : *
66 : : * hs.accum = value1;
67 : : * fasthash_combine(&hs);
68 : : * hs.accum = value2;
69 : : * fasthash_combine(&hs);
70 : : * ...
71 : : *
72 : : * For longer or variable-length input, fasthash_accum() is a more
73 : : * flexible, but more verbose method. The standalone functions use this
74 : : * internally, so see fasthash64() for an example of this.
75 : : *
76 : : * After all inputs have been mixed in, finalize the hash and optionally
77 : : * reduce to 32 bits. If all inputs are fixed-length, it's sufficient
78 : : * to pass zero for the tweak:
79 : : *
80 : : * hashcode = fasthash_final32(&hs, 0);
81 : : *
82 : : * For variable length input, experimentation has found that SMHasher
83 : : * fails unless we pass the length for the tweak. When accumulating
84 : : * multiple varlen values, it's probably safest to calculate a tweak
85 : : * such that the bits of all individual lengths are present, for example:
86 : : *
87 : : * lengths = len1 + (len2 << 10) + (len3 << 20);
88 : : * hashcode = fasthash_final32(&hs, lengths);
89 : : *
90 : : * The incremental interface allows an optimization for NUL-terminated
91 : : * C strings:
92 : : *
93 : : * len = fasthash_accum_cstring(&hs, str);
94 : : * hashcode = fasthash_final32(&hs, len);
95 : : *
96 : : * By computing the length on-the-fly, we can avoid needing a strlen()
97 : : * call to tell us how many bytes to hash.
98 : : */
99 : :
100 : :
101 : : typedef struct fasthash_state
102 : : {
103 : : /* staging area for chunks of input */
104 : : uint64 accum;
105 : :
106 : : uint64 hash;
107 : : } fasthash_state;
108 : :
109 : : #define FH_SIZEOF_ACCUM sizeof(uint64)
110 : :
111 : :
112 : : /*
113 : : * Initialize the hash state.
114 : : *
115 : : * 'seed' can be zero.
116 : : */
117 : : static inline void
860 john.naylor@postgres 118 :CBC 8893549 : fasthash_init(fasthash_state *hs, uint64 seed)
119 : : {
915 120 : 8893549 : memset(hs, 0, sizeof(fasthash_state));
860 121 : 8893549 : hs->hash = seed ^ 0x880355f21e6d1965;
915 122 : 8893549 : }
123 : :
124 : : /* both the finalizer and part of the combining step */
125 : : static inline uint64
126 : 28591731 : fasthash_mix(uint64 h, uint64 tweak)
127 : : {
128 : 28591731 : h ^= (h >> 23) + tweak;
129 : 28591731 : h *= 0x2127599bf4325c37;
130 : 28591731 : h ^= h >> 47;
131 : 28591731 : return h;
132 : : }
133 : :
134 : : /* combine one chunk of input into the hash */
135 : : static inline void
136 : 19698182 : fasthash_combine(fasthash_state *hs)
137 : : {
138 : 19698182 : hs->hash ^= fasthash_mix(hs->accum, 0);
139 : 19698182 : hs->hash *= 0x880355f21e6d1965;
140 : 19698182 : }
141 : :
142 : : /* accumulate up to 8 bytes of input and combine it into the hash */
143 : : static inline void
842 144 : 26643339 : fasthash_accum(fasthash_state *hs, const char *k, size_t len)
145 : : {
146 : : uint32 lower_four;
147 : :
915 148 [ - + ]: 26643339 : Assert(len <= FH_SIZEOF_ACCUM);
784 149 : 26643339 : hs->accum = 0;
150 : :
151 : : /*
152 : : * For consistency, bytewise loads must match the platform's endianness.
153 : : */
154 : : #ifdef WORDS_BIGENDIAN
155 : : switch (len)
156 : : {
157 : : case 8:
158 : : memcpy(&hs->accum, k, 8);
159 : : break;
160 : : case 7:
161 : : hs->accum |= (uint64) k[6] << 8;
162 : : pg_fallthrough;
163 : : case 6:
164 : : hs->accum |= (uint64) k[5] << 16;
165 : : pg_fallthrough;
166 : : case 5:
167 : : hs->accum |= (uint64) k[4] << 24;
168 : : pg_fallthrough;
169 : : case 4:
170 : : memcpy(&lower_four, k, sizeof(lower_four));
171 : : hs->accum |= (uint64) lower_four << 32;
172 : : break;
173 : : case 3:
174 : : hs->accum |= (uint64) k[2] << 40;
175 : : pg_fallthrough;
176 : : case 2:
177 : : hs->accum |= (uint64) k[1] << 48;
178 : : pg_fallthrough;
179 : : case 1:
180 : : hs->accum |= (uint64) k[0] << 56;
181 : : break;
182 : : case 0:
183 : : return;
184 : : }
185 : : #else
915 186 [ + + + + : 26643339 : switch (len)
+ + + + +
- ]
187 : : {
188 : 16696614 : case 8:
189 : 16696614 : memcpy(&hs->accum, k, 8);
190 : 16696614 : break;
191 : 387722 : case 7:
192 : 387722 : hs->accum |= (uint64) k[6] << 48;
193 : : pg_fallthrough;
194 : 542356 : case 6:
195 : 542356 : hs->accum |= (uint64) k[5] << 40;
196 : : pg_fallthrough;
197 : 576588 : case 5:
198 : 576588 : hs->accum |= (uint64) k[4] << 32;
199 : : pg_fallthrough;
200 : 817662 : case 4:
201 : 817662 : memcpy(&lower_four, k, sizeof(lower_four));
202 : 817662 : hs->accum |= lower_four;
203 : 817662 : break;
204 : 551334 : case 3:
205 : 551334 : hs->accum |= (uint64) k[2] << 16;
206 : : pg_fallthrough;
207 : 858222 : case 2:
208 : 858222 : hs->accum |= (uint64) k[1] << 8;
209 : : pg_fallthrough;
210 : 1211940 : case 1:
211 : 1211940 : hs->accum |= (uint64) k[0];
212 : 1211940 : break;
213 : 7917123 : case 0:
214 : 7917123 : return;
215 : : }
216 : : #endif
217 : :
218 : 18726216 : fasthash_combine(hs);
219 : : }
220 : :
221 : : /*
222 : : * Set high bit in lowest byte where the input is zero, from:
223 : : * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
224 : : */
225 : : #define haszero64(v) \
226 : : (((v) - 0x0101010101010101) & ~(v) & 0x8080808080808080)
227 : :
228 : : /*
229 : : * all-purpose workhorse for fasthash_accum_cstring
230 : : */
231 : : static inline size_t
865 232 : 2264762 : fasthash_accum_cstring_unaligned(fasthash_state *hs, const char *str)
233 : : {
234 : 2264762 : const char *const start = str;
235 : :
236 [ + + ]: 5156732 : while (*str)
237 : : {
842 238 : 2891970 : size_t chunk_len = 0;
239 : :
865 240 [ + + + + ]: 17189724 : while (chunk_len < FH_SIZEOF_ACCUM && str[chunk_len] != '\0')
241 : 14297754 : chunk_len++;
242 : :
243 : 2891970 : fasthash_accum(hs, str, chunk_len);
244 : 2891970 : str += chunk_len;
245 : : }
246 : :
247 : 2264762 : return str - start;
248 : : }
249 : :
250 : : /*
251 : : * specialized workhorse for fasthash_accum_cstring
252 : : *
253 : : * With an aligned pointer, we consume the string a word at a time.
254 : : * Loading the word containing the NUL terminator cannot segfault since
255 : : * allocation boundaries are suitably aligned. To keep from setting
256 : : * off alarms with address sanitizers, exclude this function from
257 : : * such testing.
258 : : */
259 : : pg_attribute_no_sanitize_address()
260 : : static inline size_t
261 : 1132381 : fasthash_accum_cstring_aligned(fasthash_state *hs, const char *str)
262 : : {
263 : 1132381 : const char *const start = str;
264 : : size_t remainder;
265 : : uint64 zero_byte_low;
266 : :
267 [ + - ]: 1132381 : Assert(PointerIsAligned(start, uint64));
268 : :
269 : : /*
270 : : * For every chunk of input, check for zero bytes before mixing into the
271 : : * hash. The chunk with zeros must contain the NUL terminator.
272 : : */
273 : : for (;;)
274 : 862368 : {
124 peter@eisentraut.org 275 :GNC 1994749 : uint64 chunk = *(const uint64 *) str;
276 : :
844 john.naylor@postgres 277 :CBC 1994749 : zero_byte_low = haszero64(chunk);
278 [ + + ]: 1994749 : if (zero_byte_low)
865 279 : 1132381 : break;
280 : :
281 : 862368 : hs->accum = chunk;
282 : 862368 : fasthash_combine(hs);
283 : 862368 : str += FH_SIZEOF_ACCUM;
284 : : }
285 : :
286 : : /* mix in remaining bytes */
460 287 : 1132381 : remainder = fasthash_accum_cstring_unaligned(hs, str);
486 288 : 1132381 : str += remainder;
289 : :
865 290 : 1132381 : return str - start;
291 : : }
292 : :
293 : : /*
294 : : * Mix 'str' into the hash state and return the length of the string.
295 : : */
296 : : static inline size_t
297 : 1132381 : fasthash_accum_cstring(fasthash_state *hs, const char *str)
298 : : {
299 : : #if SIZEOF_VOID_P >= 8
300 : :
301 : : size_t len;
302 : : #ifdef USE_ASSERT_CHECKING
303 : : size_t len_check;
304 : : fasthash_state hs_check;
305 : :
306 : 1132381 : memcpy(&hs_check, hs, sizeof(fasthash_state));
307 : 1132381 : len_check = fasthash_accum_cstring_unaligned(&hs_check, str);
308 : : #endif
309 [ + - ]: 1132381 : if (PointerIsAligned(str, uint64))
310 : : {
311 : 1132381 : len = fasthash_accum_cstring_aligned(hs, str);
784 312 [ - + ]: 1132381 : Assert(len_check == len);
313 [ - + ]: 1132381 : Assert(hs_check.hash == hs->hash);
865 314 : 1132381 : return len;
315 : : }
316 : : #endif /* SIZEOF_VOID_P */
317 : :
318 : : /*
319 : : * It's not worth it to try to make the word-at-a-time optimization work
320 : : * on 32-bit platforms.
321 : : */
865 john.naylor@postgres 322 :UBC 0 : return fasthash_accum_cstring_unaligned(hs, str);
323 : : }
324 : :
325 : : /*
326 : : * The finalizer
327 : : *
328 : : * 'tweak' is intended to be the input length when the caller doesn't know
329 : : * the length ahead of time, such as for NUL-terminated strings, otherwise
330 : : * zero.
331 : : */
332 : : static inline uint64
915 john.naylor@postgres 333 :CBC 8893549 : fasthash_final64(fasthash_state *hs, uint64 tweak)
334 : : {
335 : 8893549 : return fasthash_mix(hs->hash, tweak);
336 : : }
337 : :
338 : : /*
339 : : * Reduce a 64-bit hash to a 32-bit hash.
340 : : *
341 : : * This optional step provides a bit more additional mixing compared to
342 : : * just taking the lower 32-bits.
343 : : */
344 : : static inline uint32
345 : 8893549 : fasthash_reduce32(uint64 h)
346 : : {
347 : : /*
348 : : * Convert the 64-bit hashcode to Fermat residue, which shall retain
349 : : * information from both the higher and lower parts of hashcode.
350 : : */
351 : 8893549 : return h - (h >> 32);
352 : : }
353 : :
354 : : /* finalize and reduce */
355 : : static inline uint32
356 : 976426 : fasthash_final32(fasthash_state *hs, uint64 tweak)
357 : : {
358 : 976426 : return fasthash_reduce32(fasthash_final64(hs, tweak));
359 : : }
360 : :
361 : :
362 : : /* Standalone functions */
363 : :
364 : : /*
365 : : * The original fasthash64 function, re-implemented using the incremental
366 : : * interface. Returns the same 64-bit hashcode as the original,
367 : : * at least on little-endian machines. 'len' controls not only how
368 : : * many bytes to hash, but also modifies the internal seed.
369 : : * 'seed' can be zero.
370 : : */
371 : : static inline uint64
842 372 : 7917123 : fasthash64(const char *k, size_t len, uint64 seed)
373 : : {
374 : : fasthash_state hs;
375 : :
860 376 : 7917123 : fasthash_init(&hs, 0);
377 : :
378 : : /* re-initialize the seed according to input length */
379 : 7917123 : hs.hash = seed ^ (len * 0x880355f21e6d1965);
380 : :
915 381 [ + + ]: 23751369 : while (len >= FH_SIZEOF_ACCUM)
382 : : {
383 : 15834246 : fasthash_accum(&hs, k, FH_SIZEOF_ACCUM);
384 : 15834246 : k += FH_SIZEOF_ACCUM;
385 : 15834246 : len -= FH_SIZEOF_ACCUM;
386 : : }
387 : :
388 : 7917123 : fasthash_accum(&hs, k, len);
389 : :
390 : : /*
391 : : * Since we already mixed the input length into the seed, we can just pass
392 : : * zero here. This matches upstream behavior as well.
393 : : */
394 : 7917123 : return fasthash_final64(&hs, 0);
395 : : }
396 : :
397 : : /* like fasthash64, but returns a 32-bit hashcode */
398 : : static inline uint32
842 399 : 7917123 : fasthash32(const char *k, size_t len, uint64 seed)
400 : : {
915 401 : 7917123 : return fasthash_reduce32(fasthash64(k, len, seed));
402 : : }
403 : :
404 : : /*
405 : : * Convenience function for hashing NUL-terminated strings
406 : : *
407 : : * Note: This is faster than, and computes a different result from,
408 : : * "fasthash32(s, strlen(s))"
409 : : */
410 : : static inline uint32
784 411 : 418642 : hash_string(const char *s)
412 : : {
413 : : fasthash_state hs;
414 : : size_t s_len;
415 : :
416 : 418642 : fasthash_init(&hs, 0);
417 : :
418 : : /*
419 : : * Combine string into the hash and save the length for tweaking the final
420 : : * mix.
421 : : */
422 : 418642 : s_len = fasthash_accum_cstring(&hs, s);
423 : :
424 : 418642 : return fasthash_final32(&hs, s_len);
425 : : }
426 : :
427 : : #endif /* HASHFN_UNSTABLE_H */
|