Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * pg_popcount_x86.c
4 : : * Holds the x86-64 pg_popcount() implementations.
5 : : *
6 : : * Copyright (c) 2024-2026, PostgreSQL Global Development Group
7 : : *
8 : : * IDENTIFICATION
9 : : * src/port/pg_popcount_x86.c
10 : : *
11 : : *-------------------------------------------------------------------------
12 : : */
13 : : #include "c.h"
14 : :
15 : : #ifdef HAVE_X86_64_POPCNTQ
16 : :
17 : : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
18 : : #include <immintrin.h>
19 : : #endif
20 : :
21 : : #include "port/pg_bitutils.h"
22 : : #include "port/pg_cpu.h"
23 : :
24 : : /*
25 : : * The SSE4.2 versions are built regardless of whether we are building the
26 : : * AVX-512 versions.
27 : : *
28 : : * Technically, POPCNT is not part of SSE4.2, and isn't even a vector
29 : : * operation, but in practice this is close enough, and "sse42" seems easier to
30 : : * follow than "popcnt" for these names.
31 : : */
32 : : static uint64 pg_popcount_sse42(const char *buf, int bytes);
33 : : static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask);
34 : :
35 : : /*
36 : : * These are the AVX-512 implementations of the popcount functions.
37 : : */
38 : : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
39 : : static uint64 pg_popcount_avx512(const char *buf, int bytes);
40 : : static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
41 : : #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
42 : :
43 : : /*
44 : : * The function pointers are initially set to "choose" functions. These
45 : : * functions will first set the pointers to the right implementations (base on
46 : : * what the current CPU supports) and then will call the pointer to fulfill the
47 : : * caller's request.
48 : : */
49 : : static uint64 pg_popcount_choose(const char *buf, int bytes);
50 : : static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
51 : : uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
52 : : uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
53 : :
54 : :
55 : : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
56 : :
57 : : /*
58 : : * Returns true if the CPU supports the instructions required for the AVX-512
59 : : * pg_popcount() implementation.
60 : : */
61 : : static bool
493 nathan@postgresql.or 62 :GNC 359 : pg_popcount_avx512_available(void)
63 : : {
16 john.naylor@postgres 64 [ - + - - ]: 359 : return x86_feature_available(PG_AVX512_BW) &&
16 john.naylor@postgres 65 :UNC 0 : x86_feature_available(PG_AVX512_VPOPCNTDQ);
66 : : }
67 : :
68 : : #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
69 : :
70 : : /*
71 : : * These functions get called on the first call to pg_popcount(), etc.
72 : : * They detect whether we can use the asm implementations, and replace
73 : : * the function pointers so that subsequent calls are routed directly to
74 : : * the chosen implementation.
75 : : */
76 : : static inline void
53 nathan@postgresql.or 77 :GNC 359 : choose_popcount_functions(void)
78 : : {
16 john.naylor@postgres 79 [ + - ]: 359 : if (x86_feature_available(PG_POPCNT))
80 : : {
53 nathan@postgresql.or 81 : 359 : pg_popcount_optimized = pg_popcount_sse42;
82 : 359 : pg_popcount_masked_optimized = pg_popcount_masked_sse42;
83 : : }
84 : : else
85 : : {
53 nathan@postgresql.or 86 :UNC 0 : pg_popcount_optimized = pg_popcount_portable;
87 : 0 : pg_popcount_masked_optimized = pg_popcount_masked_portable;
88 : : }
89 : :
90 : : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
53 nathan@postgresql.or 91 [ - + ]:GNC 359 : if (pg_popcount_avx512_available())
92 : : {
53 nathan@postgresql.or 93 :UNC 0 : pg_popcount_optimized = pg_popcount_avx512;
94 : 0 : pg_popcount_masked_optimized = pg_popcount_masked_avx512;
95 : : }
96 : : #endif
53 nathan@postgresql.or 97 :GNC 359 : }
98 : :
99 : : static uint64
100 : 27 : pg_popcount_choose(const char *buf, int bytes)
101 : : {
102 : 27 : choose_popcount_functions();
103 : 27 : return pg_popcount_optimized(buf, bytes);
104 : : }
105 : :
106 : : static uint64
107 : 332 : pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
108 : : {
109 : 332 : choose_popcount_functions();
110 : 332 : return pg_popcount_masked(buf, bytes, mask);
111 : : }
112 : :
113 : : #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
114 : :
115 : : /*
116 : : * pg_popcount_avx512
117 : : * Returns the number of 1-bits in buf
118 : : */
119 : : pg_attribute_target("avx512vpopcntdq,avx512bw")
120 : : static uint64
708 nathan@postgresql.or 121 :UNC 0 : pg_popcount_avx512(const char *buf, int bytes)
122 : : {
123 : : __m512i val,
124 : : cnt;
125 : 0 : __m512i accum = _mm512_setzero_si512();
126 : : const char *final;
127 : : int tail_idx;
128 : 0 : __mmask64 mask = ~UINT64CONST(0);
129 : :
130 : : /*
131 : : * Align buffer down to avoid double load overhead from unaligned access.
132 : : * Calculate a mask to ignore preceding bytes. Find start offset of final
133 : : * iteration and ensure it is not empty.
134 : : */
135 : 0 : mask <<= ((uintptr_t) buf) % sizeof(__m512i);
136 : 0 : tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
137 : 0 : final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
138 : 0 : buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
139 : :
140 : : /*
141 : : * Iterate through all but the final iteration. Starting from the second
142 : : * iteration, the mask is ignored.
143 : : */
144 [ # # ]: 0 : if (buf < final)
145 : : {
146 : 0 : val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
147 : 0 : cnt = _mm512_popcnt_epi64(val);
148 : 0 : accum = _mm512_add_epi64(accum, cnt);
149 : :
150 : 0 : buf += sizeof(__m512i);
151 : 0 : mask = ~UINT64CONST(0);
152 : :
153 [ # # ]: 0 : for (; buf < final; buf += sizeof(__m512i))
154 : : {
155 : 0 : val = _mm512_load_si512((const __m512i *) buf);
156 : 0 : cnt = _mm512_popcnt_epi64(val);
157 : 0 : accum = _mm512_add_epi64(accum, cnt);
158 : : }
159 : : }
160 : :
161 : : /* Final iteration needs to ignore bytes that are not within the length */
162 : 0 : mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
163 : :
164 : 0 : val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
165 : 0 : cnt = _mm512_popcnt_epi64(val);
166 : 0 : accum = _mm512_add_epi64(accum, cnt);
167 : :
168 : 0 : return _mm512_reduce_add_epi64(accum);
169 : : }
170 : :
171 : : /*
172 : : * pg_popcount_masked_avx512
173 : : * Returns the number of 1-bits in buf after applying the mask to each byte
174 : : */
175 : : pg_attribute_target("avx512vpopcntdq,avx512bw")
176 : : static uint64
177 : 0 : pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
178 : : {
179 : : __m512i val,
180 : : vmasked,
181 : : cnt;
182 : 0 : __m512i accum = _mm512_setzero_si512();
183 : : const char *final;
184 : : int tail_idx;
185 : 0 : __mmask64 bmask = ~UINT64CONST(0);
670 tgl@sss.pgh.pa.us 186 : 0 : const __m512i maskv = _mm512_set1_epi8(mask);
187 : :
188 : : /*
189 : : * Align buffer down to avoid double load overhead from unaligned access.
190 : : * Calculate a mask to ignore preceding bytes. Find start offset of final
191 : : * iteration and ensure it is not empty.
192 : : */
708 nathan@postgresql.or 193 : 0 : bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
194 : 0 : tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
195 : 0 : final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
196 : 0 : buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
197 : :
198 : : /*
199 : : * Iterate through all but the final iteration. Starting from the second
200 : : * iteration, the mask is ignored.
201 : : */
202 [ # # ]: 0 : if (buf < final)
203 : : {
204 : 0 : val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
205 : 0 : vmasked = _mm512_and_si512(val, maskv);
206 : 0 : cnt = _mm512_popcnt_epi64(vmasked);
207 : 0 : accum = _mm512_add_epi64(accum, cnt);
208 : :
209 : 0 : buf += sizeof(__m512i);
210 : 0 : bmask = ~UINT64CONST(0);
211 : :
212 [ # # ]: 0 : for (; buf < final; buf += sizeof(__m512i))
213 : : {
214 : 0 : val = _mm512_load_si512((const __m512i *) buf);
215 : 0 : vmasked = _mm512_and_si512(val, maskv);
216 : 0 : cnt = _mm512_popcnt_epi64(vmasked);
217 : 0 : accum = _mm512_add_epi64(accum, cnt);
218 : : }
219 : : }
220 : :
221 : : /* Final iteration needs to ignore bytes that are not within the length */
222 : 0 : bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
223 : :
224 : 0 : val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
225 : 0 : vmasked = _mm512_and_si512(val, maskv);
226 : 0 : cnt = _mm512_popcnt_epi64(vmasked);
227 : 0 : accum = _mm512_add_epi64(accum, cnt);
228 : :
229 : 0 : return _mm512_reduce_add_epi64(accum);
230 : : }
231 : :
232 : : #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
233 : :
234 : : /*
235 : : * pg_popcount64_sse42
236 : : * Return the number of 1 bits set in word
237 : : */
238 : : static inline int
53 nathan@postgresql.or 239 :GNC 17911361 : pg_popcount64_sse42(uint64 word)
240 : : {
241 : : #ifdef _MSC_VER
242 : : return __popcnt64(word);
243 : : #else
244 : : uint64 res;
245 : :
246 : 17911361 : __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
247 : 17911361 : return (int) res;
248 : : #endif
249 : : }
250 : :
251 : : /*
252 : : * pg_popcount_sse42
253 : : * Returns the number of 1-bits in buf
254 : : */
255 : : pg_attribute_no_sanitize_alignment()
256 : : static uint64
257 : 11152 : pg_popcount_sse42(const char *buf, int bytes)
258 : : {
259 : 11152 : uint64 popcnt = 0;
31 260 : 11152 : const uint64 *words = (const uint64 *) buf;
261 : :
262 [ + + ]: 318431 : while (bytes >= 8)
263 : : {
264 : 307279 : popcnt += pg_popcount64_sse42(*words++);
265 : 307279 : bytes -= 8;
266 : : }
267 : :
268 : 11152 : buf = (const char *) words;
269 : :
270 : : /* Process any remaining bytes */
53 271 [ + + ]: 11288 : while (bytes--)
272 : 136 : popcnt += pg_number_of_ones[(unsigned char) *buf++];
273 : :
274 : 11152 : return popcnt;
275 : : }
276 : :
277 : : /*
278 : : * pg_popcount_masked_sse42
279 : : * Returns the number of 1-bits in buf after applying the mask to each byte
280 : : */
281 : : pg_attribute_no_sanitize_alignment()
282 : : static uint64
283 : 17242 : pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask)
284 : : {
285 : 17242 : uint64 popcnt = 0;
286 : 17242 : uint64 maskv = ~UINT64CONST(0) / 0xFF * mask;
31 287 : 17242 : const uint64 *words = (const uint64 *) buf;
288 : :
289 [ + + ]: 17621324 : while (bytes >= 8)
290 : : {
291 : 17604082 : popcnt += pg_popcount64_sse42(*words++ & maskv);
292 : 17604082 : bytes -= 8;
293 : : }
294 : :
295 : 17242 : buf = (const char *) words;
296 : :
297 : : /* Process any remaining bytes */
53 298 [ - + ]: 17242 : while (bytes--)
53 nathan@postgresql.or 299 :UNC 0 : popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
300 : :
53 nathan@postgresql.or 301 :GNC 17242 : return popcnt;
302 : : }
303 : :
304 : : #endif /* HAVE_X86_64_POPCNTQ */
|