/*-------------------------------------------------------------------------
 *
 * method_io_uring.c
 *    AIO - perform AIO using Linux's io_uring
 *
 * For now we create one io_uring instance for each backend. These io_uring
 * instances have to be created in the postmaster, during startup, to allow
 * other backends to process IO completions if the issuing backend is
 * currently busy doing other things. Other backends may not use another
 * backend's io_uring instance to submit IO; that would require additional
 * locking that would likely be harmful for performance.
 *
 * We likely will want to introduce a backend-local io_uring instance in the
 * future, e.g. for FE/BE network IO.
 *
 * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/storage/aio/method_io_uring.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

/* included early, for IOMETHOD_IO_URING_ENABLED */
#include "storage/aio.h"

#ifdef IOMETHOD_IO_URING_ENABLED

#include <sys/mman.h>
#include <unistd.h>

#include <liburing.h>

#include "miscadmin.h"
#include "storage/aio_internal.h"
#include "storage/fd.h"
#include "storage/lwlock.h"
#include "storage/proc.h"
#include "storage/procnumber.h"
#include "storage/shmem.h"
#include "utils/wait_event.h"


/* number of completions processed at once */
#define PGAIO_MAX_LOCAL_COMPLETED_IO 32


/* Entry points for IoMethodOps. */
static void pgaio_uring_shmem_request(void *arg);
static void pgaio_uring_shmem_init(void *arg);
static void pgaio_uring_init_backend(void);
static int pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios);
static void pgaio_uring_wait_one(PgAioHandle *ioh, uint64 ref_generation);
static void pgaio_uring_check_one(PgAioHandle *ioh, uint64 ref_generation);

/* helper functions */
static void pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe);


const IoMethodOps pgaio_uring_ops = {
    /*
     * While io_uring is mostly OK with FDs getting closed while the IO is in
     * flight, that is not true for IOs submitted with IOSQE_ASYNC.
     *
     * See
     * https://postgr.es/m/5ons2rtmwarqqhhexb3dnqulw5rjgwgoct57vpdau4rujlrffj%403fls6d2mkiwc
     */
    .wait_on_fd_before_close = true,

    .shmem_callbacks.request_fn = pgaio_uring_shmem_request,
    .shmem_callbacks.init_fn = pgaio_uring_shmem_init,
    .init_backend = pgaio_uring_init_backend,

    .submit = pgaio_uring_submit,
    .wait_one = pgaio_uring_wait_one,
    .check_one = pgaio_uring_check_one,
};
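
/*
 * A rough sketch of how the generic AIO layer drives these callbacks, as a
 * reading aid (derived from the code in this file, not authoritative
 * documentation):
 *
 * - shmem_callbacks.request_fn / init_fn size and initialize one
 *   PgAioUringContext per backend during postmaster startup
 * - init_backend points each backend at its own context
 * - submit hands staged IOs to the backend's ring via io_uring_submit()
 * - wait_one blocks until a specific IO's completion has been processed
 * - check_one opportunistically drains completions without blocking
 */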

/*
 * Per-backend state when using io_method=io_uring
 */
typedef struct PgAioUringContext
{
    /*
     * Align the whole struct to a cacheline boundary, to prevent false
     * sharing between completion_lock and the previous backend's
     * io_uring_ring.
     */
    alignas(PG_CACHE_LINE_SIZE)

    /*
     * Multiple backends can process completions for this backend's io_uring
     * instance (e.g. when the backend issuing IO is busy doing something
     * else). To make that safe we have to ensure that only a single backend
     * gets IO completions from the io_uring instance at a time.
     */
    LWLock completion_lock;

    struct io_uring io_uring_ring;
} PgAioUringContext;

/*
 * Information about the capabilities that io_uring has.
 *
 * Depending on the liburing and kernel versions, different features are
 * supported. At least for the kernel, a version check alone does not
 * suffice, as various vendors backport features to older kernels :(.
 */
typedef struct PgAioUringCaps
{
    bool checked;
    /* -1 if io_uring_queue_init_mem() is unsupported */
    int mem_init_size;
} PgAioUringCaps;


/* PgAioUringContexts for all backends */
static PgAioUringContext *pgaio_uring_contexts;

/* the current backend's context */
static PgAioUringContext *pgaio_my_uring_context;

static PgAioUringCaps pgaio_uring_caps =
{
    .checked = false,
    .mem_init_size = -1,
};

static uint32
pgaio_uring_procs(void)
{
    /*
     * We can subtract MAX_IO_WORKERS here as io workers are never used at
     * the same time as io_method=io_uring.
     */
    return MaxBackends + NUM_AUXILIARY_PROCS - MAX_IO_WORKERS;
}

/*
 * Initializes pgaio_uring_caps, unless that's already done.
 */
static void
pgaio_uring_check_capabilities(void)
{
    if (pgaio_uring_caps.checked)
        return;

    /*
     * By default io_uring creates a shared memory mapping for each io_uring
     * instance, leading to a large number of memory mappings. Unfortunately
     * a large number of memory mappings slows things down; backend exit is
     * particularly affected. To address that, newer kernels (6.5+) support
     * using user-provided memory for the ring; by putting the relevant
     * memory into shared memory we don't need any additional mappings.
     *
     * To know whether this is supported, we unfortunately need to probe the
     * kernel by trying to create a ring with userspace-provided memory. This
     * also has a secondary benefit: We can determine precisely how much
     * memory we need for each io_uring instance.
     */
#if defined(HAVE_IO_URING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
    {
        struct io_uring test_ring;
        size_t ring_size;
        void *ring_ptr;
        struct io_uring_params p = {0};
        int ret;

        /*
         * Liburing does not yet provide an API to query how much memory a
         * ring will need, so we over-estimate it here. As the memory is
         * freed just below, that's a small, temporary waste of memory.
         *
         * 1MB is more than enough for rings within io_max_concurrency's
         * range.
         */
        ring_size = 1024 * 1024;

        /*
         * Hard to believe a system exists where 1MB would not be a multiple
         * of the page size. But it's cheap to ensure...
         */
        ring_size -= ring_size % sysconf(_SC_PAGESIZE);

        ring_ptr = mmap(NULL, ring_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
        if (ring_ptr == MAP_FAILED)
            elog(ERROR,
                 "mmap(%zu) to determine io_uring_queue_init_mem() support failed: %m",
                 ring_size);

        ret = io_uring_queue_init_mem(io_max_concurrency, &test_ring, &p, ring_ptr, ring_size);
        if (ret > 0)
        {
            pgaio_uring_caps.mem_init_size = ret;

            elog(DEBUG1,
                 "can use combined memory mapping for io_uring, each ring needs %d bytes",
                 ret);

            /* clean up the created ring, it was just for a test */
            io_uring_queue_exit(&test_ring);
        }
        else
        {
            /*
             * There are different reasons for ring creation to fail, but
             * it's ok to treat that just as io_uring_queue_init_mem() not
             * being supported. We'll report a more detailed error in
             * pgaio_uring_shmem_init().
             */
            errno = -ret;
            elog(DEBUG1,
                 "cannot use combined memory mapping for io_uring, ring creation failed: %m");
        }

        if (munmap(ring_ptr, ring_size) != 0)
            elog(ERROR, "munmap() failed: %m");
    }
#else
    {
        elog(DEBUG1,
             "can't use combined memory mapping for io_uring, kernel or liburing too old");
    }
#endif

    pgaio_uring_caps.checked = true;
}

/*
 * Memory for all PgAioUringContext instances
 */
static size_t
pgaio_uring_context_shmem_size(void)
{
    return mul_size(pgaio_uring_procs(), sizeof(PgAioUringContext));
}

/*
 * Memory for the combined memory used by io_uring instances. Returns 0 if
 * that is not supported by kernel/liburing.
 */
static size_t
pgaio_uring_ring_shmem_size(void)
{
    size_t sz = 0;

    if (pgaio_uring_caps.mem_init_size > 0)
    {
        /*
         * Memory for rings needs to be aligned to a page boundary, so
         * reserve space for that. Luckily it does not need to be aligned to
         * hugepage boundaries, even if huge pages are used.
         */
        sz = add_size(sz, sysconf(_SC_PAGESIZE));
        sz = add_size(sz, mul_size(pgaio_uring_procs(),
                                   pgaio_uring_caps.mem_init_size));
    }

    return sz;
}

static size_t
pgaio_uring_shmem_size(void)
{
    size_t sz;

    sz = pgaio_uring_context_shmem_size();
    sz = add_size(sz, pgaio_uring_ring_shmem_size());

    return sz;
}

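/*
 * Sketch of the resulting shared memory layout (assuming the kernel supports
 * io_uring_queue_init_mem(); otherwise only the context array is allocated),
 * derived from pgaio_uring_shmem_init() below:
 *
 *  +----------------------------------------+ <- pgaio_uring_contexts
 *  | PgAioUringContext[pgaio_uring_procs()] |
 *  +----------------------------------------+
 *  | padding up to the next page boundary   |
 *  +----------------------------------------+
 *  | ring memory for proc 0                 |
 *  | ring memory for proc 1                 |
 *  | ...                                    |
 *  +----------------------------------------+
 */
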
static void
pgaio_uring_shmem_request(void *arg)
{
    /*
     * Kernel and liburing support for various features influences how much
     * shmem we need; perform the necessary checks.
     */
    pgaio_uring_check_capabilities();

    ShmemRequestStruct(.name = "AioUringContext",
                       .size = pgaio_uring_shmem_size(),
                       .ptr = (void **) &pgaio_uring_contexts,
        );
}

static void
pgaio_uring_shmem_init(void *arg)
{
    int TotalProcs = pgaio_uring_procs();
    char *shmem;
    size_t ring_mem_remain = 0;
    char *ring_mem_next = 0;

    /*
     * We allocate memory for all PgAioUringContext instances and, if
     * supported, the memory required for each of the io_uring instances, in
     * one combined allocation.
     *
     * pgaio_uring_contexts is already set to the base of the allocation.
     */
    shmem = (char *) pgaio_uring_contexts;
    shmem += pgaio_uring_context_shmem_size();

    /* if supported, handle memory alignment / sizing for io_uring memory */
    if (pgaio_uring_caps.mem_init_size > 0)
    {
        ring_mem_remain = pgaio_uring_ring_shmem_size();
        ring_mem_next = shmem;

        /* align to page boundary, see also pgaio_uring_ring_shmem_size() */
        ring_mem_next = (char *) TYPEALIGN(sysconf(_SC_PAGESIZE), ring_mem_next);

        /* account for alignment, i.e. advance shmem to ring_mem_next */
        ring_mem_remain -= ring_mem_next - shmem;
        shmem += ring_mem_next - shmem;

        shmem += ring_mem_remain;
    }

    for (int contextno = 0; contextno < TotalProcs; contextno++)
    {
        PgAioUringContext *context = &pgaio_uring_contexts[contextno];
        int ret;

        /*
         * Right now a high TotalProcs will cause problems in two ways:
         *
         * - RLIMIT_NOFILE needs to be big enough to allow all
         *   io_uring_queue_init() calls to succeed.
         *
         * - RLIMIT_NOFILE needs to be big enough to still have enough file
         *   descriptors left over to satisfy set_max_safe_fds(). Or, even
         *   better, to have max_files_per_process FDs left over.
         *
         * We probably should adjust the soft RLIMIT_NOFILE to ensure that.
         *
         *
         * XXX: Newer versions of io_uring support sharing the workers that
         * execute some asynchronous IOs between io_uring instances. It might
         * be worth using that, but we'd also need to evaluate whether it
         * causes noticeable additional contention.
         */

        /*
         * If supported (cf. pgaio_uring_check_capabilities()), create the
         * ring with its data in shared memory. Otherwise fall back to
         * io_uring creating a memory mapping for each ring.
         */
#if defined(HAVE_IO_URING_QUEUE_INIT_MEM) && defined(IORING_SETUP_NO_MMAP)
        if (pgaio_uring_caps.mem_init_size > 0)
        {
            struct io_uring_params p = {0};

            ret = io_uring_queue_init_mem(io_max_concurrency, &context->io_uring_ring, &p, ring_mem_next, ring_mem_remain);

            ring_mem_remain -= ret;
            ring_mem_next += ret;
        }
        else
#endif
        {
            ret = io_uring_queue_init(io_max_concurrency, &context->io_uring_ring, 0);
        }

        if (ret < 0)
        {
            char *hint = NULL;
            int err = ERRCODE_INTERNAL_ERROR;

            /* add hints for some failures that errno explains sufficiently */
            if (-ret == EPERM)
            {
                err = ERRCODE_INSUFFICIENT_PRIVILEGE;
                hint = _("Check if io_uring is disabled via /proc/sys/kernel/io_uring_disabled.");
            }
            else if (-ret == EMFILE)
            {
                err = ERRCODE_INSUFFICIENT_RESOURCES;
                hint = psprintf(_("Consider increasing \"ulimit -n\" to at least %d."),
                                TotalProcs + max_files_per_process);
            }
            else if (-ret == ENOSYS)
            {
                err = ERRCODE_FEATURE_NOT_SUPPORTED;
                hint = _("The kernel does not support io_uring.");
            }

            /* update errno to allow %m to work */
            errno = -ret;

            ereport(ERROR,
                    errcode(err),
                    errmsg("could not set up io_uring queue: %m"),
                    hint != NULL ? errhint("%s", hint) : 0);
        }

        LWLockInitialize(&context->completion_lock, LWTRANCHE_AIO_URING_COMPLETION);
    }
}

static void
pgaio_uring_init_backend(void)
{
    Assert(MyProcNumber < pgaio_uring_procs());

    pgaio_my_uring_context = &pgaio_uring_contexts[MyProcNumber];
}

static int
pgaio_uring_submit(uint16 num_staged_ios, PgAioHandle **staged_ios)
{
    struct io_uring *uring_instance = &pgaio_my_uring_context->io_uring_ring;

    Assert(num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);

    for (int i = 0; i < num_staged_ios; i++)
    {
        PgAioHandle *ioh = staged_ios[i];
        struct io_uring_sqe *sqe;

        sqe = io_uring_get_sqe(uring_instance);

        if (!sqe)
            elog(ERROR, "io_uring submission queue is unexpectedly full");

        pgaio_io_prepare_submit(ioh);
        pgaio_uring_sq_from_io(ioh, sqe);
    }

    while (true)
    {
        int ret;

        pgstat_report_wait_start(WAIT_EVENT_AIO_IO_URING_SUBMIT);
        ret = io_uring_submit(uring_instance);
        pgstat_report_wait_end();

        if (ret == -EINTR)
        {
            pgaio_debug(DEBUG3,
                        "aio method uring: submit EINTR, nios: %d",
                        num_staged_ios);
        }
        else if (ret < 0)
        {
            /*
             * The io_uring_enter() manpage suggests that the appropriate
             * reaction to EAGAIN is:
             *
             * "The application should wait for some completions and try
             * again"
             *
             * However, it seems unlikely that that would help in our case,
             * as we apply a low limit to the number of outstanding IOs and
             * thus also outstanding completions, making it unlikely that
             * we'd get EAGAIN while the OS is in good working order.
             *
             * Additionally, it would be problematic to just wait here, as
             * our caller might hold critical locks. It'd possibly lead to
             * delaying the crash-restart that seems likely to occur when the
             * kernel is under such heavy memory pressure.
             *
             * Update errno to allow %m to work.
             */
            errno = -ret;
            elog(PANIC, "io_uring submit failed: %m");
        }
        else if (ret != num_staged_ios)
        {
            /* likely unreachable, but if it is, we would need to re-submit */
            elog(PANIC, "io_uring submit submitted only %d of %d",
                 ret, num_staged_ios);
        }
        else
        {
            pgaio_debug(DEBUG4,
                        "aio method uring: submitted %d IOs",
                        num_staged_ios);
            break;
        }
    }

    return num_staged_ios;
}

static void
pgaio_uring_completion_error_callback(void *arg)
{
    ProcNumber owner;
    PGPROC *owner_proc;
    int32 owner_pid;
    PgAioHandle *ioh = arg;

    if (!ioh)
        return;

    /* No need for context if a backend is completing the IO for itself */
    if (ioh->owner_procno == MyProcNumber)
        return;

    owner = ioh->owner_procno;
    owner_proc = GetPGProcByNumber(owner);
    owner_pid = owner_proc->pid;

    errcontext("completing I/O on behalf of process %d", owner_pid);
}

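/*
 * With the callback above installed, an error raised while draining another
 * backend's completions would be reported with an additional context line,
 * along the lines of (the PID is illustrative):
 *
 *  CONTEXT:  completing I/O on behalf of process 12345
 */
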
static void
pgaio_uring_drain_locked(PgAioUringContext *context)
{
    int ready;
    int orig_ready;
    ErrorContextCallback errcallback = {0};

    Assert(LWLockHeldByMeInMode(&context->completion_lock, LW_EXCLUSIVE));

    errcallback.callback = pgaio_uring_completion_error_callback;
    errcallback.previous = error_context_stack;
    error_context_stack = &errcallback;

    /*
     * Don't drain more events than available right now. Otherwise it's
     * plausible that one backend could get stuck, for a while, receiving
     * CQEs without actually processing them.
     */
    orig_ready = ready = io_uring_cq_ready(&context->io_uring_ring);

    while (ready > 0)
    {
        struct io_uring_cqe *cqes[PGAIO_MAX_LOCAL_COMPLETED_IO];
        uint32 ncqes;

        START_CRIT_SECTION();
        ncqes =
            io_uring_peek_batch_cqe(&context->io_uring_ring,
                                    cqes,
                                    Min(PGAIO_MAX_LOCAL_COMPLETED_IO, ready));
        Assert(ncqes <= ready);

        ready -= ncqes;

        for (int i = 0; i < ncqes; i++)
        {
            struct io_uring_cqe *cqe = cqes[i];
            PgAioHandle *ioh = io_uring_cqe_get_data(cqe);
            int result = cqe->res;

            errcallback.arg = ioh;

            io_uring_cqe_seen(&context->io_uring_ring, cqe);

            pgaio_io_process_completion(ioh, result);
            errcallback.arg = NULL;
        }

        END_CRIT_SECTION();

        pgaio_debug(DEBUG3,
                    "drained %d/%d, now expecting %d",
                    ncqes, orig_ready, io_uring_cq_ready(&context->io_uring_ring));
    }

    error_context_stack = errcallback.previous;
}

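/*
 * Note on the drain protocol (summarizing the code below): both
 * pgaio_uring_wait_one() and pgaio_uring_check_one() may drain completions
 * belonging to another backend's ring, which is why they acquire that
 * context's completion_lock before calling pgaio_uring_drain_locked().
 */
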
static void
pgaio_uring_wait_one(PgAioHandle *ioh, uint64 ref_generation)
{
    PgAioHandleState state;
    ProcNumber owner_procno = ioh->owner_procno;
    PgAioUringContext *owner_context = &pgaio_uring_contexts[owner_procno];
    bool expect_cqe;
    int waited = 0;

    /*
     * XXX: It would be nice to have a smarter locking scheme; nearly all the
     * time the backend owning the ring will consume the completions, making
     * the locking unnecessarily expensive.
     */
    LWLockAcquire(&owner_context->completion_lock, LW_EXCLUSIVE);

    while (true)
    {
        pgaio_debug_io(DEBUG3, ioh,
                       "wait_one io_gen: %" PRIu64 ", ref_gen: %" PRIu64 ", cycle %d",
                       ioh->generation,
                       ref_generation,
                       waited);

        if (pgaio_io_was_recycled(ioh, ref_generation, &state) ||
            state != PGAIO_HS_SUBMITTED)
        {
            /* the IO was completed by another backend */
            break;
        }
        else if (io_uring_cq_ready(&owner_context->io_uring_ring))
        {
            /* no need to wait in the kernel, io_uring has a completion */
            expect_cqe = true;
        }
        else
        {
            int ret;
            struct io_uring_cqe *cqes;

            /* need to wait in the kernel */
            pgstat_report_wait_start(WAIT_EVENT_AIO_IO_URING_EXECUTION);
            ret = io_uring_wait_cqes(&owner_context->io_uring_ring, &cqes, 1, NULL, NULL);
            pgstat_report_wait_end();

            if (ret == -EINTR)
            {
                continue;
            }
            else if (ret != 0)
            {
                /* see comment after io_uring_submit() */
                errno = -ret;
                elog(PANIC, "io_uring wait failed: %m");
            }
            else
            {
                Assert(cqes != NULL);
                expect_cqe = true;
                waited++;
            }
        }

        if (expect_cqe)
        {
            pgaio_uring_drain_locked(owner_context);
        }
    }

    LWLockRelease(&owner_context->completion_lock);

    pgaio_debug(DEBUG3,
                "wait_one with %d sleeps",
                waited);
}

static void
pgaio_uring_check_one(PgAioHandle *ioh, uint64 ref_generation)
{
    ProcNumber owner_procno = ioh->owner_procno;
    PgAioUringContext *owner_context = &pgaio_uring_contexts[owner_procno];

    /*
     * This check is not reliable when not holding the completion lock, but
     * it's a useful cheap pre-check to see if it's worth trying to get the
     * completion lock.
     */
    if (!io_uring_cq_ready(&owner_context->io_uring_ring))
        return;

    /*
     * If the completion lock is currently held, the holder will likely
     * process any pending completions; give up.
     */
    if (!LWLockConditionalAcquire(&owner_context->completion_lock, LW_EXCLUSIVE))
        return;

    pgaio_debug_io(DEBUG3, ioh,
                   "check_one io_gen: %" PRIu64 ", ref_gen: %" PRIu64,
                   ioh->generation,
                   ref_generation);

    /*
     * Recheck if there are any completions; another backend could have
     * processed them since we checked above, or our unlocked pre-check could
     * have been reading outdated values.
     *
     * It is possible that the IO handle has been reused since the start of
     * the call, but now that we have the lock, we can just as well drain all
     * completions.
     */
    if (io_uring_cq_ready(&owner_context->io_uring_ring))
        pgaio_uring_drain_locked(owner_context);

    LWLockRelease(&owner_context->completion_lock);
}

/*
 * io_uring executes IO in process context if possible. That's generally
 * good, as it reduces context switching. When performing a lot of buffered
 * IO, that means that copying between the page cache and userspace memory
 * happens in the foreground, as it can't be offloaded to DMA hardware as is
 * possible when using direct IO. When executing a lot of buffered IO this
 * causes io_uring to be slower than worker mode, as worker mode parallelizes
 * the copying. io_uring can be told to offload work to worker threads
 * instead.
 *
 * If the IOs are small, we only benefit from forcing things into the
 * background if there is a lot of IO, as otherwise the overhead from context
 * switching is higher than the gain.
 *
 * If IOs are large, there is a benefit from asynchronous processing at lower
 * queue depths, as IO latency is less of a crucial factor and parallelizing
 * memory copies is more important. In addition, it is important to trigger
 * asynchronous processing even at low queue depth, as with foreground
 * processing we might never actually reach deep enough IO depths to trigger
 * asynchronous processing, which in turn would deprive readahead control
 * logic of information about whether a deeper look-ahead distance would be
 * advantageous.
 *
 * We have done some basic benchmarking to validate the thresholds used, but
 * it's quite plausible that there are better values. See
 * https://postgr.es/m/3gkuvs3lz3u3skuaxfkxnsysfqslf2srigl6546vhesekve6v2%40va3r5esummvg
 * for some details of this benchmarking.
 */
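/*
 * In short, for reads (a summary of the heuristic implemented below):
 *
 *  direct IO              -> never forced, the kernel already executes
 *                            it asynchronously during submission
 *  in-flight IOs > 4      -> force IOSQE_ASYNC
 *  io_size >= 4 * BLCKSZ  -> force IOSQE_ASYNC
 *  otherwise              -> submit in the foreground
 */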
static bool
pgaio_uring_should_use_async(PgAioHandle *ioh, size_t io_size)
{
    /*
     * With DIO there's no benefit from forcing asynchronous processing, as
     * io_uring will never execute direct IO synchronously during submission.
     */
    if (!(ioh->flags & PGAIO_HF_BUFFERED))
        return false;

    /*
     * Once the IO queue depth is not that shallow anymore, the overhead of
     * dispatching to the background is a less significant factor.
     */
    if (dclist_count(&pgaio_my_backend->in_flight_ios) > 4)
        return true;

    /*
     * If the IO is larger, the gains from parallelizing the memory copy are
     * larger and typically the impact of the latency is smaller.
     */
    if (io_size >= (BLCKSZ * 4))
        return true;

    return false;
}

static void
pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe)
{
    struct iovec *iov;
    size_t io_size = 0;

    switch ((PgAioOp) ioh->op)
    {
        case PGAIO_OP_READV:
            iov = &pgaio_ctl->iovecs[ioh->iovec_off];
            if (ioh->op_data.read.iov_length == 1)
            {
                io_uring_prep_read(sqe,
                                   ioh->op_data.read.fd,
                                   iov->iov_base,
                                   iov->iov_len,
                                   ioh->op_data.read.offset);

                io_size = iov->iov_len;
            }
            else
            {
                io_uring_prep_readv(sqe,
                                    ioh->op_data.read.fd,
                                    iov,
                                    ioh->op_data.read.iov_length,
                                    ioh->op_data.read.offset);

                for (int i = 0; i < ioh->op_data.read.iov_length; i++, iov++)
                    io_size += iov->iov_len;
            }

            if (pgaio_uring_should_use_async(ioh, io_size))
                io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);

            break;

        case PGAIO_OP_WRITEV:
            iov = &pgaio_ctl->iovecs[ioh->iovec_off];
            if (ioh->op_data.write.iov_length == 1)
            {
                io_uring_prep_write(sqe,
                                    ioh->op_data.write.fd,
                                    iov->iov_base,
                                    iov->iov_len,
                                    ioh->op_data.write.offset);
            }
            else
            {
                io_uring_prep_writev(sqe,
                                     ioh->op_data.write.fd,
                                     iov,
                                     ioh->op_data.write.iov_length,
                                     ioh->op_data.write.offset);
            }

            /*
             * For now don't trigger use of IOSQE_ASYNC for writes; it's not
             * clear there is a performance benefit in doing so.
             */

            break;

        case PGAIO_OP_INVALID:
            elog(ERROR, "trying to prepare invalid IO operation for execution");
    }

    io_uring_sqe_set_data(sqe, ioh);
}

#endif                          /* IOMETHOD_IO_URING_ENABLED */