Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * aio.c
4 : : * AIO - Core Logic
5 : : *
6 : : * For documentation about how AIO works on a higher level, including a
7 : : * schematic example, see README.md.
8 : : *
9 : : *
10 : : * AIO is a complicated subsystem. To keep things navigable, it is split
11 : : * across a number of files:
12 : : *
13 : : * - method_*.c - different ways of executing AIO (e.g. worker process)
14 : : *
15 : : * - aio_target.c - IO on different kinds of targets
16 : : *
17 : : * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
18 : : *
19 : : * - aio_callback.c - callbacks at IO operation lifecycle events
20 : : *
21 : : * - aio_init.c - per-server and per-backend initialization
22 : : *
23 : : * - aio.c - all other topics
24 : : *
25 : : * - read_stream.c - helper for reading buffered relation data
26 : : *
27 : : * - README.md - higher-level overview over AIO
28 : : *
29 : : *
30 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
31 : : * Portions Copyright (c) 1994, Regents of the University of California
32 : : *
33 : : * IDENTIFICATION
34 : : * src/backend/storage/aio/aio.c
35 : : *
36 : : *-------------------------------------------------------------------------
37 : : */
38 : :
39 : : #include "postgres.h"
40 : :
41 : : #include "lib/ilist.h"
42 : : #include "miscadmin.h"
43 : : #include "port/atomics.h"
44 : : #include "storage/aio.h"
45 : : #include "storage/aio_internal.h"
46 : : #include "storage/aio_subsys.h"
47 : : #include "utils/guc.h"
48 : : #include "utils/guc_hooks.h"
49 : : #include "utils/injection_point.h"
50 : : #include "utils/resowner.h"
51 : : #include "utils/wait_event_types.h"
52 : :
53 : :
54 : : static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state);
55 : : static void pgaio_io_reclaim(PgAioHandle *ioh);
56 : : static void pgaio_io_resowner_register(PgAioHandle *ioh);
57 : : static void pgaio_io_wait_for_free(void);
58 : : static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
59 : : static const char *pgaio_io_state_get_name(PgAioHandleState s);
60 : : static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
61 : :
62 : :
63 : : /* Options for io_method. */
64 : : const struct config_enum_entry io_method_options[] = {
65 : : {"sync", IOMETHOD_SYNC, false},
66 : : {"worker", IOMETHOD_WORKER, false},
67 : : #ifdef IOMETHOD_IO_URING_ENABLED
68 : : {"io_uring", IOMETHOD_IO_URING, false},
69 : : #endif
70 : : {NULL, 0, false}
71 : : };
72 : :
73 : : /* GUCs */
74 : : int io_method = DEFAULT_IO_METHOD;
75 : : int io_max_concurrency = -1;
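/*
 * Illustrative configuration sketch (editorial addition, not part of aio.c):
 * the two GUCs above are normally set in postgresql.conf, e.g.:
 *
 *		io_method = worker          -- or 'sync'; 'io_uring' if built with liburing
 *		io_max_concurrency = -1     -- -1 requests auto-tuning at server start
 *
 * The io_uring value is only accepted when the server was built with
 * IOMETHOD_IO_URING_ENABLED; -1 for io_max_concurrency is resolved later
 * during startup (see check_io_max_concurrency() below).
 */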
76 : :
77 : : /* global control for AIO */
78 : : PgAioCtl *pgaio_ctl;
79 : :
80 : : /* current backend's per-backend state */
81 : : PgAioBackend *pgaio_my_backend;
82 : :
83 : :
84 : : static const IoMethodOps *const pgaio_method_ops_table[] = {
85 : : [IOMETHOD_SYNC] = &pgaio_sync_ops,
86 : : [IOMETHOD_WORKER] = &pgaio_worker_ops,
87 : : #ifdef IOMETHOD_IO_URING_ENABLED
88 : : [IOMETHOD_IO_URING] = &pgaio_uring_ops,
89 : : #endif
90 : : };
91 : :
92 : : /* callbacks for the configured io_method, set by assign_io_method */
93 : : const IoMethodOps *pgaio_method_ops;
94 : :
95 : :
96 : : /* --------------------------------------------------------------------------------
97 : : * Public Functions related to PgAioHandle
98 : : * --------------------------------------------------------------------------------
99 : : */
100 : :
101 : : /*
102 : : * Acquire an AioHandle, waiting for IO completion if necessary.
103 : : *
104 : : * Each backend can only have one AIO handle that has been "handed out" to
105 : : * code, but not yet submitted or released. This restriction is necessary to
106 : : * ensure that it is possible for code to wait for an unused handle by waiting
107 : : * for in-flight IO to complete. There is a limited number of handles in each
108 : : * backend; if multiple handles could be handed out without being submitted,
109 : : * waiting for all in-flight IO to complete would not guarantee that handles
110 : : * free up.
111 : : *
112 : : * It is cheap to acquire an IO handle, unless all handles are in use. In that
113 : : * case this function waits for the oldest IO to complete. If that is not
114 : : * desirable, use pgaio_io_acquire_nb().
115 : : *
116 : : * If a handle was acquired but then does not turn out to be needed,
117 : : * e.g. because pgaio_io_acquire() is called before starting an IO in a
118 : : * critical section, the handle needs to be released with pgaio_io_release().
119 : : *
120 : : *
121 : : * To react to the completion of the IO as soon as it is known to have
122 : : * completed, callbacks can be registered with pgaio_io_register_callbacks().
123 : : *
124 : : * To actually execute IO using the returned handle, the pgaio_io_start_*()
125 : : * family of functions is used. In many cases the pgaio_io_start_*() call will
126 : : * not be done directly by code that acquired the handle, but by lower level
127 : : * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
128 : : * AIO, it typically will pass the handle to smgr.c, which will pass it on to
129 : : * md.c, on to fd.c, which then finally calls pgaio_io_start_*(). This
130 : : * forwarding allows the various layers to react to the IO's completion by
131 : : * registering callbacks. These callbacks in turn can translate a lower
132 : : * layer's result into a result understandable by a higher layer.
133 : : *
134 : : * During pgaio_io_start_*() the IO is staged (i.e. prepared for execution but
135 : : * not submitted to the kernel). Unless in batchmode
136 : : * (c.f. pgaio_enter_batchmode()), the IO will also get submitted for
137 : : * execution. Note that, whether in batchmode or not, the IO might even
138 : : * complete before the functions return.
139 : : *
140 : : * After pgaio_io_start_*() the AioHandle is "consumed" and may not be
141 : : * referenced by the IO issuing code. To e.g. wait for IO, references to the
142 : : * IO can be established with pgaio_io_get_wref() *before* pgaio_io_start_*()
143 : : * is called. pgaio_wref_wait() can be used to wait for the IO to complete.
144 : : *
145 : : *
146 : : * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
147 : : * passed to pgaio_io_acquire(). Once the issuing backend has called
148 : : * pgaio_wref_wait(), the PgAioReturn contains information about whether the
149 : : * operation succeeded and details about the first failure, if any. The error
150 : : * can be raised / logged with pgaio_result_report().
151 : : *
152 : : * The lifetime of the memory pointed to by *ret needs to be at least as long
153 : : * as the passed in resowner. If the resowner releases resources before the IO
154 : : * completes (typically due to an error), the reference to *ret will be
155 : : * cleared. In case of resowner cleanup *ret will not be updated with the
156 : : * results of the IO operation.
157 : : */
158 : : PgAioHandle *
173 andres@anarazel.de 159 :CBC 3565 : pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
160 : : {
161 : : PgAioHandle *h;
162 : :
163 : : while (true)
164 : : {
165 : 6980 : h = pgaio_io_acquire_nb(resowner, ret);
166 : :
167 [ + + ]: 6977 : if (h != NULL)
168 : 3562 : return h;
169 : :
170 : : /*
171 : : * Evidently all handles by this backend are in use. Just wait for
172 : : * some to complete.
173 : : */
174 : 3415 : pgaio_io_wait_for_free();
175 : : }
176 : : }
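/*
 * Illustrative usage sketch (editorial addition, not part of aio.c) of the
 * lifecycle described above pgaio_io_acquire(). start_read_via_smgr() is a
 * hypothetical stand-in for the lower-level code that eventually calls
 * pgaio_io_start_*(); the pgaio_result_report() call is indicative only:
 *
 *		PgAioReturn ioret;
 *		PgAioWaitRef iow;
 *		PgAioHandle *ioh;
 *
 *		ioh = pgaio_io_acquire(CurrentResourceOwner, &ioret);
 *
 *		pgaio_io_get_wref(ioh, &iow);	-- before the handle is consumed
 *		start_read_via_smgr(ioh, ...);	-- lower layers register callbacks
 *										-- and call pgaio_io_start_*()
 *
 *		pgaio_wref_wait(&iow);			-- ioret is filled in once complete
 *		if (ioret.result.status != PGAIO_RS_OK)
 *			pgaio_result_report(...);	-- raise / log the first failure
 */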
177 : :
178 : : /*
179 : : * Acquire an AioHandle, returning NULL if no handles are free.
180 : : *
181 : : * See pgaio_io_acquire(). The only difference is that this function will return
182 : : * NULL if there are no idle handles, instead of blocking.
183 : : */
184 : : PgAioHandle *
185 : 1254368 : pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
186 : : {
110 187 : 1254368 : PgAioHandle *ioh = NULL;
188 : :
173 189 [ - + ]: 1254368 : if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE)
190 : : {
173 andres@anarazel.de 191 [ # # ]:UBC 0 : Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE);
192 : 0 : pgaio_submit_staged();
193 : : }
194 : :
173 andres@anarazel.de 195 [ + + ]:CBC 1254368 : if (pgaio_my_backend->handed_out_io)
196 [ + - ]: 3 : elog(ERROR, "API violation: Only one IO can be handed out");
197 : :
198 : : /*
199 : : * Probably not needed today, as interrupts should not process this IO,
200 : : * but...
201 : : */
110 202 : 1254365 : HOLD_INTERRUPTS();
203 : :
173 204 [ + + ]: 1254365 : if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
205 : : {
206 : 1247535 : dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios);
207 : :
110 208 : 1247535 : ioh = dclist_container(PgAioHandle, node, ion);
209 : :
173 210 [ - + ]: 1247535 : Assert(ioh->state == PGAIO_HS_IDLE);
211 [ - + ]: 1247535 : Assert(ioh->owner_procno == MyProcNumber);
212 : :
213 : 1247535 : pgaio_io_update_state(ioh, PGAIO_HS_HANDED_OUT);
214 : 1247535 : pgaio_my_backend->handed_out_io = ioh;
215 : :
216 [ + - ]: 1247535 : if (resowner)
217 : 1247535 : pgaio_io_resowner_register(ioh);
218 : :
219 [ + + ]: 1247535 : if (ret)
220 : : {
221 : 1247496 : ioh->report_return = ret;
168 222 : 1247496 : ret->result.status = PGAIO_RS_UNKNOWN;
223 : : }
224 : : }
225 : :
110 226 [ - + ]: 1254365 : RESUME_INTERRUPTS();
227 : :
228 : 1254365 : return ioh;
229 : : }
230 : :
231 : : /*
232 : : * Release IO handle that turned out to not be required.
233 : : *
234 : : * See pgaio_io_acquire() for more details.
235 : : */
236 : : void
173 237 : 1959 : pgaio_io_release(PgAioHandle *ioh)
238 : : {
239 [ + + ]: 1959 : if (ioh == pgaio_my_backend->handed_out_io)
240 : : {
241 [ - + ]: 1956 : Assert(ioh->state == PGAIO_HS_HANDED_OUT);
242 [ - + ]: 1956 : Assert(ioh->resowner);
243 : :
244 : 1956 : pgaio_my_backend->handed_out_io = NULL;
245 : :
246 : : /*
247 : : * Note that no interrupts are processed between the handed_out_io
248 : : * check and the call to reclaim - that's important as otherwise an
249 : : * interrupt could have already reclaimed the handle.
250 : : */
251 : 1956 : pgaio_io_reclaim(ioh);
252 : : }
253 : : else
254 : : {
255 [ + - ]: 3 : elog(ERROR, "release in unexpected state");
256 : : }
257 : 1956 : }
258 : :
259 : : /*
260 : : * Release IO handle during resource owner cleanup.
261 : : */
262 : : void
263 : 67 : pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
264 : : {
265 : 67 : PgAioHandle *ioh = dlist_container(PgAioHandle, resowner_node, ioh_node);
266 : :
267 [ - + ]: 67 : Assert(ioh->resowner);
268 : :
269 : : /*
270 : : * Otherwise an interrupt, in the middle of releasing the IO, could end up
271 : : * trying to wait for the IO, leading to state confusion.
272 : : */
110 273 : 67 : HOLD_INTERRUPTS();
274 : :
173 275 : 67 : ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
276 : 67 : ioh->resowner = NULL;
277 : :
10 278 [ - + - + : 67 : switch ((PgAioHandleState) ioh->state)
- ]
279 : : {
173 andres@anarazel.de 280 :UBC 0 : case PGAIO_HS_IDLE:
281 [ # # ]: 0 : elog(ERROR, "unexpected");
282 : : break;
173 andres@anarazel.de 283 :CBC 42 : case PGAIO_HS_HANDED_OUT:
284 [ - + - - ]: 42 : Assert(ioh == pgaio_my_backend->handed_out_io || pgaio_my_backend->handed_out_io == NULL);
285 : :
286 [ + - ]: 42 : if (ioh == pgaio_my_backend->handed_out_io)
287 : : {
288 : 42 : pgaio_my_backend->handed_out_io = NULL;
289 [ + + ]: 42 : if (!on_error)
290 [ + - ]: 15 : elog(WARNING, "leaked AIO handle");
291 : : }
292 : :
293 : 42 : pgaio_io_reclaim(ioh);
294 : 42 : break;
173 andres@anarazel.de 295 :UBC 0 : case PGAIO_HS_DEFINED:
296 : : case PGAIO_HS_STAGED:
297 [ # # ]: 0 : if (!on_error)
298 [ # # ]: 0 : elog(WARNING, "AIO handle was not submitted");
299 : 0 : pgaio_submit_staged();
300 : 0 : break;
173 andres@anarazel.de 301 :CBC 25 : case PGAIO_HS_SUBMITTED:
302 : : case PGAIO_HS_COMPLETED_IO:
303 : : case PGAIO_HS_COMPLETED_SHARED:
304 : : case PGAIO_HS_COMPLETED_LOCAL:
305 : : /* this is expected to happen */
306 : 25 : break;
307 : : }
308 : :
309 : : /*
310 : : * Need to unregister the reporting of the IO's result; the memory it's
311 : : * referencing has likely gone away.
312 : : */
313 [ + + ]: 67 : if (ioh->report_return)
314 : 25 : ioh->report_return = NULL;
315 : :
110 316 [ - + ]: 67 : RESUME_INTERRUPTS();
173 317 : 67 : }
318 : :
319 : : /*
320 : : * Add a [set of] flags to the IO.
321 : : *
322 : : * Note that this combines the passed-in flags with the flags already set,
323 : : * rather than replacing the existing flags outright. This allows multiple
324 : : * callsites to set flags.
325 : : */
326 : : void
327 : 2489607 : pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
328 : : {
329 [ - + ]: 2489607 : Assert(ioh->state == PGAIO_HS_HANDED_OUT);
330 : :
331 : 2489607 : ioh->flags |= flag;
332 : 2489607 : }
333 : :
334 : : /*
335 : : * Returns an ID uniquely identifying the IO handle. This is only really
336 : : * useful for logging, as handles are reused across multiple IOs.
337 : : */
338 : : int
339 : 584585 : pgaio_io_get_id(PgAioHandle *ioh)
340 : : {
341 [ + - - + ]: 584585 : Assert(ioh >= pgaio_ctl->io_handles &&
342 : : ioh < (pgaio_ctl->io_handles + pgaio_ctl->io_handle_count));
343 : 584585 : return ioh - pgaio_ctl->io_handles;
344 : : }
345 : :
346 : : /*
347 : : * Return the ProcNumber for the process that can use an IO handle. The
348 : : * mapping from IO handles to PGPROCs is static, so this even works
349 : : * when the corresponding PGPROC is not in use.
350 : : */
351 : : ProcNumber
352 : 1826 : pgaio_io_get_owner(PgAioHandle *ioh)
353 : : {
354 : 1826 : return ioh->owner_procno;
355 : : }
356 : :
357 : : /*
358 : : * Return a wait reference for the IO. Only wait references can be used to
359 : : * wait for an IO's completion, as handles themselves can be reused after
360 : : * completion. See also the comment above pgaio_io_acquire().
361 : : */
362 : : void
363 : 2491089 : pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
364 : : {
365 [ + + - + : 2491089 : Assert(ioh->state == PGAIO_HS_HANDED_OUT ||
- - ]
366 : : ioh->state == PGAIO_HS_DEFINED ||
367 : : ioh->state == PGAIO_HS_STAGED);
368 [ - + ]: 2491089 : Assert(ioh->generation != 0);
369 : :
370 : 2491089 : iow->aio_index = ioh - pgaio_ctl->io_handles;
371 : 2491089 : iow->generation_upper = (uint32) (ioh->generation >> 32);
372 : 2491089 : iow->generation_lower = (uint32) ioh->generation;
373 : 2491089 : }
374 : :
375 : :
376 : :
377 : : /* --------------------------------------------------------------------------------
378 : : * Internal Functions related to PgAioHandle
379 : : * --------------------------------------------------------------------------------
380 : : */
381 : :
382 : : static inline void
383 : 9761374 : pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
384 : : {
385 : : /*
386 : : * All callers need to have held interrupts in some form, otherwise
387 : : * interrupt processing could wait for the IO to complete, while in an
388 : : * intermediary state.
389 : : */
110 390 [ + + - + : 9761374 : Assert(!INTERRUPTS_CAN_BE_PROCESSED());
- - ]
391 : :
173 392 [ - + ]: 9761374 : pgaio_debug_io(DEBUG5, ioh,
393 : : "updating state to %s",
394 : : pgaio_io_state_get_name(new_state));
395 : :
396 : : /*
397 : : * Ensure the changes signified by the new state are visible before the
398 : : * new state becomes visible.
399 : : */
400 : 9761374 : pg_write_barrier();
401 : :
402 : 9761374 : ioh->state = new_state;
403 : 9761374 : }
404 : :
405 : : static void
406 : 1247535 : pgaio_io_resowner_register(PgAioHandle *ioh)
407 : : {
408 [ - + ]: 1247535 : Assert(!ioh->resowner);
409 [ - + ]: 1247535 : Assert(CurrentResourceOwner);
410 : :
411 : 1247535 : ResourceOwnerRememberAioHandle(CurrentResourceOwner, &ioh->resowner_node);
412 : 1247535 : ioh->resowner = CurrentResourceOwner;
413 : 1247535 : }
414 : :
415 : : /*
416 : : * Stage IO for execution and, if appropriate, submit it immediately.
417 : : *
418 : : * Should only be called from pgaio_io_start_*().
419 : : */
420 : : void
421 : 1245537 : pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
422 : : {
423 : : bool needs_synchronous;
424 : :
425 [ - + ]: 1245537 : Assert(ioh->state == PGAIO_HS_HANDED_OUT);
426 [ - + ]: 1245537 : Assert(pgaio_my_backend->handed_out_io == ioh);
427 [ - + ]: 1245537 : Assert(pgaio_io_has_target(ioh));
428 : :
429 : : /*
430 : : * Otherwise an interrupt, in the middle of staging and possibly executing
431 : : * the IO, could end up trying to wait for the IO, leading to state
432 : : * confusion.
433 : : */
110 434 : 1245537 : HOLD_INTERRUPTS();
435 : :
173 436 : 1245537 : ioh->op = op;
437 : 1245537 : ioh->result = 0;
438 : :
439 : 1245537 : pgaio_io_update_state(ioh, PGAIO_HS_DEFINED);
440 : :
441 : : /* allow a new IO to be staged */
442 : 1245537 : pgaio_my_backend->handed_out_io = NULL;
443 : :
444 : 1245537 : pgaio_io_call_stage(ioh);
445 : :
446 : 1245537 : pgaio_io_update_state(ioh, PGAIO_HS_STAGED);
447 : :
448 : : /*
449 : : * Synchronous execution has to be executed, well, synchronously, so check
450 : : * that first.
451 : : */
452 : 1245537 : needs_synchronous = pgaio_io_needs_synchronous_execution(ioh);
453 : :
454 [ + + ]: 1245537 : pgaio_debug_io(DEBUG3, ioh,
455 : : "staged (synchronous: %d, in_batch: %d)",
456 : : needs_synchronous, pgaio_my_backend->in_batchmode);
457 : :
458 [ + + ]: 1245537 : if (!needs_synchronous)
459 : : {
460 : 552253 : pgaio_my_backend->staged_ios[pgaio_my_backend->num_staged_ios++] = ioh;
461 [ - + ]: 552253 : Assert(pgaio_my_backend->num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
462 : :
463 : : /*
464 : : * Unless code explicitly opted into batching IOs, submit the IO
465 : : * immediately.
466 : : */
467 [ + + ]: 552253 : if (!pgaio_my_backend->in_batchmode)
468 : 26650 : pgaio_submit_staged();
469 : : }
470 : : else
471 : : {
472 : 693284 : pgaio_io_prepare_submit(ioh);
473 : 693284 : pgaio_io_perform_synchronously(ioh);
474 : : }
475 : :
110 476 [ - + ]: 1245537 : RESUME_INTERRUPTS();
173 477 : 1245537 : }
478 : :
479 : : bool
480 : 1245537 : pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
481 : : {
482 : : /*
483 : : * If the caller said to execute the IO synchronously, do so.
484 : : *
485 : : * XXX: We could optimize the logic when to execute synchronously by first
486 : : * checking if there are other IOs in flight and only synchronously
487 : : * executing if not. Unclear whether that'll be sufficiently common to be
488 : : * worth worrying about.
489 : : */
490 [ + + ]: 1245537 : if (ioh->flags & PGAIO_HF_SYNCHRONOUS)
491 : 688671 : return true;
492 : :
493 : : /* Check if the IO method requires synchronous execution of IO */
494 [ + + ]: 556866 : if (pgaio_method_ops->needs_synchronous_execution)
495 : 556474 : return pgaio_method_ops->needs_synchronous_execution(ioh);
496 : :
497 : 392 : return false;
498 : : }
499 : :
500 : : /*
501 : : * Handle IO being processed by IO method.
502 : : *
503 : : * Should be called by IO methods / synchronous IO execution, just before the
504 : : * IO is performed.
505 : : */
506 : : void
507 : 1245537 : pgaio_io_prepare_submit(PgAioHandle *ioh)
508 : : {
509 : 1245537 : pgaio_io_update_state(ioh, PGAIO_HS_SUBMITTED);
510 : :
511 : 1245537 : dclist_push_tail(&pgaio_my_backend->in_flight_ios, &ioh->node);
512 : 1245537 : }
513 : :
514 : : /*
515 : : * Handle IO getting completed by a method.
516 : : *
517 : : * Should be called by IO methods / synchronous IO execution, just after the
518 : : * IO has been performed.
519 : : *
520 : : * Expects to be called in a critical section. We expect IOs to be usable for
521 : : * WAL etc, which requires being able to execute completion callbacks in a
522 : : * critical section.
523 : : */
524 : : void
525 : 1142078 : pgaio_io_process_completion(PgAioHandle *ioh, int result)
526 : : {
527 [ - + ]: 1142078 : Assert(ioh->state == PGAIO_HS_SUBMITTED);
528 : :
529 [ - + ]: 1142078 : Assert(CritSectionCount > 0);
530 : :
531 : 1142078 : ioh->result = result;
532 : :
533 : 1142078 : pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_IO);
534 : :
535 : : INJECTION_POINT("aio-process-completion-before-shared", ioh);
536 : :
537 : 1142078 : pgaio_io_call_complete_shared(ioh);
538 : :
539 : 1142078 : pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_SHARED);
540 : :
541 : : /* condition variable broadcast ensures state is visible before wakeup */
542 : 1142078 : ConditionVariableBroadcast(&ioh->cv);
543 : :
544 : : /* contains call to pgaio_io_call_complete_local() */
545 [ + + ]: 1142078 : if (ioh->owner_procno == MyProcNumber)
546 : 693673 : pgaio_io_reclaim(ioh);
547 : 1142078 : }
548 : :
549 : : /*
550 : : * Has the IO completed and thus the IO handle been reused?
551 : : *
552 : : * This is useful when waiting for IO completion at a low level (e.g. in an IO
553 : : * method's ->wait_one() callback).
554 : : */
555 : : bool
556 : 2905912 : pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
557 : : {
558 : 2905912 : *state = ioh->state;
559 : :
560 : : /*
561 : : * Ensure that we don't see an earlier state of the handle than ioh->state
562 : : * due to compiler or CPU reordering. This protects both ->generation as
563 : : * directly used here, and other fields in the handle accessed in the
564 : : * caller if the handle was not reused.
565 : : */
566 : 2905912 : pg_read_barrier();
567 : :
568 : 2905912 : return ioh->generation != ref_generation;
569 : : }
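/*
 * Illustrative sketch (editorial addition, not part of aio.c) of how a
 * low-level waiter, e.g. an IO method's ->wait_one() callback, might use
 * pgaio_io_was_recycled(); wait_for_one_completion() is a hypothetical
 * method-specific primitive:
 *
 *		PgAioHandleState state;
 *
 *		while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
 *		{
 *			if (state == PGAIO_HS_COMPLETED_SHARED ||
 *				state == PGAIO_HS_COMPLETED_LOCAL)
 *				break;						-- the IO we care about is done
 *			wait_for_one_completion();		-- e.g. drain kernel completions
 *		}
 */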
570 : :
571 : : /*
572 : : * Wait for IO to complete. External code should never use this; outside of
573 : : * the AIO subsystem, waits are only allowed via pgaio_wref_wait().
574 : : */
575 : : static void
576 : 277556 : pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
577 : : {
578 : : PgAioHandleState state;
579 : : bool am_owner;
580 : :
581 : 277556 : am_owner = ioh->owner_procno == MyProcNumber;
582 : :
583 [ + + ]: 277556 : if (pgaio_io_was_recycled(ioh, ref_generation, &state))
584 : 35 : return;
585 : :
586 [ + + ]: 277521 : if (am_owner)
587 : : {
588 [ + + ]: 276132 : if (state != PGAIO_HS_SUBMITTED
589 [ + + ]: 13328 : && state != PGAIO_HS_COMPLETED_IO
590 [ + - ]: 139 : && state != PGAIO_HS_COMPLETED_SHARED
173 andres@anarazel.de 591 [ # # ]:UBC 0 : && state != PGAIO_HS_COMPLETED_LOCAL)
592 : : {
110 593 [ # # ]: 0 : elog(PANIC, "waiting for own IO %d in wrong state: %s",
594 : : pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh));
595 : : }
596 : : }
597 : :
598 : : while (true)
599 : : {
173 andres@anarazel.de 600 [ + + ]:CBC 554875 : if (pgaio_io_was_recycled(ioh, ref_generation, &state))
601 : 1227 : return;
602 : :
10 603 [ - + + + : 553648 : switch ((PgAioHandleState) state)
- ]
604 : : {
173 andres@anarazel.de 605 :UBC 0 : case PGAIO_HS_IDLE:
606 : : case PGAIO_HS_HANDED_OUT:
607 [ # # ]: 0 : elog(ERROR, "IO in wrong state: %d", state);
608 : : break;
609 : :
173 andres@anarazel.de 610 :CBC 263678 : case PGAIO_HS_SUBMITTED:
611 : :
612 : : /*
613 : : * If we need to wait via the IO method, do so now. Don't
614 : : * check via the IO method if the issuing backend is executing
615 : : * the IO synchronously.
616 : : */
617 [ + + + - ]: 263678 : if (pgaio_method_ops->wait_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS))
618 : : {
619 : 383 : pgaio_method_ops->wait_one(ioh, ref_generation);
620 : 383 : continue;
621 : : }
622 : : /* fallthrough */
623 : :
624 : : /* waiting for owner to submit */
625 : : case PGAIO_HS_DEFINED:
626 : : case PGAIO_HS_STAGED:
627 : : /* waiting for reaper to complete */
628 : : /* fallthrough */
629 : : case PGAIO_HS_COMPLETED_IO:
630 : : /* shouldn't be able to hit this otherwise */
631 [ - + ]: 276972 : Assert(IsUnderPostmaster);
632 : : /* ensure we're going to get woken up */
633 : 276972 : ConditionVariablePrepareToSleep(&ioh->cv);
634 : :
635 [ + + ]: 553748 : while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
636 : : {
637 [ + + ]: 552907 : if (state == PGAIO_HS_COMPLETED_SHARED ||
638 [ + + ]: 276783 : state == PGAIO_HS_COMPLETED_LOCAL)
639 : : break;
640 : 276777 : ConditionVariableSleep(&ioh->cv, WAIT_EVENT_AIO_IO_COMPLETION);
641 : : }
642 : :
643 : 276971 : ConditionVariableCancelSleep();
644 : 276971 : break;
645 : :
646 : 276293 : case PGAIO_HS_COMPLETED_SHARED:
647 : : case PGAIO_HS_COMPLETED_LOCAL:
648 : :
649 : : /*
650 : : * Note that no interrupts are processed between
651 : : * pgaio_io_was_recycled() and this check - that's important
652 : : * as otherwise an interrupt could have already reclaimed the
653 : : * handle.
654 : : */
655 [ + + ]: 276293 : if (am_owner)
656 : 275751 : pgaio_io_reclaim(ioh);
657 : 276293 : return;
658 : : }
659 : : }
660 : : }
661 : :
662 : : /*
663 : : * Make IO handle ready to be reused after IO has completed or after the
664 : : * handle has been released without being used.
665 : : *
666 : : * Note that callers need to be careful about only calling this in the right
667 : : * state and that no interrupts can be processed between the state check and
668 : : * the call to pgaio_io_reclaim(). Otherwise interrupt processing could
669 : : * already have reclaimed the handle.
670 : : */
671 : : static void
672 : 1247535 : pgaio_io_reclaim(PgAioHandle *ioh)
673 : : {
674 : : /* This is only ok if it's our IO */
675 [ - + ]: 1247535 : Assert(ioh->owner_procno == MyProcNumber);
676 [ - + ]: 1247535 : Assert(ioh->state != PGAIO_HS_IDLE);
677 : :
678 : : /* see comment in function header */
110 679 : 1247535 : HOLD_INTERRUPTS();
680 : :
681 : : /*
682 : : * It's a bit ugly, but right now the easiest place to put the execution
683 : : * of local completion callbacks is this function, as we need to execute
684 : : * local callbacks just before reclaiming at multiple callsites.
685 : : */
173 686 [ + + ]: 1247535 : if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
687 : : {
688 : : PgAioResult local_result;
689 : :
164 690 : 1245537 : local_result = pgaio_io_call_complete_local(ioh);
173 691 : 1245537 : pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_LOCAL);
692 : :
164 693 [ + + ]: 1245537 : if (ioh->report_return)
694 : : {
695 : 1245512 : ioh->report_return->result = local_result;
696 : 1245512 : ioh->report_return->target_data = ioh->target_data;
697 : : }
698 : : }
699 : :
173 700 [ - + ]: 1247535 : pgaio_debug_io(DEBUG4, ioh,
701 : : "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
702 : : pgaio_result_status_string(ioh->distilled_result.status),
703 : : ioh->distilled_result.id,
704 : : ioh->distilled_result.error_data,
705 : : ioh->result);
706 : :
707 : : /* if the IO has been defined, it's on the in-flight list, remove */
708 [ + + ]: 1247535 : if (ioh->state != PGAIO_HS_HANDED_OUT)
709 : 1245537 : dclist_delete_from(&pgaio_my_backend->in_flight_ios, &ioh->node);
710 : :
711 [ + + ]: 1247535 : if (ioh->resowner)
712 : : {
713 : 1247468 : ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
714 : 1247468 : ioh->resowner = NULL;
715 : : }
716 : :
717 [ - + ]: 1247535 : Assert(!ioh->resowner);
718 : :
719 : : /*
720 : : * Update generation & state first, before resetting the IO's fields,
721 : : * otherwise a concurrent "viewer" could think the fields are valid, even
722 : : * though they are being reset. Increment the generation first, so that
723 : : * we can assert elsewhere that we never wait for an IDLE IO. While it's
724 : : * a bit weird for the state to go backwards for a generation, it's OK
725 : : * here, as there cannot be references to the "reborn" IO yet. Can't
726 : : * update both at once, so something has to give.
727 : : */
134 728 : 1247535 : ioh->generation++;
729 : 1247535 : pgaio_io_update_state(ioh, PGAIO_HS_IDLE);
730 : :
731 : : /* ensure the state update is visible before we reset fields */
732 : 1247535 : pg_write_barrier();
733 : :
173 734 : 1247535 : ioh->op = PGAIO_OP_INVALID;
735 : 1247535 : ioh->target = PGAIO_TID_INVALID;
736 : 1247535 : ioh->flags = 0;
737 : 1247535 : ioh->num_callbacks = 0;
738 : 1247535 : ioh->handle_data_len = 0;
739 : 1247535 : ioh->report_return = NULL;
740 : 1247535 : ioh->result = 0;
168 741 : 1247535 : ioh->distilled_result.status = PGAIO_RS_UNKNOWN;
742 : :
743 : : /*
744 : : * We push the IO to the head of the idle IO list; that seems more cache
745 : : * efficient in cases where only a few IOs are used.
746 : : */
173 747 : 1247535 : dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node);
748 : :
110 749 [ - + ]: 1247535 : RESUME_INTERRUPTS();
173 750 : 1247535 : }
751 : :
752 : : /*
753 : : * Wait for an IO handle to become usable.
754 : : *
755 : : * This is only really useful for pgaio_io_acquire().
756 : : */
757 : : static void
758 : 3415 : pgaio_io_wait_for_free(void)
759 : : {
760 : 3415 : int reclaimed = 0;
761 : :
96 peter@eisentraut.org 762 [ + + ]: 3415 : pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs",
763 : : pgaio_my_backend->num_staged_ios,
764 : : dclist_count(&pgaio_my_backend->in_flight_ios),
765 : : dclist_count(&pgaio_my_backend->idle_ios));
766 : :
767 : : /*
768 : : * First check if any of our IOs actually have completed - when using
769 : : * worker, that'll often be the case. We could do so as part of the loop
770 : : * below, but that'd potentially lead us to wait for an IO that was
771 : : * submitted earlier.
772 : : */
173 andres@anarazel.de 773 [ + + ]: 6830 : for (int i = 0; i < io_max_concurrency; i++)
774 : : {
775 : 3415 : PgAioHandle *ioh = &pgaio_ctl->io_handles[pgaio_my_backend->io_handle_off + i];
776 : :
777 [ + + ]: 3415 : if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
778 : : {
779 : : /*
780 : : * Note that no interrupts are processed between the state check
781 : : * and the call to reclaim - that's important as otherwise an
782 : : * interrupt could have already reclaimed the handle.
783 : : *
784 : : * Need to ensure that there's no reordering, in the more common
785 : : * paths, where we wait for IO, that's done by
786 : : * pgaio_io_was_recycled().
787 : : */
82 788 : 2275 : pg_read_barrier();
173 789 : 2275 : pgaio_io_reclaim(ioh);
790 : 2275 : reclaimed++;
791 : : }
792 : : }
793 : :
794 [ + + ]: 3415 : if (reclaimed > 0)
795 : 2275 : return;
796 : :
797 : : /*
798 : : * If we have any unsubmitted IOs, submit them now. We'll start waiting in
799 : : * a second, so it's better they're in flight. This also addresses the
800 : : * edge-case that all IOs are unsubmitted.
801 : : */
802 [ - + ]: 1140 : if (pgaio_my_backend->num_staged_ios > 0)
173 andres@anarazel.de 803 :UBC 0 : pgaio_submit_staged();
804 : :
805 : : /* possibly some IOs finished during submission */
110 andres@anarazel.de 806 [ - + ]:CBC 1140 : if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
110 andres@anarazel.de 807 :UBC 0 : return;
808 : :
173 andres@anarazel.de 809 [ - + ]:CBC 1140 : if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0)
134 andres@anarazel.de 810 [ # # ]:UBC 0 : ereport(ERROR,
811 : : errmsg_internal("no free IOs despite no in-flight IOs"),
812 : : errdetail_internal("%d pending, %u in-flight, %u idle IOs",
813 : : pgaio_my_backend->num_staged_ios,
814 : : dclist_count(&pgaio_my_backend->in_flight_ios),
815 : : dclist_count(&pgaio_my_backend->idle_ios)));
816 : :
817 : : /*
818 : : * Wait for the oldest in-flight IO to complete.
819 : : *
820 : : * XXX: Reusing the general IO wait is suboptimal, we don't need to wait
821 : : * for that specific IO to complete, we just need *any* IO to complete.
822 : : */
823 : : {
173 andres@anarazel.de 824 :CBC 1140 : PgAioHandle *ioh = dclist_head_element(PgAioHandle, node,
825 : : &pgaio_my_backend->in_flight_ios);
110 826 : 1140 : uint64 generation = ioh->generation;
827 : :
10 828 [ - + + - ]: 1140 : switch ((PgAioHandleState) ioh->state)
829 : : {
830 : : /* should not be in in-flight list */
173 andres@anarazel.de 831 :UBC 0 : case PGAIO_HS_IDLE:
832 : : case PGAIO_HS_DEFINED:
833 : : case PGAIO_HS_HANDED_OUT:
834 : : case PGAIO_HS_STAGED:
835 : : case PGAIO_HS_COMPLETED_LOCAL:
836 [ # # ]: 0 : elog(ERROR, "shouldn't get here with io:%d in state %d",
837 : : pgaio_io_get_id(ioh), ioh->state);
838 : : break;
839 : :
173 andres@anarazel.de 840 :CBC 1139 : case PGAIO_HS_COMPLETED_IO:
841 : : case PGAIO_HS_SUBMITTED:
842 [ + + ]: 1139 : pgaio_debug_io(DEBUG2, ioh,
843 : : "waiting for free io with %u in flight",
844 : : dclist_count(&pgaio_my_backend->in_flight_ios));
845 : :
846 : : /*
847 : : * In a more general case this would be racy, because the
848 : : * generation could increase after we read ioh->state above.
849 : : * But we are only looking at IOs by the current backend and
850 : : * the IO can only be recycled by this backend. Even this is
851 : : * only OK because we get the handle's generation before
852 : : * potentially processing interrupts, e.g. as part of
853 : : * pgaio_debug_io().
854 : : */
110 855 : 1139 : pgaio_io_wait(ioh, generation);
173 856 : 1139 : break;
857 : :
858 : 1 : case PGAIO_HS_COMPLETED_SHARED:
859 : :
860 : : /*
861 : : * It's possible that another backend just finished this IO.
862 : : *
863 : : * Note that no interrupts are processed between the state
864 : : * check and the call to reclaim - that's important as
865 : : * otherwise an interrupt could have already reclaimed the
866 : : * handle.
867 : : *
868 : : * Need to ensure that there's no reordering, in the more
869 : : * common paths, where we wait for IO, that's done by
870 : : * pgaio_io_was_recycled().
871 : : */
82 872 : 1 : pg_read_barrier();
173 873 : 1 : pgaio_io_reclaim(ioh);
874 : 1 : break;
875 : : }
876 : :
877 [ - + ]: 1140 : if (dclist_count(&pgaio_my_backend->idle_ios) == 0)
173 andres@anarazel.de 878 [ # # ]:UBC 0 : elog(PANIC, "no idle IO after waiting for IO to terminate");
173 andres@anarazel.de 879 :CBC 1140 : return;
880 : : }
881 : : }
882 : :
883 : : /*
884 : : * Internal - code outside of AIO should never need this and it'd be hard for
885 : : * such code to be safe.
886 : : */
887 : : static PgAioHandle *
888 : 1795363 : pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
889 : : {
890 : : PgAioHandle *ioh;
891 : :
892 [ - + ]: 1795363 : Assert(iow->aio_index < pgaio_ctl->io_handle_count);
893 : :
894 : 1795363 : ioh = &pgaio_ctl->io_handles[iow->aio_index];
895 : :
896 : 1795363 : *ref_generation = ((uint64) iow->generation_upper) << 32 |
897 : 1795363 : iow->generation_lower;
898 : :
899 [ - + ]: 1795363 : Assert(*ref_generation != 0);
900 : :
901 : 1795363 : return ioh;
902 : : }
903 : :
904 : : static const char *
905 : 11006 : pgaio_io_state_get_name(PgAioHandleState s)
906 : : {
907 : : #define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
10 908 [ + + + + : 11006 : switch ((PgAioHandleState) s)
+ + + -
- ]
909 : : {
173 910 : 331 : PGAIO_HS_TOSTR_CASE(IDLE);
911 : 3420 : PGAIO_HS_TOSTR_CASE(HANDED_OUT);
912 : 1710 : PGAIO_HS_TOSTR_CASE(DEFINED);
913 : 1710 : PGAIO_HS_TOSTR_CASE(STAGED);
914 : 401 : PGAIO_HS_TOSTR_CASE(SUBMITTED);
915 : 1710 : PGAIO_HS_TOSTR_CASE(COMPLETED_IO);
916 : 1724 : PGAIO_HS_TOSTR_CASE(COMPLETED_SHARED);
173 andres@anarazel.de 917 :UBC 0 : PGAIO_HS_TOSTR_CASE(COMPLETED_LOCAL);
918 : : }
919 : : #undef PGAIO_HS_TOSTR_CASE
920 : :
921 : 0 : return NULL; /* silence compiler */
922 : : }
923 : :
924 : : const char *
173 andres@anarazel.de 925 :CBC 11006 : pgaio_io_get_state_name(PgAioHandle *ioh)
926 : : {
927 : 11006 : return pgaio_io_state_get_name(ioh->state);
928 : : }
929 : :
930 : : const char *
931 : 3420 : pgaio_result_status_string(PgAioResultStatus rs)
932 : : {
10 933 [ - + + - : 3420 : switch ((PgAioResultStatus) rs)
+ - ]
934 : : {
168 andres@anarazel.de 935 :UBC 0 : case PGAIO_RS_UNKNOWN:
173 936 : 0 : return "UNKNOWN";
168 andres@anarazel.de 937 :CBC 3213 : case PGAIO_RS_OK:
173 938 : 3213 : return "OK";
160 939 : 102 : case PGAIO_RS_WARNING:
940 : 102 : return "WARNING";
168 andres@anarazel.de 941 :UBC 0 : case PGAIO_RS_PARTIAL:
173 942 : 0 : return "PARTIAL";
168 andres@anarazel.de 943 :CBC 105 : case PGAIO_RS_ERROR:
173 944 : 105 : return "ERROR";
945 : : }
946 : :
173 andres@anarazel.de 947 :UBC 0 : return NULL; /* silence compiler */
948 : : }
949 : :
950 : :
951 : :
952 : : /* --------------------------------------------------------------------------------
953 : : * Functions primarily related to IO Wait References
954 : : * --------------------------------------------------------------------------------
955 : : */
956 : :
957 : : /*
958 : : * Mark a wait reference as invalid
959 : : */
960 : : void
173 andres@anarazel.de 961 :CBC 12184763 : pgaio_wref_clear(PgAioWaitRef *iow)
962 : : {
963 : 12184763 : iow->aio_index = PG_UINT32_MAX;
964 : 12184763 : }
965 : :
966 : : /* Is the wait reference valid? */
967 : : bool
968 : 3791351 : pgaio_wref_valid(PgAioWaitRef *iow)
969 : : {
970 : 3791351 : return iow->aio_index != PG_UINT32_MAX;
971 : : }
972 : :
973 : : /*
974 : : * Similar to pgaio_io_get_id(), just for wait references.
975 : : */
976 : : int
173 andres@anarazel.de 977 :UBC 0 : pgaio_wref_get_id(PgAioWaitRef *iow)
978 : : {
979 [ # # ]: 0 : Assert(pgaio_wref_valid(iow));
980 : 0 : return iow->aio_index;
981 : : }
982 : :
983 : : /*
984 : : * Wait for the IO to have completed. Can be called in any process, not just
985 : : * in the issuing backend.
986 : : */
987 : : void
173 andres@anarazel.de 988 :CBC 276396 : pgaio_wref_wait(PgAioWaitRef *iow)
989 : : {
990 : : uint64 ref_generation;
991 : : PgAioHandle *ioh;
992 : :
993 : 276396 : ioh = pgaio_io_from_wref(iow, &ref_generation);
994 : :
995 : 276396 : pgaio_io_wait(ioh, ref_generation);
996 : 276395 : }
997 : :
998 : : /*
999 : : * Check if the referenced IO completed, without blocking.
1000 : : */
1001 : : bool
1002 : 1518967 : pgaio_wref_check_done(PgAioWaitRef *iow)
1003 : : {
1004 : : uint64 ref_generation;
1005 : : PgAioHandleState state;
1006 : : bool am_owner;
1007 : : PgAioHandle *ioh;
1008 : :
1009 : 1518967 : ioh = pgaio_io_from_wref(iow, &ref_generation);
1010 : :
1011 [ + + ]: 1518967 : if (pgaio_io_was_recycled(ioh, ref_generation, &state))
1012 : 970202 : return true;
1013 : :
1014 [ - + ]: 548765 : if (state == PGAIO_HS_IDLE)
173 andres@anarazel.de 1015 :UBC 0 : return true;
1016 : :
173 andres@anarazel.de 1017 :CBC 548765 : am_owner = ioh->owner_procno == MyProcNumber;
1018 : :
1019 [ + + ]: 548765 : if (state == PGAIO_HS_COMPLETED_SHARED ||
1020 [ - + ]: 274928 : state == PGAIO_HS_COMPLETED_LOCAL)
1021 : : {
1022 : : /*
1023 : : * Note that no interrupts are processed between
1024 : : * pgaio_io_was_recycled() and this check - that's important as
1025 : : * otherwise an interrupt could have already reclaimed the handle.
1026 : : */
1027 [ + - ]: 273837 : if (am_owner)
1028 : 273837 : pgaio_io_reclaim(ioh);
1029 : 273837 : return true;
1030 : : }
1031 : :
1032 : : /*
1033 : : * XXX: It likely would be worth checking in with the io method, to give
1034 : : * the IO method a chance to check if there are completion events queued.
1035 : : */
1036 : :
1037 : 274928 : return false;
1038 : : }
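/*
 * Illustrative sketch (editorial addition, not part of aio.c): using a wait
 * reference to overlap IO with other work instead of blocking immediately;
 * do_other_work() is a hypothetical placeholder:
 *
 *		PgAioWaitRef iow;
 *
 *		pgaio_wref_clear(&iow);
 *		-- ... pgaio_io_get_wref(ioh, &iow) when an IO is actually started ...
 *
 *		if (pgaio_wref_valid(&iow))
 *		{
 *			while (!pgaio_wref_check_done(&iow))
 *				do_other_work();			-- or pgaio_wref_wait(&iow) to block
 *		}
 */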
1039 : :
1040 : :
1041 : :
1042 : : /* --------------------------------------------------------------------------------
1043 : : * Actions on multiple IOs.
1044 : : * --------------------------------------------------------------------------------
1045 : : */
1046 : :
1047 : : /*
1048 : : * Submit IOs in batches going forward.
1049 : : *
1050 : : * Submitting multiple IOs at once can be substantially faster than doing so
1051 : : * one-by-one. At the same time, submitting multiple IOs at once requires more
1052 : : * care to avoid deadlocks.
1053 : : *
1054 : : * Consider backend A staging an IO for buffer 1 and then trying to start IO
1055 : : * on buffer 2, while backend B does the inverse. If A submitted the IO before
1056 : : * moving on to buffer 2, this works just fine, B will wait for the IO to
1057 : : * moving on to buffer 2, this works just fine: B will wait for the IO to
1058 : : * complete. But if batching were used, each backend would wait for IO that has
1059 : : *
1060 : : * End batch submission mode with pgaio_exit_batchmode(). (Throwing errors is
1061 : : * allowed; error recovery will end the batch.)
1062 : : *
1063 : : * To avoid deadlocks, code needs to ensure that it will not wait for another
1064 : : * backend while there is unsubmitted IO, e.g. by using conditional lock
1065 : : * acquisition when acquiring buffer locks. To check if there currently are
1066 : : * staged IOs, call pgaio_have_staged() and to submit all staged IOs call
1067 : : * pgaio_submit_staged().
1068 : : *
1069 : : * It is not allowed to enter batchmode while already in batchmode; that's
1070 : : * unlikely to ever be needed, as code needs to be explicitly aware of being
1071 : : * called in batchmode, to avoid the deadlock risks explained above.
1072 : : *
1073 : : * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
1074 : : * e.g. because too many IOs have been staged or because pgaio_submit_staged()
1075 : : * was called.
1076 : : */
1077 : : void
1078 : 2482881 : pgaio_enter_batchmode(void)
1079 : : {
1080 [ - + ]: 2482881 : if (pgaio_my_backend->in_batchmode)
173 andres@anarazel.de 1081 [ # # ]:UBC 0 : elog(ERROR, "starting batch while batch already in progress");
173 andres@anarazel.de 1082 :CBC 2482881 : pgaio_my_backend->in_batchmode = true;
1083 : 2482881 : }
1084 : :
1085 : : /*
1086 : : * Stop submitting IOs in batches.
1087 : : */
1088 : : void
1089 : 2482869 : pgaio_exit_batchmode(void)
1090 : : {
1091 [ - + ]: 2482869 : Assert(pgaio_my_backend->in_batchmode);
1092 : :
1093 : 2482869 : pgaio_submit_staged();
1094 : 2482869 : pgaio_my_backend->in_batchmode = false;
1095 : 2482869 : }
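/*
 * Illustrative sketch (editorial addition, not part of aio.c) of batch
 * submission as described above pgaio_enter_batchmode(); start_one_io() and
 * might_block() are hypothetical caller-specific helpers:
 *
 *		pgaio_enter_batchmode();
 *		for (int i = 0; i < nios; i++)
 *		{
 *			-- if the next step might have to wait for another backend,
 *			-- submit first to avoid the deadlock described above:
 *			if (pgaio_have_staged() && might_block(i))
 *				pgaio_submit_staged();
 *			start_one_io(i);			-- stages; auto-submits once
 *										-- PGAIO_SUBMIT_BATCH_SIZE is reached
 *		}
 *		pgaio_exit_batchmode();			-- submits anything still staged
 */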
1096 : :
1097 : : /*
1098 : : * Are there staged but unsubmitted IOs?
1099 : : *
1100 : : * See comment above pgaio_enter_batchmode() for why code may need to check if
1101 : : * there is IO in that state.
1102 : : */
1103 : : bool
1104 : 1247388 : pgaio_have_staged(void)
1105 : : {
1106 [ + + - + ]: 1247388 : Assert(pgaio_my_backend->in_batchmode ||
1107 : : pgaio_my_backend->num_staged_ios == 0);
1108 : 1247388 : return pgaio_my_backend->num_staged_ios > 0;
1109 : : }
1110 : :
1111 : : /*
1112 : : * Submit all staged but not yet submitted IOs.
1113 : : *
1114 : : * Unless in batch mode, this never needs to be called, as IOs get submitted
1115 : : * as soon as possible. While in batchmode, pgaio_submit_staged() can be called
1116 : : * before waiting on another backend, to avoid the risk of deadlocks. See
1117 : : * pgaio_enter_batchmode().
1118 : : */
1119 : : void
1120 : 2512950 : pgaio_submit_staged(void)
1121 : : {
1122 : 2512950 : int total_submitted = 0;
1123 : : int did_submit;
1124 : :
1125 [ + + ]: 2512950 : if (pgaio_my_backend->num_staged_ios == 0)
1126 : 1961244 : return;
1127 : :
1128 : :
1129 : 551706 : START_CRIT_SECTION();
1130 : :
1131 : 551706 : did_submit = pgaio_method_ops->submit(pgaio_my_backend->num_staged_ios,
1132 : 551706 : pgaio_my_backend->staged_ios);
1133 : :
1134 [ - + ]: 551706 : END_CRIT_SECTION();
1135 : :
1136 : 551706 : total_submitted += did_submit;
1137 : :
1138 [ - + ]: 551706 : Assert(total_submitted == did_submit);
1139 : :
1140 : 551706 : pgaio_my_backend->num_staged_ios = 0;
1141 : :
1142 [ - + ]: 551706 : pgaio_debug(DEBUG4,
1143 : : "aio: submitted %d IOs",
1144 : : total_submitted);
1145 : : }
1146 : :
1147 : :
1148 : :
1149 : : /* --------------------------------------------------------------------------------
1150 : : * Other
1151 : : * --------------------------------------------------------------------------------
1152 : : */
1153 : :
1154 : :
1155 : : /*
1156 : : * Perform AIO related cleanup after an error.
1157 : : *
1158 : : * This should be called early in the error recovery paths, as later steps may
1159 : : * need to issue AIO (e.g. to record a transaction abort WAL record).
1160 : : */
1161 : : void
1162 : 29660 : pgaio_error_cleanup(void)
1163 : : {
1164 : : /*
1165 : : * It is possible that code errored out after pgaio_enter_batchmode() but
1166 : : * before pgaio_exit_batchmode() was called. In that case we need to
1167 : : * submit the IO now.
1168 : : */
1169 [ + + ]: 29660 : if (pgaio_my_backend->in_batchmode)
1170 : : {
1171 : 12 : pgaio_my_backend->in_batchmode = false;
1172 : :
1173 : 12 : pgaio_submit_staged();
1174 : : }
1175 : :
1176 : : /*
1177 : : * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1178 : : */
1179 [ - + ]: 29660 : Assert(pgaio_my_backend->num_staged_ios == 0);
1180 : 29660 : }
1181 : :
1182 : : /*
1183 : : * Perform AIO related checks at (sub-)transactional boundaries.
1184 : : *
1185 : : * This should be called late during (sub-)transactional commit/abort, after
1186 : : * all steps that might need to perform AIO, so that we can verify that the
1187 : : * AIO subsystem is in a valid state at the end of a transaction.
1188 : : */
1189 : : void
1190 : 340517 : AtEOXact_Aio(bool is_commit)
1191 : : {
1192 : : /*
1193 : : * We should never be in batch mode at transactional boundaries. In case
1194 : : * an error was thrown while in batch mode, pgaio_error_cleanup() should
1195 : : * have exited batchmode.
1196 : : *
1197 : : * In case we are in batchmode somehow, make sure to submit all staged
1198 : : * IOs; other backends may need them to complete before they can continue.
1199 : : */
1200 [ + + ]: 340517 : if (pgaio_my_backend->in_batchmode)
1201 : : {
1202 : 6 : pgaio_error_cleanup();
1203 [ + - ]: 6 : elog(WARNING, "open AIO batch at end of (sub-)transaction");
1204 : : }
1205 : :
1206 : : /*
1207 : : * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1208 : : */
1209 [ - + ]: 340517 : Assert(pgaio_my_backend->num_staged_ios == 0);
1210 : 340517 : }
1211 : :
1212 : : /*
1213 : : * Need to submit staged but not yet submitted IOs using the fd; otherwise
1214 : : * the IO would end up targeting something bogus.
1215 : : */
1216 : : void
1217 : 885234 : pgaio_closing_fd(int fd)
1218 : : {
1219 : : /*
1220 : : * Might be called before AIO is initialized or in a subprocess that
1221 : : * doesn't use AIO.
1222 : : */
1223 [ + + ]: 885234 : if (!pgaio_my_backend)
1224 : 8115 : return;
1225 : :
1226 : : /*
1227 : : * For now just submit all staged IOs - we could be more selective, but
1228 : : * it's probably not worth it.
1229 : : */
134 1230 [ + + ]: 877119 : if (pgaio_my_backend->num_staged_ios > 0)
1231 : : {
1232 [ + - ]: 4 : pgaio_debug(DEBUG2,
1233 : : "submitting %d IOs before FD %d gets closed",
1234 : : pgaio_my_backend->num_staged_ios, fd);
1235 : 4 : pgaio_submit_staged();
1236 : : }
1237 : :
1238 : : /*
1239 : : * If requested by the IO method, wait for all IOs that use the
1240 : : * to-be-closed FD.
1241 : : */
172 1242 [ + + ]: 877119 : if (pgaio_method_ops->wait_on_fd_before_close)
1243 : : {
1244 : : /*
1245 : : * As waiting for one IO to complete may complete multiple IOs, we
1246 : : * can't just use a mutable list iterator. The maximum number of
1247 : : * in-flight IOs is fairly small, so just restart the loop after
1248 : : * waiting for an IO.
1249 : : */
1250 [ + + ]: 5376 : while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
1251 : : {
1252 : : dlist_iter iter;
1253 : 9 : PgAioHandle *ioh = NULL;
1254 : : uint64 generation;
1255 : :
1256 [ + - + - ]: 9 : dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
1257 : : {
1258 : 9 : ioh = dclist_container(PgAioHandle, node, iter.cur);
1259 : :
110 1260 : 9 : generation = ioh->generation;
1261 : :
172 1262 [ + - ]: 9 : if (pgaio_io_uses_fd(ioh, fd))
1263 : 9 : break;
1264 : : else
172 andres@anarazel.de 1265 :UBC 0 : ioh = NULL;
1266 : : }
1267 : :
172 andres@anarazel.de 1268 [ - + ]:CBC 9 : if (!ioh)
172 andres@anarazel.de 1269 :UBC 0 : break;
1270 : :
134 andres@anarazel.de 1271 [ + - ]:CBC 9 : pgaio_debug_io(DEBUG2, ioh,
1272 : : "waiting for IO before FD %d gets closed, %u in-flight IOs",
1273 : : fd, dclist_count(&pgaio_my_backend->in_flight_ios));
1274 : :
1275 : : /* see comment in pgaio_io_wait_for_free() about raciness */
110 1276 : 9 : pgaio_io_wait(ioh, generation);
1277 : : }
1278 : : }
1279 : : }
1280 : :
1281 : : /*
1282 : : * Registered as before_shmem_exit() callback in pgaio_init_backend()
1283 : : */
1284 : : void
173 1285 : 17257 : pgaio_shutdown(int code, Datum arg)
1286 : : {
1287 [ - + ]: 17257 : Assert(pgaio_my_backend);
1288 [ - + ]: 17257 : Assert(!pgaio_my_backend->handed_out_io);
1289 : :
1290 : : /* first clean up resources as we would at a transaction boundary */
1291 : 17257 : AtEOXact_Aio(code == 0);
1292 : :
1293 : : /*
1294 : : * Before exiting, make sure that all IOs are finished. That has two main
1295 : : * purposes:
1296 : : *
1297 : : * - Some kernel-level AIO mechanisms don't deal well with the issuer of
1298 : : * an AIO exiting before IO completed
1299 : : *
1300 : : * - It'd be confusing to see partially finished IOs in stats views etc
1301 : : */
1302 [ + + ]: 17269 : while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
1303 : : {
1304 : 12 : PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios);
110 1305 : 12 : uint64 generation = ioh->generation;
1306 : :
134 1307 [ + + ]: 12 : pgaio_debug_io(DEBUG2, ioh,
1308 : : "waiting for IO to complete during shutdown, %u in-flight IOs",
1309 : : dclist_count(&pgaio_my_backend->in_flight_ios));
1310 : :
1311 : : /* see comment in pgaio_io_wait_for_free() about raciness */
110 1312 : 12 : pgaio_io_wait(ioh, generation);
1313 : : }
1314 : :
173 1315 : 17257 : pgaio_my_backend = NULL;
1316 : 17257 : }
1317 : :
1318 : : void
1319 : 1083 : assign_io_method(int newval, void *extra)
1320 : : {
1321 [ - + ]: 1083 : Assert(pgaio_method_ops_table[newval] != NULL);
1322 [ - + ]: 1083 : Assert(newval < lengthof(io_method_options));
1323 : :
1324 : 1083 : pgaio_method_ops = pgaio_method_ops_table[newval];
1325 : 1083 : }
1326 : :
1327 : : bool
1328 : 2095 : check_io_max_concurrency(int *newval, void **extra, GucSource source)
1329 : : {
1330 [ + + ]: 2095 : if (*newval == -1)
1331 : : {
1332 : : /*
1333 : : * Auto-tuning will be applied later during startup, as auto-tuning
1334 : : * depends on the value of various GUCs.
1335 : : */
1336 : 1067 : return true;
1337 : : }
1338 [ - + ]: 1028 : else if (*newval == 0)
1339 : : {
173 andres@anarazel.de 1340 :UBC 0 : GUC_check_errdetail("Only -1 or values bigger than 0 are valid.");
1341 : 0 : return false;
1342 : : }
1343 : :
173 andres@anarazel.de 1344 :CBC 1028 : return true;
1345 : : }
|