Age Owner Branch data TLA Line data Source code
1 : : /*-------------------------------------------------------------------------
2 : : *
3 : : * aio.c
4 : : * AIO - Core Logic
5 : : *
6 : : * For documentation about how AIO works on a higher level, including a
7 : : * schematic example, see README.md.
8 : : *
9 : : *
10 : : * AIO is a complicated subsystem. To keep things navigable, it is split
11 : : * across a number of files:
12 : : *
13 : : * - method_*.c - different ways of executing AIO (e.g. worker process)
14 : : *
15 : : * - aio_target.c - IO on different kinds of targets
16 : : *
17 : : * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
18 : : *
19 : : * - aio_callback.c - callbacks at IO operation lifecycle events
20 : : *
21 : : * - aio_init.c - per-server and per-backend initialization
22 : : *
23 : : * - aio.c - all other topics
24 : : *
25 : : * - read_stream.c - helper for reading buffered relation data
26 : : *
27 : : * - README.md - higher-level overview of AIO
28 : : *
29 : : *
30 : : * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
31 : : * Portions Copyright (c) 1994, Regents of the University of California
32 : : *
33 : : * IDENTIFICATION
34 : : * src/backend/storage/aio/aio.c
35 : : *
36 : : *-------------------------------------------------------------------------
37 : : */
38 : :
39 : : #include "postgres.h"
40 : :
41 : : #include "lib/ilist.h"
42 : : #include "miscadmin.h"
43 : : #include "port/atomics.h"
44 : : #include "storage/aio.h"
45 : : #include "storage/aio_internal.h"
46 : : #include "storage/aio_subsys.h"
47 : : #include "utils/guc.h"
48 : : #include "utils/guc_hooks.h"
49 : : #include "utils/injection_point.h"
50 : : #include "utils/resowner.h"
51 : : #include "utils/wait_event_types.h"
52 : :
53 : :
54 : : static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state);
55 : : static void pgaio_io_reclaim(PgAioHandle *ioh);
56 : : static void pgaio_io_resowner_register(PgAioHandle *ioh, struct ResourceOwnerData *resowner);
57 : : static void pgaio_io_wait_for_free(void);
58 : : static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
59 : : static const char *pgaio_io_state_get_name(PgAioHandleState s);
60 : : static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
61 : :
62 : :
63 : : /* Options for io_method. */
64 : : const struct config_enum_entry io_method_options[] = {
65 : : {"sync", IOMETHOD_SYNC, false},
66 : : {"worker", IOMETHOD_WORKER, false},
67 : : #ifdef IOMETHOD_IO_URING_ENABLED
68 : : {"io_uring", IOMETHOD_IO_URING, false},
69 : : #endif
70 : : {NULL, 0, false}
71 : : };
72 : :
73 : : /* GUCs */
74 : : int io_method = DEFAULT_IO_METHOD;
75 : : int io_max_concurrency = -1;
76 : :
77 : : /* global control for AIO */
78 : : PgAioCtl *pgaio_ctl;
79 : :
80 : : /* current backend's per-backend state */
81 : : PgAioBackend *pgaio_my_backend;
82 : :
83 : :
84 : : static const IoMethodOps *const pgaio_method_ops_table[] = {
85 : : [IOMETHOD_SYNC] = &pgaio_sync_ops,
86 : : [IOMETHOD_WORKER] = &pgaio_worker_ops,
87 : : #ifdef IOMETHOD_IO_URING_ENABLED
88 : : [IOMETHOD_IO_URING] = &pgaio_uring_ops,
89 : : #endif
90 : : };
91 : :
92 : : StaticAssertDecl(lengthof(io_method_options) == lengthof(pgaio_method_ops_table) + 1,
93 : : "io_method_options out of sync with pgaio_method_ops_table");
94 : :
95 : : /* callbacks for the configured io_method, set by assign_io_method */
96 : : const IoMethodOps *pgaio_method_ops;
97 : :
98 : :
99 : : /* --------------------------------------------------------------------------------
100 : : * Public Functions related to PgAioHandle
101 : : * --------------------------------------------------------------------------------
102 : : */
103 : :
104 : : /*
105 : : * Acquire an AioHandle, waiting for IO completion if necessary.
106 : : *
107 : : * Each backend can only have one AIO handle that has been "handed out" to
108 : : * code, but not yet submitted or released. This restriction is necessary to
109 : : * ensure that it is possible for code to wait for an unused handle by waiting
110 : : * for in-flight IO to complete. There is a limited number of handles in each
111 : : * backend; if multiple handles could be handed out without being submitted,
112 : : * waiting for all in-flight IO to complete would not guarantee that handles
113 : : * free up.
114 : : *
115 : : * It is cheap to acquire an IO handle, unless all handles are in use. In that
116 : : * case this function waits for the oldest IO to complete. If that is not
117 : : * desirable, use pgaio_io_acquire_nb().
118 : : *
119 : : * If a handle was acquired but then does not turn out to be needed,
120 : : * e.g. because pgaio_io_acquire() is called before starting an IO in a
121 : : * critical section, the handle needs to be released with pgaio_io_release().
122 : : *
123 : : *
124 : : * To react to the completion of the IO as soon as it is known to have
125 : : * completed, callbacks can be registered with pgaio_io_register_callbacks().
126 : : *
127 : : * To actually execute IO using the returned handle, the pgaio_io_start_*()
128 : : * family of functions is used. In many cases the pgaio_io_start_*() call will
129 : : * not be done directly by code that acquired the handle, but by lower level
130 : : * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
131 : : * AIO, it typically will pass the handle to smgr.c, which will pass it on to
132 : : * md.c, on to fd.c, which then finally calls pgaio_io_start_*(). This
133 : : * forwarding allows the various layers to react to the IO's completion by
134 : : * registering callbacks. These callbacks in turn can translate a lower
135 : : * layer's result into a result understandable by a higher layer.
136 : : *
137 : : * During pgaio_io_start_*() the IO is staged (i.e. prepared for execution but
138 : : * not submitted to the kernel). Unless in batchmode
139 : : * (c.f. pgaio_enter_batchmode()), the IO will also get submitted for
140 : : * execution. Note that, whether in batchmode or not, the IO might even
141 : : * complete before the functions return.
142 : : *
143 : : * After pgaio_io_start_*() the AioHandle is "consumed" and may not be
144 : : * referenced by the IO issuing code. To e.g. wait for IO, references to the
145 : : * IO can be established with pgaio_io_get_wref() *before* pgaio_io_start_*()
146 : : * is called. pgaio_wref_wait() can be used to wait for the IO to complete.
147 : : *
148 : : *
149 : : * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
150 : : * passed to pgaio_io_acquire(). Once the issuing backend has called
151 : : * pgaio_wref_wait(), the PgAioReturn contains information about whether the
152 : : * operation succeeded and details about the first failure, if any. The error
153 : : * can be raised / logged with pgaio_result_report().
154 : : *
155 : : * The lifetime of the memory pointed to by *ret needs to be at least as long
156 : : * as the passed in resowner. If the resowner releases resources before the IO
157 : : * completes (typically due to an error), the reference to *ret will be
158 : : * cleared. In case of resowner cleanup *ret will not be updated with the
159 : : * results of the IO operation.
160 : : */
161 : : PgAioHandle *
275 andres@anarazel.de 162 :CBC 3397 : pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
163 : : {
164 : : PgAioHandle *h;
165 : :
166 : : while (true)
167 : : {
168 : 6644 : h = pgaio_io_acquire_nb(resowner, ret);
169 : :
170 [ + + ]: 6641 : if (h != NULL)
171 : 3394 : return h;
172 : :
173 : : /*
174 : : * Evidently all handles by this backend are in use. Just wait for
175 : : * some to complete.
176 : : */
177 : 3247 : pgaio_io_wait_for_free();
178 : : }
179 : : }
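/*
 * Illustrative sketch only, guarded by a hypothetical macro that is never
 * defined: the typical lifecycle of an IO issued via the functions above.
 * The pgaio_io_start_*() step happens in lower-level code that the handle is
 * passed to and is only hinted at in a comment; the pgaio_result_report()
 * call assumes the (result, target_data, elevel) signature used by its
 * callers.
 */
#ifdef PGAIO_USAGE_EXAMPLE
static void
pgaio_io_acquire_usage_example(struct ResourceOwnerData *resowner)
{
	PgAioReturn ioret;
	PgAioWaitRef iow;
	PgAioHandle *ioh;

	/* may block if all of this backend's handles are in use */
	ioh = pgaio_io_acquire(resowner, &ioret);

	/* take a wait reference *before* the handle is consumed by starting IO */
	pgaio_io_get_wref(ioh, &iow);

	/*
	 * Here callbacks would be registered and the handle passed down to the
	 * code that eventually calls one of the pgaio_io_start_*() functions.
	 * After that point only the wait reference may be used.
	 */

	/* wait for completion, then act on the distilled result */
	pgaio_wref_wait(&iow);
	if (ioret.result.status == PGAIO_RS_ERROR)
		pgaio_result_report(ioret.result, &ioret.target_data, ERROR);
}
#endif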
180 : :
181 : : /*
182 : : * Acquire an AioHandle, returning NULL if no handles are free.
183 : : *
184 : : * See pgaio_io_acquire(). The only difference is that this function will return
185 : : * NULL if there are no idle handles, instead of blocking.
186 : : */
187 : : PgAioHandle *
188 : 1330040 : pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
189 : : {
212 190 : 1330040 : PgAioHandle *ioh = NULL;
191 : :
275 192 [ - + ]: 1330040 : if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE)
193 : : {
275 andres@anarazel.de 194 [ # # ]:UBC 0 : Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE);
195 : 0 : pgaio_submit_staged();
196 : : }
197 : :
275 andres@anarazel.de 198 [ + + ]:CBC 1330040 : if (pgaio_my_backend->handed_out_io)
199 [ + - ]: 3 : elog(ERROR, "API violation: Only one IO can be handed out");
200 : :
201 : : /*
202 : : * Probably not needed today, as interrupts should not process this IO,
203 : : * but...
204 : : */
212 205 : 1330037 : HOLD_INTERRUPTS();
206 : :
275 207 [ + + ]: 1330037 : if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
208 : : {
209 : 1323543 : dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios);
210 : :
212 211 : 1323543 : ioh = dclist_container(PgAioHandle, node, ion);
212 : :
275 213 [ - + ]: 1323543 : Assert(ioh->state == PGAIO_HS_IDLE);
214 [ - + ]: 1323543 : Assert(ioh->owner_procno == MyProcNumber);
215 : :
216 : 1323543 : pgaio_io_update_state(ioh, PGAIO_HS_HANDED_OUT);
217 : 1323543 : pgaio_my_backend->handed_out_io = ioh;
218 : :
219 [ + - ]: 1323543 : if (resowner)
7 heikki.linnakangas@i 220 : 1323543 : pgaio_io_resowner_register(ioh, resowner);
221 : :
275 andres@anarazel.de 222 [ + + ]: 1323543 : if (ret)
223 : : {
224 : 1323504 : ioh->report_return = ret;
270 225 : 1323504 : ret->result.status = PGAIO_RS_UNKNOWN;
226 : : }
227 : : }
228 : :
212 229 [ - + ]: 1330037 : RESUME_INTERRUPTS();
230 : :
231 : 1330037 : return ioh;
232 : : }
233 : :
234 : : /*
235 : : * Release IO handle that turned out to not be required.
236 : : *
237 : : * See pgaio_io_acquire() for more details.
238 : : */
239 : : void
275 240 : 3012 : pgaio_io_release(PgAioHandle *ioh)
241 : : {
242 [ + + ]: 3012 : if (ioh == pgaio_my_backend->handed_out_io)
243 : : {
244 [ - + ]: 3009 : Assert(ioh->state == PGAIO_HS_HANDED_OUT);
245 [ - + ]: 3009 : Assert(ioh->resowner);
246 : :
247 : 3009 : pgaio_my_backend->handed_out_io = NULL;
248 : :
249 : : /*
250 : : * Note that no interrupts are processed between the handed_out_io
251 : : * check and the call to reclaim - that's important as otherwise an
252 : : * interrupt could have already reclaimed the handle.
253 : : */
254 : 3009 : pgaio_io_reclaim(ioh);
255 : : }
256 : : else
257 : : {
258 [ + - ]: 3 : elog(ERROR, "release in unexpected state");
259 : : }
260 : 3009 : }
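/*
 * Illustrative sketch only (hypothetical, never-compiled function): acquiring
 * a handle before entering a critical section and releasing it again if the
 * IO turns out not to be needed.  The need_io parameter stands in for
 * whatever condition the real caller evaluates.
 */
#ifdef PGAIO_USAGE_EXAMPLE
static void
pgaio_io_release_usage_example(struct ResourceOwnerData *resowner, bool need_io)
{
	/* acquire while it is still safe to block or error out */
	PgAioHandle *ioh = pgaio_io_acquire(resowner, NULL);

	START_CRIT_SECTION();

	if (need_io)
	{
		/* hand ioh to the code that stages and starts the IO */
	}

	END_CRIT_SECTION();

	/* the handle was never used, give it back */
	if (!need_io)
		pgaio_io_release(ioh);
}
#endif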
261 : :
262 : : /*
263 : : * Release IO handle during resource owner cleanup.
264 : : */
265 : : void
266 : 66 : pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
267 : : {
268 : 66 : PgAioHandle *ioh = dlist_container(PgAioHandle, resowner_node, ioh_node);
269 : :
270 [ - + ]: 66 : Assert(ioh->resowner);
271 : :
272 : : /*
273 : : * Otherwise an interrupt, in the middle of releasing the IO, could end up
274 : : * trying to wait for the IO, leading to state confusion.
275 : : */
212 276 : 66 : HOLD_INTERRUPTS();
277 : :
275 278 : 66 : ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
279 : 66 : ioh->resowner = NULL;
280 : :
112 281 [ - + - + : 66 : switch ((PgAioHandleState) ioh->state)
- ]
282 : : {
275 andres@anarazel.de 283 :UBC 0 : case PGAIO_HS_IDLE:
284 [ # # ]: 0 : elog(ERROR, "unexpected");
285 : : break;
275 andres@anarazel.de 286 :CBC 42 : case PGAIO_HS_HANDED_OUT:
287 [ - + - - ]: 42 : Assert(ioh == pgaio_my_backend->handed_out_io || pgaio_my_backend->handed_out_io == NULL);
288 : :
289 [ + - ]: 42 : if (ioh == pgaio_my_backend->handed_out_io)
290 : : {
291 : 42 : pgaio_my_backend->handed_out_io = NULL;
292 [ + + ]: 42 : if (!on_error)
293 [ + - ]: 15 : elog(WARNING, "leaked AIO handle");
294 : : }
295 : :
296 : 42 : pgaio_io_reclaim(ioh);
297 : 42 : break;
275 andres@anarazel.de 298 :UBC 0 : case PGAIO_HS_DEFINED:
299 : : case PGAIO_HS_STAGED:
300 [ # # ]: 0 : if (!on_error)
301 [ # # ]: 0 : elog(WARNING, "AIO handle was not submitted");
302 : 0 : pgaio_submit_staged();
303 : 0 : break;
275 andres@anarazel.de 304 :CBC 24 : case PGAIO_HS_SUBMITTED:
305 : : case PGAIO_HS_COMPLETED_IO:
306 : : case PGAIO_HS_COMPLETED_SHARED:
307 : : case PGAIO_HS_COMPLETED_LOCAL:
308 : : /* this is expected to happen */
309 : 24 : break;
310 : : }
311 : :
312 : : /*
313 : : * Need to unregister the reporting of the IO's result; the memory it's
314 : : * referencing has likely gone away.
315 : : */
316 [ + + ]: 66 : if (ioh->report_return)
317 : 24 : ioh->report_return = NULL;
318 : :
212 319 [ - + ]: 66 : RESUME_INTERRUPTS();
275 320 : 66 : }
321 : :
322 : : /*
323 : : * Add a [set of] flags to the IO.
324 : : *
325 : : * Note that this combines the flag with any already-set flags, rather than
326 : : * setting the flags to exactly the passed-in value. This is to allow multiple callsites
327 : : * to set flags.
328 : : */
329 : : void
330 : 2639523 : pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
331 : : {
332 [ - + ]: 2639523 : Assert(ioh->state == PGAIO_HS_HANDED_OUT);
333 : :
334 : 2639523 : ioh->flags |= flag;
335 : 2639523 : }
336 : :
337 : : /*
338 : : * Returns an ID uniquely identifying the IO handle. This is only really
339 : : * useful for logging, as handles are reused across multiple IOs.
340 : : */
341 : : int
342 : 631613 : pgaio_io_get_id(PgAioHandle *ioh)
343 : : {
344 [ + - - + ]: 631613 : Assert(ioh >= pgaio_ctl->io_handles &&
345 : : ioh < (pgaio_ctl->io_handles + pgaio_ctl->io_handle_count));
346 : 631613 : return ioh - pgaio_ctl->io_handles;
347 : : }
348 : :
349 : : /*
350 : : * Return the ProcNumber for the process that can use an IO handle. The
351 : : * mapping from IO handles to PGPROCs is static, therefore this even works
352 : : * when the corresponding PGPROC is not in use.
353 : : */
354 : : ProcNumber
355 : 1829 : pgaio_io_get_owner(PgAioHandle *ioh)
356 : : {
357 : 1829 : return ioh->owner_procno;
358 : : }
359 : :
360 : : /*
361 : : * Return a wait reference for the IO. Only wait references can be used to
362 : : * wait for an IO's completion, as handles themselves can be reused after
363 : : * completion. See also the comment above pgaio_io_acquire().
364 : : */
365 : : void
366 : 2640999 : pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
367 : : {
368 [ + + - + - - ]: 2640999 : Assert(ioh->state == PGAIO_HS_HANDED_OUT ||
369 : : ioh->state == PGAIO_HS_DEFINED ||
370 : : ioh->state == PGAIO_HS_STAGED);
371 [ - + ]: 2640999 : Assert(ioh->generation != 0);
372 : :
373 : 2640999 : iow->aio_index = ioh - pgaio_ctl->io_handles;
374 : 2640999 : iow->generation_upper = (uint32) (ioh->generation >> 32);
375 : 2640999 : iow->generation_lower = (uint32) ioh->generation;
376 : 2640999 : }
377 : :
378 : :
379 : :
380 : : /* --------------------------------------------------------------------------------
381 : : * Internal Functions related to PgAioHandle
382 : : * --------------------------------------------------------------------------------
383 : : */
384 : :
385 : : static inline void
386 : 10362690 : pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
387 : : {
388 : : /*
389 : : * All callers need to have held interrupts in some form; otherwise
390 : : * interrupt processing could wait for the IO to complete while it is in an
391 : : * intermediate state.
392 : : */
212 393 [ + + - + - - ]: 10362690 : Assert(!INTERRUPTS_CAN_BE_PROCESSED());
394 : :
275 395 [ - + ]: 10362690 : pgaio_debug_io(DEBUG5, ioh,
396 : : "updating state to %s",
397 : : pgaio_io_state_get_name(new_state));
398 : :
399 : : /*
400 : : * Ensure the changes signified by the new state are visible before the
401 : : * new state becomes visible.
402 : : */
403 : 10362690 : pg_write_barrier();
404 : :
405 : 10362690 : ioh->state = new_state;
406 : 10362690 : }
407 : :
408 : : static void
7 heikki.linnakangas@i 409 : 1323543 : pgaio_io_resowner_register(PgAioHandle *ioh, struct ResourceOwnerData *resowner)
410 : : {
275 andres@anarazel.de 411 [ - + ]: 1323543 : Assert(!ioh->resowner);
7 heikki.linnakangas@i 412 [ - + ]: 1323543 : Assert(resowner);
413 : :
414 : 1323543 : ResourceOwnerRememberAioHandle(resowner, &ioh->resowner_node);
415 : 1323543 : ioh->resowner = resowner;
275 andres@anarazel.de 416 : 1323543 : }
417 : :
418 : : /*
419 : : * Stage IO for execution and, if appropriate, submit it immediately.
420 : : *
421 : : * Should only be called from pgaio_io_start_*().
422 : : */
423 : : void
424 : 1320492 : pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
425 : : {
426 : : bool needs_synchronous;
427 : :
428 [ - + ]: 1320492 : Assert(ioh->state == PGAIO_HS_HANDED_OUT);
429 [ - + ]: 1320492 : Assert(pgaio_my_backend->handed_out_io == ioh);
430 [ - + ]: 1320492 : Assert(pgaio_io_has_target(ioh));
431 : :
432 : : /*
433 : : * Otherwise an interrupt, in the middle of staging and possibly executing
434 : : * the IO, could end up trying to wait for the IO, leading to state
435 : : * confusion.
436 : : */
212 437 : 1320492 : HOLD_INTERRUPTS();
438 : :
275 439 : 1320492 : ioh->op = op;
440 : 1320492 : ioh->result = 0;
441 : :
442 : 1320492 : pgaio_io_update_state(ioh, PGAIO_HS_DEFINED);
443 : :
444 : : /* allow a new IO to be staged */
445 : 1320492 : pgaio_my_backend->handed_out_io = NULL;
446 : :
447 : 1320492 : pgaio_io_call_stage(ioh);
448 : :
449 : 1320492 : pgaio_io_update_state(ioh, PGAIO_HS_STAGED);
450 : :
451 : : /*
452 : : * Synchronous execution has to be executed, well, synchronously, so check
453 : : * that first.
454 : : */
455 : 1320492 : needs_synchronous = pgaio_io_needs_synchronous_execution(ioh);
456 : :
457 [ + + ]: 1320492 : pgaio_debug_io(DEBUG3, ioh,
458 : : "staged (synchronous: %d, in_batch: %d)",
459 : : needs_synchronous, pgaio_my_backend->in_batchmode);
460 : :
461 [ + + ]: 1320492 : if (!needs_synchronous)
462 : : {
463 : 599265 : pgaio_my_backend->staged_ios[pgaio_my_backend->num_staged_ios++] = ioh;
464 [ - + ]: 599265 : Assert(pgaio_my_backend->num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
465 : :
466 : : /*
467 : : * Unless code explicitly opted into batching IOs, submit the IO
468 : : * immediately.
469 : : */
470 [ + + ]: 599265 : if (!pgaio_my_backend->in_batchmode)
471 : 28873 : pgaio_submit_staged();
472 : : }
473 : : else
474 : : {
475 : 721227 : pgaio_io_prepare_submit(ioh);
476 : 721227 : pgaio_io_perform_synchronously(ioh);
477 : : }
478 : :
212 479 [ - + ]: 1320492 : RESUME_INTERRUPTS();
275 480 : 1320492 : }
481 : :
482 : : bool
483 : 1320492 : pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
484 : : {
485 : : /*
486 : : * If the caller said to execute the IO synchronously, do so.
487 : : *
488 : : * XXX: We could optimize the logic for when to execute synchronously by first
489 : : * checking if there are other IOs in flight and only synchronously
490 : : * executing if not. Unclear whether that'll be sufficiently common to be
491 : : * worth worrying about.
492 : : */
493 [ + + ]: 1320492 : if (ioh->flags & PGAIO_HF_SYNCHRONOUS)
494 : 716518 : return true;
495 : :
496 : : /* Check if the IO method requires synchronous execution of IO */
497 [ + + ]: 603974 : if (pgaio_method_ops->needs_synchronous_execution)
498 : 603581 : return pgaio_method_ops->needs_synchronous_execution(ioh);
499 : :
500 : 393 : return false;
501 : : }
502 : :
503 : : /*
504 : : * Handle IO being processed by IO method.
505 : : *
506 : : * Should be called by IO methods / synchronous IO execution, just before the
507 : : * IO is performed.
508 : : */
509 : : void
510 : 1320492 : pgaio_io_prepare_submit(PgAioHandle *ioh)
511 : : {
512 : 1320492 : pgaio_io_update_state(ioh, PGAIO_HS_SUBMITTED);
513 : :
514 : 1320492 : dclist_push_tail(&pgaio_my_backend->in_flight_ios, &ioh->node);
515 : 1320492 : }
516 : :
517 : : /*
518 : : * Handle IO getting completed by a method.
519 : : *
520 : : * Should be called by IO methods / synchronous IO execution, just after the
521 : : * IO has been performed.
522 : : *
523 : : * Expects to be called in a critical section. We expect IOs to be usable for
524 : : * WAL etc, which requires being able to execute completion callbacks in a
525 : : * critical section.
526 : : */
527 : : void
528 : 1216818 : pgaio_io_process_completion(PgAioHandle *ioh, int result)
529 : : {
530 [ - + ]: 1216818 : Assert(ioh->state == PGAIO_HS_SUBMITTED);
531 : :
532 [ - + ]: 1216818 : Assert(CritSectionCount > 0);
533 : :
534 : 1216818 : ioh->result = result;
535 : :
536 : 1216818 : pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_IO);
537 : :
538 : : INJECTION_POINT("aio-process-completion-before-shared", ioh);
539 : :
540 : 1216818 : pgaio_io_call_complete_shared(ioh);
541 : :
542 : 1216818 : pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_SHARED);
543 : :
544 : : /* condition variable broadcast ensures state is visible before wakeup */
545 : 1216818 : ConditionVariableBroadcast(&ioh->cv);
546 : :
547 : : /* contains call to pgaio_io_call_complete_local() */
548 [ + + ]: 1216818 : if (ioh->owner_procno == MyProcNumber)
549 : 721617 : pgaio_io_reclaim(ioh);
550 : 1216818 : }
551 : :
552 : : /*
553 : : * Has the IO completed and thus the IO handle been reused?
554 : : *
555 : : * This is useful when waiting for IO completion at a low level (e.g. in an IO
556 : : * method's ->wait_one() callback).
557 : : */
558 : : bool
559 : 3167261 : pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
560 : : {
561 : 3167261 : *state = ioh->state;
562 : :
563 : : /*
564 : : * Ensure that we don't see an earlier state of the handle than ioh->state
565 : : * due to compiler or CPU reordering. This protects both ->generation as
566 : : * directly used here, and other fields in the handle accessed in the
567 : : * caller if the handle was not reused.
568 : : */
569 : 3167261 : pg_read_barrier();
570 : :
571 : 3167261 : return ioh->generation != ref_generation;
572 : : }
573 : :
574 : : /*
575 : : * Wait for IO to complete. External code should never use this; outside of
576 : : * the AIO subsystem, waits are only allowed via pgaio_wref_wait().
577 : : */
578 : : static void
579 : 312080 : pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
580 : : {
581 : : PgAioHandleState state;
582 : : bool am_owner;
583 : :
584 : 312080 : am_owner = ioh->owner_procno == MyProcNumber;
585 : :
586 [ + + ]: 312080 : if (pgaio_io_was_recycled(ioh, ref_generation, &state))
587 : 35 : return;
588 : :
589 [ + + ]: 312045 : if (am_owner)
590 : : {
591 [ + + ]: 309514 : if (state != PGAIO_HS_SUBMITTED
592 [ + + ]: 15130 : && state != PGAIO_HS_COMPLETED_IO
593 [ + - ]: 85 : && state != PGAIO_HS_COMPLETED_SHARED
275 andres@anarazel.de 594 [ # # ]:UBC 0 : && state != PGAIO_HS_COMPLETED_LOCAL)
595 : : {
212 596 [ # # ]: 0 : elog(PANIC, "waiting for own IO %d in wrong state: %s",
597 : : pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh));
598 : : }
599 : : }
600 : :
601 : : while (true)
602 : : {
275 andres@anarazel.de 603 [ + + ]:CBC 623972 : if (pgaio_io_was_recycled(ioh, ref_generation, &state))
604 : 1750 : return;
605 : :
15 peter@eisentraut.org 606 [ - + + + - ]:GNC 622222 : switch (state)
607 : : {
275 andres@anarazel.de 608 :UBC 0 : case PGAIO_HS_IDLE:
609 : : case PGAIO_HS_HANDED_OUT:
610 [ # # ]: 0 : elog(ERROR, "IO in wrong state: %d", state);
611 : : break;
612 : :
275 andres@anarazel.de 613 :CBC 296128 : case PGAIO_HS_SUBMITTED:
614 : :
615 : : /*
616 : : * If we need to wait via the IO method, do so now. Don't
617 : : * check via the IO method if the issuing backend is executing
618 : : * the IO synchronously.
619 : : */
620 [ + + + - ]: 296128 : if (pgaio_method_ops->wait_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS))
621 : : {
622 : 383 : pgaio_method_ops->wait_one(ioh, ref_generation);
623 : 383 : continue;
624 : : }
625 : : /* fallthrough */
626 : :
627 : : /* waiting for owner to submit */
628 : : case PGAIO_HS_DEFINED:
629 : : case PGAIO_HS_STAGED:
630 : : /* waiting for reaper to complete */
631 : : /* fallthrough */
632 : : case PGAIO_HS_COMPLETED_IO:
633 : : /* shouldn't be able to hit this otherwise */
634 [ - + ]: 311545 : Assert(IsUnderPostmaster);
635 : : /* ensure we're going to get woken up */
636 : 311545 : ConditionVariablePrepareToSleep(&ioh->cv);
637 : :
638 [ + + ]: 622904 : while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
639 : : {
640 [ + + ]: 621547 : if (state == PGAIO_HS_COMPLETED_SHARED ||
641 [ + + ]: 311374 : state == PGAIO_HS_COMPLETED_LOCAL)
642 : : break;
643 : 311360 : ConditionVariableSleep(&ioh->cv, WAIT_EVENT_AIO_IO_COMPLETION);
644 : : }
645 : :
646 : 311544 : ConditionVariableCancelSleep();
647 : 311544 : break;
648 : :
649 : 310294 : case PGAIO_HS_COMPLETED_SHARED:
650 : : case PGAIO_HS_COMPLETED_LOCAL:
651 : :
652 : : /*
653 : : * Note that no interrupts are processed between
654 : : * pgaio_io_was_recycled() and this check - that's important
655 : : * as otherwise an interrupt could have already reclaimed the
656 : : * handle.
657 : : */
658 [ + + ]: 310294 : if (am_owner)
659 : 309133 : pgaio_io_reclaim(ioh);
660 : 310294 : return;
661 : : }
662 : : }
663 : : }
664 : :
665 : : /*
666 : : * Make IO handle ready to be reused after IO has completed or after the
667 : : * handle has been released without being used.
668 : : *
669 : : * Note that callers need to be careful about only calling this in the right
670 : : * state and that no interrupts can be processed between the state check and
671 : : * the call to pgaio_io_reclaim(). Otherwise interrupt processing could
672 : : * already have reclaimed the handle.
673 : : */
674 : : static void
675 : 1323543 : pgaio_io_reclaim(PgAioHandle *ioh)
676 : : {
677 : : /* This is only ok if it's our IO */
678 [ - + ]: 1323543 : Assert(ioh->owner_procno == MyProcNumber);
679 [ - + ]: 1323543 : Assert(ioh->state != PGAIO_HS_IDLE);
680 : :
681 : : /* see comment in function header */
212 682 : 1323543 : HOLD_INTERRUPTS();
683 : :
684 : : /*
685 : : * It's a bit ugly, but right now the easiest place to put the execution
686 : : * of local completion callbacks is this function, as we need to execute
687 : : * local callbacks just before reclaiming at multiple callsites.
688 : : */
275 689 [ + + ]: 1323543 : if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
690 : : {
691 : : PgAioResult local_result;
692 : :
266 693 : 1320492 : local_result = pgaio_io_call_complete_local(ioh);
275 694 : 1320492 : pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_LOCAL);
695 : :
266 696 [ + + ]: 1320492 : if (ioh->report_return)
697 : : {
698 : 1320468 : ioh->report_return->result = local_result;
699 : 1320468 : ioh->report_return->target_data = ioh->target_data;
700 : : }
701 : : }
702 : :
275 703 [ - + ]: 1323543 : pgaio_debug_io(DEBUG4, ioh,
704 : : "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
705 : : pgaio_result_status_string(ioh->distilled_result.status),
706 : : ioh->distilled_result.id,
707 : : ioh->distilled_result.error_data,
708 : : ioh->result);
709 : :
710 : : /* if the IO has been defined, it's on the in-flight list, remove */
711 [ + + ]: 1323543 : if (ioh->state != PGAIO_HS_HANDED_OUT)
712 : 1320492 : dclist_delete_from(&pgaio_my_backend->in_flight_ios, &ioh->node);
713 : :
714 [ + + ]: 1323543 : if (ioh->resowner)
715 : : {
716 : 1323477 : ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
717 : 1323477 : ioh->resowner = NULL;
718 : : }
719 : :
720 [ - + ]: 1323543 : Assert(!ioh->resowner);
721 : :
722 : : /*
723 : : * Update generation & state first, before resetting the IO's fields,
724 : : * otherwise a concurrent "viewer" could think the fields are valid, even
725 : : * though they are being reset. Increment the generation first, so that
726 : : * we can assert elsewhere that we never wait for an IDLE IO. While it's
727 : : * a bit weird for the state to go backwards for a generation, it's OK
728 : : * here, as there cannot be references to the "reborn" IO yet. Can't
729 : : * update both at once, so something has to give.
730 : : */
236 731 : 1323543 : ioh->generation++;
732 : 1323543 : pgaio_io_update_state(ioh, PGAIO_HS_IDLE);
733 : :
734 : : /* ensure the state update is visible before we reset fields */
735 : 1323543 : pg_write_barrier();
736 : :
275 737 : 1323543 : ioh->op = PGAIO_OP_INVALID;
738 : 1323543 : ioh->target = PGAIO_TID_INVALID;
739 : 1323543 : ioh->flags = 0;
740 : 1323543 : ioh->num_callbacks = 0;
741 : 1323543 : ioh->handle_data_len = 0;
742 : 1323543 : ioh->report_return = NULL;
743 : 1323543 : ioh->result = 0;
270 744 : 1323543 : ioh->distilled_result.status = PGAIO_RS_UNKNOWN;
745 : :
746 : : /*
747 : : * We push the IO to the head of the idle IO list, as that seems more cache
748 : : * efficient in cases where only a few IOs are used.
749 : : */
275 750 : 1323543 : dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node);
751 : :
212 752 [ - + ]: 1323543 : RESUME_INTERRUPTS();
275 753 : 1323543 : }
754 : :
755 : : /*
756 : : * Wait for an IO handle to become usable.
757 : : *
758 : : * This is only really useful for pgaio_io_acquire().
759 : : */
760 : : static void
761 : 3247 : pgaio_io_wait_for_free(void)
762 : : {
763 : 3247 : int reclaimed = 0;
764 : :
198 peter@eisentraut.org 765 [ + + ]: 3247 : pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs",
766 : : pgaio_my_backend->num_staged_ios,
767 : : dclist_count(&pgaio_my_backend->in_flight_ios),
768 : : dclist_count(&pgaio_my_backend->idle_ios));
769 : :
770 : : /*
771 : : * First check if any of our IOs actually have completed - when using
772 : : * worker, that'll often be the case. We could do so as part of the loop
773 : : * below, but that'd potentially lead us to wait for some IO submitted
774 : : * below, but that'd potentially lead us to wait for an IO that was
775 : : * submitted earlier.
275 andres@anarazel.de 776 [ + + ]: 6494 : for (int i = 0; i < io_max_concurrency; i++)
777 : : {
778 : 3247 : PgAioHandle *ioh = &pgaio_ctl->io_handles[pgaio_my_backend->io_handle_off + i];
779 : :
780 [ + + ]: 3247 : if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
781 : : {
782 : : /*
783 : : * Note that no interrupts are processed between the state check
784 : : * and the call to reclaim - that's important as otherwise an
785 : : * interrupt could have already reclaimed the handle.
786 : : *
787 : : * Need to ensure that there's no reordering; in the more common
788 : : * paths, where we wait for IO, that's done by
789 : : * pgaio_io_was_recycled().
790 : : */
184 791 : 2287 : pg_read_barrier();
275 792 : 2287 : pgaio_io_reclaim(ioh);
793 : 2287 : reclaimed++;
794 : : }
795 : : }
796 : :
797 [ + + ]: 3247 : if (reclaimed > 0)
798 : 2287 : return;
799 : :
800 : : /*
801 : : * If we have any unsubmitted IOs, submit them now. We'll start waiting in
802 : : * a moment, so it's better if they're in flight. This also addresses the
803 : : * edge-case that all IOs are unsubmitted.
804 : : */
805 [ - + ]: 960 : if (pgaio_my_backend->num_staged_ios > 0)
275 andres@anarazel.de 806 :UBC 0 : pgaio_submit_staged();
807 : :
808 : : /* possibly some IOs finished during submission */
212 andres@anarazel.de 809 [ - + ]:CBC 960 : if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
212 andres@anarazel.de 810 :UBC 0 : return;
811 : :
275 andres@anarazel.de 812 [ - + ]:CBC 960 : if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0)
236 andres@anarazel.de 813 [ # # ]:UBC 0 : ereport(ERROR,
814 : : errmsg_internal("no free IOs despite no in-flight IOs"),
815 : : errdetail_internal("%d pending, %u in-flight, %u idle IOs",
816 : : pgaio_my_backend->num_staged_ios,
817 : : dclist_count(&pgaio_my_backend->in_flight_ios),
818 : : dclist_count(&pgaio_my_backend->idle_ios)));
819 : :
820 : : /*
821 : : * Wait for the oldest in-flight IO to complete.
822 : : *
823 : : * XXX: Reusing the general IO wait is suboptimal; we don't need to wait
824 : : * for that specific IO to complete, we just need *any* IO to complete.
825 : : */
826 : : {
275 andres@anarazel.de 827 :CBC 960 : PgAioHandle *ioh = dclist_head_element(PgAioHandle, node,
828 : : &pgaio_my_backend->in_flight_ios);
212 829 : 960 : uint64 generation = ioh->generation;
830 : :
112 831 [ - + + - ]: 960 : switch ((PgAioHandleState) ioh->state)
832 : : {
833 : : /* should not be in in-flight list */
275 andres@anarazel.de 834 :UBC 0 : case PGAIO_HS_IDLE:
835 : : case PGAIO_HS_DEFINED:
836 : : case PGAIO_HS_HANDED_OUT:
837 : : case PGAIO_HS_STAGED:
838 : : case PGAIO_HS_COMPLETED_LOCAL:
839 [ # # ]: 0 : elog(ERROR, "shouldn't get here with io:%d in state %d",
840 : : pgaio_io_get_id(ioh), ioh->state);
841 : : break;
842 : :
275 andres@anarazel.de 843 :CBC 958 : case PGAIO_HS_COMPLETED_IO:
844 : : case PGAIO_HS_SUBMITTED:
845 [ + + ]: 958 : pgaio_debug_io(DEBUG2, ioh,
846 : : "waiting for free io with %u in flight",
847 : : dclist_count(&pgaio_my_backend->in_flight_ios));
848 : :
849 : : /*
850 : : * In a more general case this would be racy, because the
851 : : * generation could increase after we read ioh->state above.
852 : : * But we are only looking at IOs by the current backend and
853 : : * the IO can only be recycled by this backend. Even this is
854 : : * only OK because we get the handle's generation before
855 : : * potentially processing interrupts, e.g. as part of
856 : : * pgaio_debug_io().
857 : : */
212 858 : 958 : pgaio_io_wait(ioh, generation);
275 859 : 958 : break;
860 : :
861 : 2 : case PGAIO_HS_COMPLETED_SHARED:
862 : :
863 : : /*
864 : : * It's possible that another backend just finished this IO.
865 : : *
866 : : * Note that no interrupts are processed between the state
867 : : * check and the call to reclaim - that's important as
868 : : * otherwise an interrupt could have already reclaimed the
869 : : * handle.
870 : : *
871 : : * Need to ensure that there's no reordering; in the more
872 : : * common paths, where we wait for IO, that's done by
873 : : * pgaio_io_was_recycled().
874 : : */
184 875 : 2 : pg_read_barrier();
275 876 : 2 : pgaio_io_reclaim(ioh);
877 : 2 : break;
878 : : }
879 : :
880 [ - + ]: 960 : if (dclist_count(&pgaio_my_backend->idle_ios) == 0)
275 andres@anarazel.de 881 [ # # ]:UBC 0 : elog(PANIC, "no idle IO after waiting for IO to terminate");
275 andres@anarazel.de 882 :CBC 960 : return;
883 : : }
884 : : }
885 : :
886 : : /*
887 : : * Internal - code outside of AIO should never need this and it'd be hard for
888 : : * such code to be safe.
889 : : */
890 : : static PgAioHandle *
891 : 1918639 : pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
892 : : {
893 : : PgAioHandle *ioh;
894 : :
895 [ - + ]: 1918639 : Assert(iow->aio_index < pgaio_ctl->io_handle_count);
896 : :
897 : 1918639 : ioh = &pgaio_ctl->io_handles[iow->aio_index];
898 : :
899 : 1918639 : *ref_generation = ((uint64) iow->generation_upper) << 32 |
900 : 1918639 : iow->generation_lower;
901 : :
902 [ - + ]: 1918639 : Assert(*ref_generation != 0);
903 : :
904 : 1918639 : return ioh;
905 : : }
906 : :
907 : : static const char *
908 : 11023 : pgaio_io_state_get_name(PgAioHandleState s)
909 : : {
910 : : #define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
15 peter@eisentraut.org 911 [ + + + + + + + - - ]:GNC 11023 : switch (s)
912 : : {
275 andres@anarazel.de 913 :CBC 331 : PGAIO_HS_TOSTR_CASE(IDLE);
914 : 3426 : PGAIO_HS_TOSTR_CASE(HANDED_OUT);
915 : 1713 : PGAIO_HS_TOSTR_CASE(DEFINED);
916 : 1713 : PGAIO_HS_TOSTR_CASE(STAGED);
917 : 399 : PGAIO_HS_TOSTR_CASE(SUBMITTED);
918 : 1713 : PGAIO_HS_TOSTR_CASE(COMPLETED_IO);
919 : 1728 : PGAIO_HS_TOSTR_CASE(COMPLETED_SHARED);
275 andres@anarazel.de 920 :UBC 0 : PGAIO_HS_TOSTR_CASE(COMPLETED_LOCAL);
921 : : }
922 : : #undef PGAIO_HS_TOSTR_CASE
923 : :
924 : 0 : return NULL; /* silence compiler */
925 : : }
926 : :
927 : : const char *
275 andres@anarazel.de 928 :CBC 11023 : pgaio_io_get_state_name(PgAioHandle *ioh)
929 : : {
930 : 11023 : return pgaio_io_state_get_name(ioh->state);
931 : : }
932 : :
933 : : const char *
934 : 3426 : pgaio_result_status_string(PgAioResultStatus rs)
935 : : {
15 peter@eisentraut.org 936 [ - + + - + - ]:GNC 3426 : switch (rs)
937 : : {
270 andres@anarazel.de 938 :UBC 0 : case PGAIO_RS_UNKNOWN:
275 939 : 0 : return "UNKNOWN";
270 andres@anarazel.de 940 :CBC 3219 : case PGAIO_RS_OK:
275 941 : 3219 : return "OK";
262 942 : 102 : case PGAIO_RS_WARNING:
943 : 102 : return "WARNING";
270 andres@anarazel.de 944 :UBC 0 : case PGAIO_RS_PARTIAL:
275 945 : 0 : return "PARTIAL";
270 andres@anarazel.de 946 :CBC 105 : case PGAIO_RS_ERROR:
275 947 : 105 : return "ERROR";
948 : : }
949 : :
275 andres@anarazel.de 950 :UBC 0 : return NULL; /* silence compiler */
951 : : }
952 : :
953 : :
954 : :
955 : : /* --------------------------------------------------------------------------------
956 : : * Functions primarily related to IO Wait References
957 : : * --------------------------------------------------------------------------------
958 : : */
959 : :
960 : : /*
961 : : * Mark a wait reference as invalid
962 : : */
963 : : void
275 andres@anarazel.de 964 :CBC 12901046 : pgaio_wref_clear(PgAioWaitRef *iow)
965 : : {
966 : 12901046 : iow->aio_index = PG_UINT32_MAX;
967 : 12901046 : }
968 : :
969 : : /* Is the wait reference valid? */
970 : : bool
971 : 4017522 : pgaio_wref_valid(PgAioWaitRef *iow)
972 : : {
973 : 4017522 : return iow->aio_index != PG_UINT32_MAX;
974 : : }
975 : :
976 : : /*
977 : : * Similar to pgaio_io_get_id(), just for wait references.
978 : : */
979 : : int
275 andres@anarazel.de 980 :UBC 0 : pgaio_wref_get_id(PgAioWaitRef *iow)
981 : : {
982 [ # # ]: 0 : Assert(pgaio_wref_valid(iow));
983 : 0 : return iow->aio_index;
984 : : }
985 : :
986 : : /*
987 : : * Wait for the IO to have completed. Can be called in any process, not just
988 : : * in the issuing backend.
989 : : */
990 : : void
275 andres@anarazel.de 991 :CBC 311100 : pgaio_wref_wait(PgAioWaitRef *iow)
992 : : {
993 : : uint64 ref_generation;
994 : : PgAioHandle *ioh;
995 : :
996 : 311100 : ioh = pgaio_io_from_wref(iow, &ref_generation);
997 : :
998 : 311100 : pgaio_io_wait(ioh, ref_generation);
999 : 311099 : }
1000 : :
1001 : : /*
1002 : : * Check if the referenced IO completed, without blocking.
1003 : : */
1004 : : bool
1005 : 1607539 : pgaio_wref_check_done(PgAioWaitRef *iow)
1006 : : {
1007 : : uint64 ref_generation;
1008 : : PgAioHandleState state;
1009 : : bool am_owner;
1010 : : PgAioHandle *ioh;
1011 : :
1012 : 1607539 : ioh = pgaio_io_from_wref(iow, &ref_generation);
1013 : :
1014 [ + + ]: 1607539 : if (pgaio_io_was_recycled(ioh, ref_generation, &state))
1015 : 1011593 : return true;
1016 : :
1017 [ - + ]: 595946 : if (state == PGAIO_HS_IDLE)
275 andres@anarazel.de 1018 :UBC 0 : return true;
1019 : :
275 andres@anarazel.de 1020 :CBC 595946 : am_owner = ioh->owner_procno == MyProcNumber;
1021 : :
1022 [ + + ]: 595946 : if (state == PGAIO_HS_COMPLETED_SHARED ||
1023 [ - + ]: 308493 : state == PGAIO_HS_COMPLETED_LOCAL)
1024 : : {
1025 : : /*
1026 : : * Note that no interrupts are processed between
1027 : : * pgaio_io_was_recycled() and this check - that's important as
1028 : : * otherwise an interrupt could have already reclaimed the handle.
1029 : : */
1030 [ + - ]: 287453 : if (am_owner)
1031 : 287453 : pgaio_io_reclaim(ioh);
1032 : 287453 : return true;
1033 : : }
1034 : :
1035 : : /*
1036 : : * XXX: It likely would be worth checking in with the io method, to give
1037 : : * the IO method a chance to check if there are completion events queued.
1038 : : */
1039 : :
1040 : 308493 : return false;
1041 : : }
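/*
 * Illustrative sketch only (hypothetical, never-compiled function): how
 * caller-side code that keeps a PgAioWaitRef in its own state typically polls
 * for, and if necessary waits for, the referenced IO.
 */
#ifdef PGAIO_USAGE_EXAMPLE
static void
pgaio_wref_usage_example(PgAioWaitRef *iow)
{
	/* a cleared wref means no IO was started (or it was already forgotten) */
	if (!pgaio_wref_valid(iow))
		return;

	/* cheap, non-blocking check first */
	if (!pgaio_wref_check_done(iow))
	{
		/* still in flight, block until it completes */
		pgaio_wref_wait(iow);
	}

	/* done with this IO, invalidate the reference for later checks */
	pgaio_wref_clear(iow);
}
#endif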
1042 : :
1043 : :
1044 : :
1045 : : /* --------------------------------------------------------------------------------
1046 : : * Actions on multiple IOs.
1047 : : * --------------------------------------------------------------------------------
1048 : : */
1049 : :
1050 : : /*
1051 : : * Submit IOs in batches going forward.
1052 : : *
1053 : : * Submitting multiple IOs at once can be substantially faster than doing so
1054 : : * one-by-one. At the same time, submitting multiple IOs at once requires more
1055 : : * care to avoid deadlocks.
1056 : : *
1057 : : * Consider backend A staging an IO for buffer 1 and then trying to start IO
1058 : : * on buffer 2, while backend B does the inverse. If A submitted the IO before
1059 : : * moving on to buffer 2, this works just fine, B will wait for the IO to
1060 : : * moving on to buffer 2, this works just fine: B will wait for the IO to
1061 : : * complete. But with batching, each backend would wait for IO that has not
1062 : : * yet been submitted to complete, i.e. forever.
1063 : : * End batch submission mode with pgaio_exit_batchmode(). (Throwing errors is
1064 : : * allowed; error recovery will end the batch.)
1065 : : *
1066 : : * To avoid deadlocks, code needs to ensure that it will not wait for another
1067 : : * backend while there is unsubmitted IO. E.g. by using conditional lock
1068 : : * acquisition when acquiring buffer locks. To check if there currently are
1069 : : * staged IOs, call pgaio_have_staged(); to submit all staged IOs, call
1070 : : * pgaio_submit_staged().
1071 : : *
1072 : : * It is not allowed to enter batchmode while already in batchmode; it's
1073 : : * unlikely to ever be needed, as code needs to be explicitly aware of being
1074 : : * called in batchmode, to avoid the deadlock risks explained above.
1075 : : *
1076 : : * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
1077 : : * e.g. because too many IOs have been staged or because pgaio_submit_staged()
1078 : : * was called.
1079 : : */
1080 : : void
1081 : 2573128 : pgaio_enter_batchmode(void)
1082 : : {
1083 [ - + ]: 2573128 : if (pgaio_my_backend->in_batchmode)
275 andres@anarazel.de 1084 [ # # ]:UBC 0 : elog(ERROR, "starting batch while batch already in progress");
275 andres@anarazel.de 1085 :CBC 2573128 : pgaio_my_backend->in_batchmode = true;
1086 : 2573128 : }
1087 : :
1088 : : /*
1089 : : * Stop submitting IOs in batches.
1090 : : */
1091 : : void
1092 : 2573116 : pgaio_exit_batchmode(void)
1093 : : {
1094 [ - + ]: 2573116 : Assert(pgaio_my_backend->in_batchmode);
1095 : :
1096 : 2573116 : pgaio_submit_staged();
1097 : 2573116 : pgaio_my_backend->in_batchmode = false;
1098 : 2573116 : }
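/*
 * Illustrative sketch only (hypothetical, never-compiled function): the basic
 * batch submission pattern.  The staging itself is done by lower-level code
 * (e.g. by starting reads on several buffers) and is only hinted at here.
 */
#ifdef PGAIO_USAGE_EXAMPLE
static void
pgaio_batchmode_usage_example(void)
{
	pgaio_enter_batchmode();

	/*
	 * Stage several IOs here.  Before doing anything that could block on
	 * another backend (e.g. a non-conditional lock acquisition), staged IOs
	 * have to be submitted to avoid the deadlock described above.
	 */
	if (pgaio_have_staged())
		pgaio_submit_staged();

	pgaio_exit_batchmode();
}
#endif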
1099 : :
1100 : : /*
1101 : : * Are there staged but unsubmitted IOs?
1102 : : *
1103 : : * See comment above pgaio_enter_batchmode() for why code may need to check if
1104 : : * there is IO in that state.
1105 : : */
1106 : : bool
1107 : 1323396 : pgaio_have_staged(void)
1108 : : {
1109 [ + + - + ]: 1323396 : Assert(pgaio_my_backend->in_batchmode ||
1110 : : pgaio_my_backend->num_staged_ios == 0);
1111 : 1323396 : return pgaio_my_backend->num_staged_ios > 0;
1112 : : }
1113 : :
1114 : : /*
1115 : : * Submit all staged but not yet submitted IOs.
1116 : : *
1117 : : * Unless in batch mode, this never needs to be called, as IOs get submitted
1118 : : * as soon as possible. While in batchmode, pgaio_submit_staged() can be called
1119 : : * before waiting on another backend, to avoid the risk of deadlocks. See
1120 : : * pgaio_enter_batchmode().
1121 : : */
1122 : : void
1123 : 2605252 : pgaio_submit_staged(void)
1124 : : {
1125 : 2605252 : int total_submitted = 0;
1126 : : int did_submit;
1127 : :
1128 [ + + ]: 2605252 : if (pgaio_my_backend->num_staged_ios == 0)
1129 : 2006573 : return;
1130 : :
1131 : :
1132 : 598679 : START_CRIT_SECTION();
1133 : :
1134 : 598679 : did_submit = pgaio_method_ops->submit(pgaio_my_backend->num_staged_ios,
1135 : 598679 : pgaio_my_backend->staged_ios);
1136 : :
1137 [ - + ]: 598679 : END_CRIT_SECTION();
1138 : :
1139 : 598679 : total_submitted += did_submit;
1140 : :
1141 [ - + ]: 598679 : Assert(total_submitted == did_submit);
1142 : :
1143 : 598679 : pgaio_my_backend->num_staged_ios = 0;
1144 : :
1145 [ - + ]: 598679 : pgaio_debug(DEBUG4,
1146 : : "aio: submitted %d IOs",
1147 : : total_submitted);
1148 : : }
1149 : :
1150 : :
1151 : :
1152 : : /* --------------------------------------------------------------------------------
1153 : : * Other
1154 : : * --------------------------------------------------------------------------------
1155 : : */
1156 : :
1157 : :
1158 : : /*
1159 : : * Perform AIO related cleanup after an error.
1160 : : *
1161 : : * This should be called early in the error recovery paths, as later steps may
1162 : : * need to issue AIO (e.g. to record a transaction abort WAL record).
1163 : : */
1164 : : void
1165 : 30806 : pgaio_error_cleanup(void)
1166 : : {
1167 : : /*
1168 : : * It is possible that code errored out after pgaio_enter_batchmode() but
1169 : : * before pgaio_exit_batchmode() was called. In that case we need to
1170 : : * submit the IO now.
1171 : : */
1172 [ + + ]: 30806 : if (pgaio_my_backend->in_batchmode)
1173 : : {
1174 : 12 : pgaio_my_backend->in_batchmode = false;
1175 : :
1176 : 12 : pgaio_submit_staged();
1177 : : }
1178 : :
1179 : : /*
1180 : : * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1181 : : */
1182 [ - + ]: 30806 : Assert(pgaio_my_backend->num_staged_ios == 0);
1183 : 30806 : }
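/*
 * Illustrative sketch only (hypothetical, never-compiled function): where a
 * long-lived loop with its own sigsetjmp() based error recovery would place
 * the call, before recovery steps that may themselves need to issue AIO.
 */
#ifdef PGAIO_USAGE_EXAMPLE
static void
pgaio_error_cleanup_usage_example(void)
{
	sigjmp_buf	local_sigjmp_buf;

	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* error recovery: get AIO into a sane state first ... */
		pgaio_error_cleanup();

		/* ... then perform the remaining recovery steps */
	}

	/* arrange for elog(ERROR) to jump to the block above */
	PG_exception_stack = &local_sigjmp_buf;

	/* normal processing, which may throw errors, goes here */
}
#endif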
1184 : :
1185 : : /*
1186 : : * Perform AIO related checks at (sub-)transactional boundaries.
1187 : : *
1188 : : * This should be called late during (sub-)transactional commit/abort, after
1189 : : * all steps that might need to perform AIO, so that we can verify that the
1190 : : * AIO subsystem is in a valid state at the end of a transaction.
1191 : : */
1192 : : void
1193 : 354512 : AtEOXact_Aio(bool is_commit)
1194 : : {
1195 : : /*
1196 : : * We should never be in batch mode at transactional boundaries. In case
1197 : : * an error was thrown while in batch mode, pgaio_error_cleanup() should
1198 : : * have exited batchmode.
1199 : : *
1200 : : * In case we are in batchmode somehow, make sure to submit all staged
1201 : : * IOs; other backends may need them to complete to continue.
1202 : : */
1203 [ + + ]: 354512 : if (pgaio_my_backend->in_batchmode)
1204 : : {
1205 : 6 : pgaio_error_cleanup();
1206 [ + - ]: 6 : elog(WARNING, "open AIO batch at end of (sub-)transaction");
1207 : : }
1208 : :
1209 : : /*
1210 : : * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1211 : : */
1212 [ - + ]: 354512 : Assert(pgaio_my_backend->num_staged_ios == 0);
1213 : 354512 : }
1214 : :
1215 : : /*
1216 : : * Need to submit staged but not yet submitted IOs using the fd; otherwise
1217 : : * the IO would end up targeting something bogus.
1218 : : */
1219 : : void
1220 : 939068 : pgaio_closing_fd(int fd)
1221 : : {
1222 : : /*
1223 : : * Might be called before AIO is initialized or in a subprocess that
1224 : : * doesn't use AIO.
1225 : : */
1226 [ + + ]: 939068 : if (!pgaio_my_backend)
1227 : 8406 : return;
1228 : :
1229 : : /*
1230 : : * For now just submit all staged IOs - we could be more selective, but
1231 : : * it's probably not worth it.
1232 : : */
236 1233 [ + + ]: 930662 : if (pgaio_my_backend->num_staged_ios > 0)
1234 : : {
1235 [ + - ]: 4 : pgaio_debug(DEBUG2,
1236 : : "submitting %d IOs before FD %d gets closed",
1237 : : pgaio_my_backend->num_staged_ios, fd);
1238 : 4 : pgaio_submit_staged();
1239 : : }
1240 : :
1241 : : /*
1242 : : * If requested by the IO method, wait for all IOs that use the
1243 : : * to-be-closed FD.
1244 : : */
274 1245 [ + + ]: 930662 : if (pgaio_method_ops->wait_on_fd_before_close)
1246 : : {
1247 : : /*
1248 : : * As waiting for one IO to complete may complete multiple IOs, we
1249 : : * can't just use a mutable list iterator. The maximum number of
1250 : : * in-flight IOs is fairly small, so just restart the loop after
1251 : : * waiting for an IO.
1252 : : */
1253 [ + + ]: 5374 : while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
1254 : : {
1255 : : dlist_iter iter;
1256 : 9 : PgAioHandle *ioh = NULL;
1257 : : uint64 generation;
1258 : :
1259 [ + - + - ]: 9 : dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
1260 : : {
1261 : 9 : ioh = dclist_container(PgAioHandle, node, iter.cur);
1262 : :
212 1263 : 9 : generation = ioh->generation;
1264 : :
274 1265 [ + - ]: 9 : if (pgaio_io_uses_fd(ioh, fd))
1266 : 9 : break;
1267 : : else
274 andres@anarazel.de 1268 :UBC 0 : ioh = NULL;
1269 : : }
1270 : :
274 andres@anarazel.de 1271 [ - + ]:CBC 9 : if (!ioh)
274 andres@anarazel.de 1272 :UBC 0 : break;
1273 : :
236 andres@anarazel.de 1274 [ + - ]:CBC 9 : pgaio_debug_io(DEBUG2, ioh,
1275 : : "waiting for IO before FD %d gets closed, %u in-flight IOs",
1276 : : fd, dclist_count(&pgaio_my_backend->in_flight_ios));
1277 : :
1278 : : /* see comment in pgaio_io_wait_for_free() about raciness */
212 1279 : 9 : pgaio_io_wait(ioh, generation);
1280 : : }
1281 : : }
1282 : : }
1283 : :
1284 : : /*
1285 : : * Registered as before_shmem_exit() callback in pgaio_init_backend()
1286 : : */
1287 : : void
275 1288 : 17982 : pgaio_shutdown(int code, Datum arg)
1289 : : {
1290 [ - + ]: 17982 : Assert(pgaio_my_backend);
1291 [ - + ]: 17982 : Assert(!pgaio_my_backend->handed_out_io);
1292 : :
1293 : : /* first clean up resources as we would at a transaction boundary */
1294 : 17982 : AtEOXact_Aio(code == 0);
1295 : :
1296 : : /*
1297 : : * Before exiting, make sure that all IOs are finished. That has two main
1298 : : * purposes:
1299 : : *
1300 : : * - Some kernel-level AIO mechanisms don't deal well with the issuer of
1301 : : * an AIO exiting before IO completed
1302 : : *
1303 : : * - It'd be confusing to see partially finished IOs in stats views etc
1304 : : */
1305 [ + + ]: 17995 : while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
1306 : : {
1307 : 13 : PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios);
212 1308 : 13 : uint64 generation = ioh->generation;
1309 : :
236 1310 [ + + ]: 13 : pgaio_debug_io(DEBUG2, ioh,
1311 : : "waiting for IO to complete during shutdown, %u in-flight IOs",
1312 : : dclist_count(&pgaio_my_backend->in_flight_ios));
1313 : :
1314 : : /* see comment in pgaio_io_wait_for_free() about raciness */
212 1315 : 13 : pgaio_io_wait(ioh, generation);
1316 : : }
1317 : :
275 1318 : 17982 : pgaio_my_backend = NULL;
1319 : 17982 : }
1320 : :
1321 : : void
1322 : 1123 : assign_io_method(int newval, void *extra)
1323 : : {
43 1324 [ - + ]: 1123 : Assert(newval < lengthof(pgaio_method_ops_table));
275 1325 [ - + ]: 1123 : Assert(pgaio_method_ops_table[newval] != NULL);
1326 : :
1327 : 1123 : pgaio_method_ops = pgaio_method_ops_table[newval];
1328 : 1123 : }
1329 : :
1330 : : bool
1331 : 2175 : check_io_max_concurrency(int *newval, void **extra, GucSource source)
1332 : : {
1333 [ + + ]: 2175 : if (*newval == -1)
1334 : : {
1335 : : /*
1336 : : * Auto-tuning will be applied later during startup, as auto-tuning
1337 : : * depends on the value of various GUCs.
1338 : : */
1339 : 1107 : return true;
1340 : : }
1341 [ - + ]: 1068 : else if (*newval == 0)
1342 : : {
275 andres@anarazel.de 1343 :UBC 0 : GUC_check_errdetail("Only -1 or values bigger than 0 are valid.");
1344 : 0 : return false;
1345 : : }
1346 : :
275 andres@anarazel.de 1347 :CBC 1068 : return true;
1348 : : }
|