LCOV - 380a8b2ea024c33a35e7abc8628e7c4f52f9f9f9 vs db5ed03217b9c238703df8b4b286115d6e940488

LCOV - differential code coverage report

Current view:	top level - src/backend/storage/smgr - md.c (source / functions)		Coverage	Total	Hit	UNC	UBC	GNC	CBC	DUB	DCB
Current:	380a8b2ea024c33a35e7abc8628e7c4f52f9f9f9 vs db5ed03217b9c238703df8b4b286115d6e940488	Lines:	75.7 %	534	404	2	128	15	389	2	15
Current Date:	2026-05-29 21:51:00 -0400	Functions:	94.9 %	39	37	1	1	9	28
Baseline:	lcov-20260530-034037-baseline	Branches:	50.0 %	398	199	7	192	7	192	7	7
Baseline Date:	2026-05-29 14:39:03 -0700	Line coverage date bins:
Legend:	Lines: hit not hit Branches: + taken - not taken # not executed	(30,360] days:	88.2 %	17	15	2		15
		(360..) days:	75.2 %	517	389		128		389
		Function coverage date bins:
		(360..) days:	94.9 %	39	37	1	1	9	28
		Branch coverage date bins:
		(30,360] days:	50.0 %	14	7	7		7
		(360..) days:	50.0 %	384	192		192		192

 Age         Owner                    Branch data    TLA  Line data    Source code

                                  1                 :                : /*-------------------------------------------------------------------------
                                  2                 :                :  *
                                  3                 :                :  * md.c
                                  4                 :                :  *    This code manages relations that reside on magnetic disk.
                                  5                 :                :  *
                                  6                 :                :  * Or at least, that was what the Berkeley folk had in mind when they named
                                  7                 :                :  * this file.  In reality, what this code provides is an interface from
                                  8                 :                :  * the smgr API to Unix-like filesystem APIs, so it will work with any type
                                  9                 :                :  * of device for which the operating system provides filesystem support.
                                 10                 :                :  * It doesn't matter whether the bits are on spinning rust or some other
                                 11                 :                :  * storage technology.
                                 12                 :                :  *
                                 13                 :                :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
                                 14                 :                :  * Portions Copyright (c) 1994, Regents of the University of California
                                 15                 :                :  *
                                 16                 :                :  *
                                 17                 :                :  * IDENTIFICATION
                                 18                 :                :  *    src/backend/storage/smgr/md.c
                                 19                 :                :  *
                                 20                 :                :  *-------------------------------------------------------------------------
                                 21                 :                :  */
                                 22                 :                : #include "postgres.h"
                                 23                 :                : 
                                 24                 :                : #include <limits.h>
                                 25                 :                : #include <unistd.h>
                                 26                 :                : #include <fcntl.h>
                                 27                 :                : #include <sys/file.h>
                                 28                 :                : 
                                 29                 :                : #include "access/xlogutils.h"
                                 30                 :                : #include "commands/tablespace.h"
                                 31                 :                : #include "common/file_utils.h"
                                 32                 :                : #include "miscadmin.h"
                                 33                 :                : #include "pg_trace.h"
                                 34                 :                : #include "pgstat.h"
                                 35                 :                : #include "storage/aio.h"
                                 36                 :                : #include "storage/bufmgr.h"
                                 37                 :                : #include "storage/fd.h"
                                 38                 :                : #include "storage/md.h"
                                 39                 :                : #include "storage/relfilelocator.h"
                                 40                 :                : #include "storage/smgr.h"
                                 41                 :                : #include "storage/sync.h"
                                 42                 :                : #include "utils/memutils.h"
                                 43                 :                : #include "utils/wait_event.h"
                                 44                 :                : 
                                 45                 :                : /*
                                 46                 :                :  * The magnetic disk storage manager keeps track of open file
                                 47                 :                :  * descriptors in its own descriptor pool.  This is done to make it
                                 48                 :                :  * easier to support relations that are larger than the operating
                                 49                 :                :  * system's file size limit (often 2GBytes).  In order to do that,
                                 50                 :                :  * we break relations up into "segment" files that are each shorter than
                                 51                 :                :  * the OS file size limit.  The segment size is set by the RELSEG_SIZE
                                 52                 :                :  * configuration constant in pg_config.h.
                                 53                 :                :  *
                                 54                 :                :  * On disk, a relation must consist of consecutively numbered segment
                                 55                 :                :  * files in the pattern
                                 56                 :                :  *  -- Zero or more full segments of exactly RELSEG_SIZE blocks each
                                 57                 :                :  *  -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
                                 58                 :                :  *  -- Optionally, any number of inactive segments of size 0 blocks.
                                 59                 :                :  * The full and partial segments are collectively the "active" segments.
                                 60                 :                :  * Inactive segments are those that once contained data but are currently
                                 61                 :                :  * not needed because of an mdtruncate() operation.  The reason for leaving
                                 62                 :                :  * them present at size zero, rather than unlinking them, is that other
                                 63                 :                :  * backends and/or the checkpointer might be holding open file references to
                                 64                 :                :  * such segments.  If the relation expands again after mdtruncate(), such
                                 65                 :                :  * that a deactivated segment becomes active again, it is important that
                                 66                 :                :  * such file references still be valid --- else data might get written
                                 67                 :                :  * out to an unlinked old copy of a segment file that will eventually
                                 68                 :                :  * disappear.
                                 69                 :                :  *
                                 70                 :                :  * RELSEG_SIZE must fit into BlockNumber; but since we expose its value
                                 71                 :                :  * as an integer GUC, it actually needs to fit in signed int.  It's worth
                                 72                 :                :  * having a cross-check for this since configure's --with-segsize options
                                 73                 :                :  * could let people select insane values.
                                 74                 :                :  */
                                 75                 :                : StaticAssertDecl(RELSEG_SIZE > 0 && RELSEG_SIZE <= INT_MAX,
                                 76                 :                :                  "RELSEG_SIZE must fit in an integer");
                                 77                 :                : 
                                 78                 :                : /*
                                 79                 :                :  * File descriptors are stored in the per-fork md_seg_fds arrays inside
                                 80                 :                :  * SMgrRelation. The length of these arrays is stored in md_num_open_segs.
                                 81                 :                :  * Note that a fork's md_num_open_segs having a specific value does not
                                 82                 :                :  * necessarily mean the relation doesn't have additional segments; we may
                                 83                 :                :  * just not have opened the next segment yet.  (We could not have "all
                                 84                 :                :  * segments are in the array" as an invariant anyway, since another backend
                                 85                 :                :  * could extend the relation while we aren't looking.)  We do not have
                                 86                 :                :  * entries for inactive segments, however; as soon as we find a partial
                                 87                 :                :  * segment, we assume that any subsequent segments are inactive.
                                 88                 :                :  *
                                 89                 :                :  * The entire MdfdVec array is palloc'd in the MdCxt memory context.
                                 90                 :                :  */
                                 91                 :                : 
                                 92                 :                : typedef struct _MdfdVec
                                 93                 :                : {
                                 94                 :                :     File        mdfd_vfd;       /* fd number in fd.c's pool */
                                 95                 :                :     BlockNumber mdfd_segno;     /* segment number, from 0 */
                                 96                 :                : } MdfdVec;
                                 97                 :                : 
                                 98                 :                : static MemoryContext MdCxt;     /* context for all MdfdVec objects */
                                 99                 :                : 
                                100                 :                : 
                                101                 :                : /* Populate a file tag describing an md.c segment file. */
                                102                 :                : #define INIT_MD_FILETAG(a,xx_rlocator,xx_forknum,xx_segno) \
                                103                 :                : ( \
                                104                 :                :     memset(&(a), 0, sizeof(FileTag)), \
                                105                 :                :     (a).handler = SYNC_HANDLER_MD, \
                                106                 :                :     (a).rlocator = (xx_rlocator), \
                                107                 :                :     (a).forknum = (xx_forknum), \
                                108                 :                :     (a).segno = (xx_segno) \
                                109                 :                : )
                                110                 :                : 
                                111                 :                : 
                                112                 :                : /*** behavior for mdopen & _mdfd_getseg ***/
                                113                 :                : /* ereport if segment not present */
                                114                 :                : #define EXTENSION_FAIL              (1 << 0)
                                115                 :                : /* return NULL if segment not present */
                                116                 :                : #define EXTENSION_RETURN_NULL       (1 << 1)
                                117                 :                : /* create new segments as needed */
                                118                 :                : #define EXTENSION_CREATE            (1 << 2)
                                119                 :                : /* create new segments if needed during recovery */
                                120                 :                : #define EXTENSION_CREATE_RECOVERY   (1 << 3)
                                121                 :                : /* don't try to open a segment, if not already open */
                                122                 :                : #define EXTENSION_DONT_OPEN         (1 << 5)
                                123                 :                : 
                                124                 :                : 
                                125                 :                : /*
                                126                 :                :  * Fixed-length string to represent paths to files that need to be built by
                                127                 :                :  * md.c.
                                128                 :                :  *
                                129                 :                :  * The maximum number of segments is MaxBlockNumber / RELSEG_SIZE, where
                                130                 :                :  * RELSEG_SIZE can be set to 1 (for testing only).
                                131                 :                :  */
                                132                 :                : #define SEGMENT_CHARS   OIDCHARS
                                133                 :                : #define MD_PATH_STR_MAXLEN \
                                134                 :                :     (\
                                135                 :                :         REL_PATH_STR_MAXLEN \
                                136                 :                :         + sizeof((char)'.') \
                                137                 :                :         + SEGMENT_CHARS \
                                138                 :                :     )
                                139                 :                : typedef struct MdPathStr
                                140                 :                : {
                                141                 :                :     char        str[MD_PATH_STR_MAXLEN + 1];
                                142                 :                : } MdPathStr;
                                143                 :                : 
                                144                 :                : 
                                145                 :                : /* local routines */
                                146                 :                : static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum,
                                147                 :                :                          bool isRedo);
                                148                 :                : static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior);
                                149                 :                : static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
                                150                 :                :                                    MdfdVec *seg);
                                151                 :                : static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
                                152                 :                :                                     BlockNumber segno);
                                153                 :                : static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
                                154                 :                :                                     BlockNumber segno);
                                155                 :                : static void _fdvec_resize(SMgrRelation reln,
                                156                 :                :                           ForkNumber forknum,
                                157                 :                :                           int nseg);
                                158                 :                : static MdPathStr _mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
                                159                 :                :                                BlockNumber segno);
                                160                 :                : static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum,
                                161                 :                :                               BlockNumber segno, int oflags);
                                162                 :                : static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
                                163                 :                :                              BlockNumber blkno, bool skipFsync, int behavior);
                                164                 :                : static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
                                165                 :                :                               MdfdVec *seg);
                                166                 :                : 
                                167                 :                : static PgAioResult md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data);
                                168                 :                : static void md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel);
                                169                 :                : 
                                170                 :                : const PgAioHandleCallbacks aio_md_readv_cb = {
                                171                 :                :     .complete_shared = md_readv_complete,
                                172                 :                :     .report = md_readv_report,
                                173                 :                : };
                                174                 :                : 
                                175                 :                : 
                                176                 :                : static inline int
 1148 tmunro@postgresql.or      177                 :CBC     1561156 : _mdfd_open_flags(void)
                                178                 :                : {
                                179                 :        1561156 :     int         flags = O_RDWR | PG_BINARY;
                                180                 :                : 
                                181         [ +  + ]:        1561156 :     if (io_direct_flags & IO_DIRECT_DATA)
                                182                 :            328 :         flags |= PG_O_DIRECT;
                                183                 :                : 
                                184                 :        1561156 :     return flags;
                                185                 :                : }
                                186                 :                : 
                                187                 :                : /*
                                188                 :                :  * mdinit() -- Initialize private state for magnetic disk storage manager.
                                189                 :                :  */
                                190                 :                : void
 9103 tgl@sss.pgh.pa.us         191                 :          23112 : mdinit(void)
                                192                 :                : {
 9467                           193                 :          23112 :     MdCxt = AllocSetContextCreate(TopMemoryContext,
                                194                 :                :                                   "MdSmgr",
                                195                 :                :                                   ALLOCSET_DEFAULT_SIZES);
 6183 heikki.linnakangas@i      196                 :          23112 : }
                                197                 :                : 
                                198                 :                : /*
                                199                 :                :  * mdexists() -- Does the physical file exist?
                                200                 :                :  *
                                201                 :                :  * Note: this will return true for lingering files, with pending deletions
                                202                 :                :  */
                                203                 :                : bool
 1349 pg@bowt.ie                204                 :         601659 : mdexists(SMgrRelation reln, ForkNumber forknum)
                                205                 :                : {
                                206                 :                :     /*
                                207                 :                :      * Close it first, to ensure that we notice if the fork has been unlinked
                                208                 :                :      * since we opened it.  As an optimization, we can skip that in recovery,
                                209                 :                :      * which already closes relations when dropping them.
                                210                 :                :      */
 1514 tmunro@postgresql.or      211         [ +  + ]:         601659 :     if (!InRecovery)
 1349 pg@bowt.ie                212                 :         578656 :         mdclose(reln, forknum);
                                213                 :                : 
                                214                 :         601659 :     return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
                                215                 :                : }
                                216                 :                : 
                                217                 :                : /*
                                218                 :                :  * mdcreate() -- Create a new relation on magnetic disk.
                                219                 :                :  *
                                220                 :                :  * If isRedo is true, it's okay for the relation to exist already.
                                221                 :                :  */
                                222                 :                : void
                                223                 :        5946731 : mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
                                224                 :                : {
                                225                 :                :     MdfdVec    *mdfd;
                                226                 :                :     RelPathStr  path;
                                227                 :                :     File        fd;
                                228                 :                : 
                                229   [ +  +  +  + ]:        5946731 :     if (isRedo && reln->md_num_open_segs[forknum] > 0)
 7087 tgl@sss.pgh.pa.us         230                 :        5739465 :         return;                 /* created and opened already... */
                                231                 :                : 
 1349 pg@bowt.ie                232         [ -  + ]:         207266 :     Assert(reln->md_num_open_segs[forknum] == 0);
                                233                 :                : 
                                234                 :                :     /*
                                235                 :                :      * We may be using the target table space for the first time in this
                                236                 :                :      * database, so create a per-database subdirectory if needed.
                                237                 :                :      *
                                238                 :                :      * XXX this is a fairly ugly violation of module layering, but this seems
                                239                 :                :      * to be the best place to put the check.  Maybe TablespaceCreateDbspace
                                240                 :                :      * should be here and not in commands/tablespace.c?  But that would imply
                                241                 :                :      * importing a lot of stuff that smgr.c oughtn't know, either.
                                242                 :                :      */
 1424 rhaas@postgresql.org      243                 :         207266 :     TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
                                244                 :                :                             reln->smgr_rlocator.locator.dbOid,
                                245                 :                :                             isRedo);
                                246                 :                : 
 1349 pg@bowt.ie                247                 :         207266 :     path = relpath(reln->smgr_rlocator, forknum);
                                248                 :                : 
  459 andres@anarazel.de        249                 :         207266 :     fd = PathNameOpenFile(path.str, _mdfd_open_flags() | O_CREAT | O_EXCL);
                                250                 :                : 
10492 bruce@momjian.us          251         [ +  + ]:         207266 :     if (fd < 0)
                                252                 :                :     {
 9200                           253                 :          15821 :         int         save_errno = errno;
                                254                 :                : 
 2679 akapila@postgresql.o      255         [ +  - ]:          15821 :         if (isRedo)
  459 andres@anarazel.de        256                 :          15821 :             fd = PathNameOpenFile(path.str, _mdfd_open_flags());
10492 bruce@momjian.us          257         [ -  + ]:          15821 :         if (fd < 0)
                                258                 :                :         {
                                259                 :                :             /* be sure to report the error reported by create, not open */
 9476 tgl@sss.pgh.pa.us         260                 :UBC           0 :             errno = save_errno;
 7087                           261         [ #  # ]:              0 :             ereport(ERROR,
                                262                 :                :                     (errcode_for_file_access(),
                                263                 :                :                      errmsg("could not create file \"%s\": %m", path.str)));
                                264                 :                :         }
                                265                 :                :     }
                                266                 :                : 
 1349 pg@bowt.ie                267                 :CBC      207266 :     _fdvec_resize(reln, forknum, 1);
                                268                 :         207266 :     mdfd = &reln->md_seg_fds[forknum][0];
 3551 andres@anarazel.de        269                 :         207266 :     mdfd->mdfd_vfd = fd;
                                270                 :         207266 :     mdfd->mdfd_segno = 0;
                                271                 :                : 
 1061 heikki.linnakangas@i      272         [ +  + ]:         207266 :     if (!SmgrIsTemp(reln))
                                273                 :         202443 :         register_dirty_segment(reln, forknum, mdfd);
                                274                 :                : }
                                275                 :                : 
                                276                 :                : /*
                                277                 :                :  * mdunlink() -- Unlink a relation.
                                278                 :                :  *
                                279                 :                :  * Note that we're passed a RelFileLocatorBackend --- by the time this is called,
                                280                 :                :  * there won't be an SMgrRelation hashtable entry anymore.
                                281                 :                :  *
                                282                 :                :  * forknum can be a fork number to delete a specific fork, or InvalidForkNumber
                                283                 :                :  * to delete all forks.
                                284                 :                :  *
                                285                 :                :  * For regular relations, we don't unlink the first segment file of the rel,
                                286                 :                :  * but just truncate it to zero length, and record a request to unlink it after
                                287                 :                :  * the next checkpoint.  Additional segments can be unlinked immediately,
                                288                 :                :  * however.  Leaving the empty file in place prevents that relfilenumber
                                289                 :                :  * from being reused.  The scenario this protects us from is:
                                290                 :                :  * 1. We delete a relation (and commit, and actually remove its file).
                                291                 :                :  * 2. We create a new relation, which by chance gets the same relfilenumber as
                                292                 :                :  *    the just-deleted one (OIDs must've wrapped around for that to happen).
                                293                 :                :  * 3. We crash before another checkpoint occurs.
                                294                 :                :  * During replay, we would delete the file and then recreate it, which is fine
                                295                 :                :  * if the contents of the file were repopulated by subsequent WAL entries.
                                296                 :                :  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
                                297                 :                :  * file after populating it (as we do at wal_level=minimal), the contents of
                                298                 :                :  * the file would be lost forever.  By leaving the empty file until after the
                                299                 :                :  * next checkpoint, we prevent reassignment of the relfilenumber until it's
                                300                 :                :  * safe, because relfilenumber assignment skips over any existing file.
                                301                 :                :  *
                                302                 :                :  * Additional segments, if any, are truncated and then unlinked.  The reason
                                303                 :                :  * for truncating is that other backends may still hold open FDs for these at
                                304                 :                :  * the smgr level, so that the kernel can't remove the file yet.  We want to
                                305                 :                :  * reclaim the disk space right away despite that.
                                306                 :                :  *
                                307                 :                :  * We do not need to go through this dance for temp relations, though, because
                                308                 :                :  * we never make WAL entries for temp rels, and so a temp rel poses no threat
                                309                 :                :  * to the health of a regular rel that has taken over its relfilenumber.
                                310                 :                :  * The fact that temp rels and regular rels have different file naming
                                311                 :                :  * patterns provides additional safety.  Other backends shouldn't have open
                                312                 :                :  * FDs for them, either.
                                313                 :                :  *
                                314                 :                :  * We also don't do it while performing a binary upgrade.  There is no reuse
                                315                 :                :  * hazard in that case, since after a crash or even a simple ERROR, the
                                316                 :                :  * upgrade fails and the whole cluster must be recreated from scratch.
                                317                 :                :  * Furthermore, it is important to remove the files from disk immediately,
                                318                 :                :  * because we may be about to reuse the same relfilenumber.
                                319                 :                :  *
                                320                 :                :  * All the above applies only to the relation's main fork; other forks can
                                321                 :                :  * just be removed immediately, since they are not needed to prevent the
                                322                 :                :  * relfilenumber from being recycled.  Also, we do not carefully
                                323                 :                :  * track whether other forks have been created or not, but just attempt to
                                324                 :                :  * unlink them unconditionally; so we should never complain about ENOENT.
                                325                 :                :  *
                                326                 :                :  * If isRedo is true, it's unsurprising for the relation to be already gone.
                                327                 :                :  * Also, we should remove the file immediately instead of queuing a request
                                328                 :                :  * for later, since during redo there's no possibility of creating a
                                329                 :                :  * conflicting relation.
                                330                 :                :  *
                                331                 :                :  * Note: we currently just never warn about ENOENT at all.  We could warn in
                                332                 :                :  * the main-fork, non-isRedo case, but it doesn't seem worth the trouble.
                                333                 :                :  *
                                334                 :                :  * Note: any failure should be reported as WARNING not ERROR, because
                                335                 :                :  * we are usually not in a transaction anymore when this is called.
                                336                 :                :  */
                                337                 :                : void
 1349 pg@bowt.ie                338                 :         243100 : mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
                                339                 :                : {
                                340                 :                :     /* Now do the per-fork work */
                                341         [ -  + ]:         243100 :     if (forknum == InvalidForkNumber)
                                342                 :                :     {
 1349 pg@bowt.ie                343         [ #  # ]:UBC           0 :         for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
                                344                 :              0 :             mdunlinkfork(rlocator, forknum, isRedo);
                                345                 :                :     }
                                346                 :                :     else
 1349 pg@bowt.ie                347                 :CBC      243100 :         mdunlinkfork(rlocator, forknum, isRedo);
 5063 tgl@sss.pgh.pa.us         348                 :         243100 : }
                                349                 :                : 
                                350                 :                : /*
                                351                 :                :  * Truncate a file to release disk space.
                                352                 :                :  */
                                353                 :                : static int
 2006 tmunro@postgresql.or      354                 :         284559 : do_truncate(const char *path)
                                355                 :                : {
                                356                 :                :     int         save_errno;
                                357                 :                :     int         ret;
                                358                 :                : 
                                359                 :         284559 :     ret = pg_truncate(path, 0);
                                360                 :                : 
                                361                 :                :     /* Log a warning here to avoid repetition in callers. */
                                362   [ +  +  -  + ]:         284559 :     if (ret < 0 && errno != ENOENT)
                                363                 :                :     {
 2006 tmunro@postgresql.or      364                 :UBC           0 :         save_errno = errno;
                                365         [ #  # ]:              0 :         ereport(WARNING,
                                366                 :                :                 (errcode_for_file_access(),
                                367                 :                :                  errmsg("could not truncate file \"%s\": %m", path)));
                                368                 :              0 :         errno = save_errno;
                                369                 :                :     }
                                370                 :                : 
 2006 tmunro@postgresql.or      371                 :CBC      284559 :     return ret;
                                372                 :                : }
                                373                 :                : 
                                374                 :                : static void
 1349 pg@bowt.ie                375                 :         243100 : mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
                                376                 :                : {
                                377                 :                :     RelPathStr  path;
                                378                 :                :     int         ret;
                                379                 :                :     int         save_errno;
                                380                 :                : 
                                381                 :         243100 :     path = relpath(rlocator, forknum);
                                382                 :                : 
                                383                 :                :     /*
                                384                 :                :      * Truncate and then unlink the first segment, or just register a request
                                385                 :                :      * to unlink it later, as described in the comments for mdunlink().
                                386                 :                :      */
 1298 tgl@sss.pgh.pa.us         387   [ +  +  +  +  :         243100 :     if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
                                              +  + ]
                                388         [ +  + ]:          52099 :         RelFileLocatorBackendIsTemp(rlocator))
                                389                 :                :     {
 1424 rhaas@postgresql.org      390         [ +  + ]:         195433 :         if (!RelFileLocatorBackendIsTemp(rlocator))
                                391                 :                :         {
                                392                 :                :             /* Prevent other backends' fds from holding on to the disk space */
  459 andres@anarazel.de        393                 :         177705 :             ret = do_truncate(path.str);
                                394                 :                : 
                                395                 :                :             /* Forget any pending sync requests for the first segment */
 1300 tgl@sss.pgh.pa.us         396                 :         177705 :             save_errno = errno;
 1349 pg@bowt.ie                397                 :         177705 :             register_forget_request(rlocator, forknum, 0 /* first seg */ );
 1300 tgl@sss.pgh.pa.us         398                 :         177705 :             errno = save_errno;
                                399                 :                :         }
                                400                 :                :         else
 2006 tmunro@postgresql.or      401                 :          17728 :             ret = 0;
                                402                 :                : 
                                403                 :                :         /* Next unlink the file, unless it was already found to be missing */
 1298 tgl@sss.pgh.pa.us         404   [ +  +  -  + ]:         195433 :         if (ret >= 0 || errno != ENOENT)
                                405                 :                :         {
  459 andres@anarazel.de        406                 :          29253 :             ret = unlink(path.str);
 2006 tmunro@postgresql.or      407   [ +  +  -  + ]:          29253 :             if (ret < 0 && errno != ENOENT)
                                408                 :                :             {
 1298 tgl@sss.pgh.pa.us         409                 :UBC           0 :                 save_errno = errno;
 2006 tmunro@postgresql.or      410         [ #  # ]:              0 :                 ereport(WARNING,
                                411                 :                :                         (errcode_for_file_access(),
                                412                 :                :                          errmsg("could not remove file \"%s\": %m", path.str)));
 1298 tgl@sss.pgh.pa.us         413                 :              0 :                 errno = save_errno;
                                414                 :                :             }
                                415                 :                :         }
                                416                 :                :     }
                                417                 :                :     else
                                418                 :                :     {
                                419                 :                :         /* Prevent other backends' fds from holding on to the disk space */
  459 andres@anarazel.de        420                 :CBC       47667 :         ret = do_truncate(path.str);
                                421                 :                : 
                                422                 :                :         /* Register request to unlink first segment later */
 1298 tgl@sss.pgh.pa.us         423                 :          47667 :         save_errno = errno;
                                424                 :          47667 :         register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
                                425                 :          47667 :         errno = save_errno;
                                426                 :                :     }
                                427                 :                : 
                                428                 :                :     /*
                                429                 :                :      * Delete any additional segments.
                                430                 :                :      *
                                431                 :                :      * Note that because we loop until getting ENOENT, we will correctly
                                432                 :                :      * remove all inactive segments as well as active ones.  Ideally we'd
                                433                 :                :      * continue the loop until getting exactly that errno, but that risks an
                                434                 :                :      * infinite loop if the problem is directory-wide (for instance, if we
                                435                 :                :      * suddenly can't read the data directory itself).  We compromise by
                                436                 :                :      * continuing after a non-ENOENT truncate error, but stopping after any
                                437                 :                :      * unlink error.  If there is indeed a directory-wide problem, additional
                                438                 :                :      * unlink attempts wouldn't work anyway.
                                439                 :                :      */
                                440   [ +  +  -  + ]:         243100 :     if (ret >= 0 || errno != ENOENT)
                                441                 :                :     {
                                442                 :                :         MdPathStr   segpath;
                                443                 :                :         BlockNumber segno;
                                444                 :                : 
                                445                 :          64010 :         for (segno = 1;; segno++)
                                446                 :                :         {
  459 andres@anarazel.de        447                 :          64010 :             sprintf(segpath.str, "%s.%u", path.str, segno);
                                448                 :                : 
 1424 rhaas@postgresql.org      449         [ +  + ]:          64010 :             if (!RelFileLocatorBackendIsTemp(rlocator))
                                450                 :                :             {
                                451                 :                :                 /*
                                452                 :                :                  * Prevent other backends' fds from holding on to the disk
                                453                 :                :                  * space.  We're done if we see ENOENT, though.
                                454                 :                :                  */
  459 andres@anarazel.de        455   [ +  -  +  - ]:          59187 :                 if (do_truncate(segpath.str) < 0 && errno == ENOENT)
 2006 tmunro@postgresql.or      456                 :          59187 :                     break;
                                457                 :                : 
                                458                 :                :                 /*
                                459                 :                :                  * Forget any pending sync requests for this segment before we
                                460                 :                :                  * try to unlink.
                                461                 :                :                  */
 1349 pg@bowt.ie                462                 :UBC           0 :                 register_forget_request(rlocator, forknum, segno);
                                463                 :                :             }
                                464                 :                : 
  459 andres@anarazel.de        465         [ +  - ]:CBC        4823 :             if (unlink(segpath.str) < 0)
                                466                 :                :             {
                                467                 :                :                 /* ENOENT is expected after the last segment... */
 9334 tgl@sss.pgh.pa.us         468         [ -  + ]:           4823 :                 if (errno != ENOENT)
 7087 tgl@sss.pgh.pa.us         469         [ #  # ]:UBC           0 :                     ereport(WARNING,
                                470                 :                :                             (errcode_for_file_access(),
                                471                 :                :                              errmsg("could not remove file \"%s\": %m", segpath.str)));
 9334 tgl@sss.pgh.pa.us         472                 :CBC        4823 :                 break;
                                473                 :                :             }
                                474                 :                :         }
                                475                 :                :     }
10917 scrappy@hub.org           476                 :         243100 : }
                                477                 :                : 
                                478                 :                : /*
                                479                 :                :  * mdextend() -- Add a block to the specified relation.
                                480                 :                :  *
                                481                 :                :  * The semantics are nearly the same as mdwrite(): write at the
                                482                 :                :  * specified position.  However, this is to be used for the case of
                                483                 :                :  * extending a relation (i.e., blocknum is at or beyond the current
                                484                 :                :  * EOF).  Note that we assume writing a block beyond current EOF
                                485                 :                :  * causes intervening file space to become filled with zeroes.
                                486                 :                :  */
                                487                 :                : void
 6501 heikki.linnakangas@i      488                 :         144822 : mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                                489                 :                :          const void *buffer, bool skipFsync)
                                490                 :                : {
                                491                 :                :     pgoff_t     seekpos;
                                492                 :                :     int         nbytes;
                                493                 :                :     MdfdVec    *v;
                                494                 :                : 
                                495                 :                :     /* If this build supports direct I/O, the buffer must be I/O aligned. */
                                496                 :                :     if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
 1148 tmunro@postgresql.or      497         [ -  + ]:         144822 :         Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
                                498                 :                : 
                                499                 :                :     /* This assert is too expensive to have on normally ... */
                                500                 :                : #ifdef CHECK_WRITE_VS_EXTEND
                                501                 :                :     Assert(blocknum >= mdnblocks(reln, forknum));
                                502                 :                : #endif
                                503                 :                : 
                                504                 :                :     /*
                                505                 :                :      * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
                                506                 :                :      * more --- we mustn't create a block whose number actually is
                                507                 :                :      * InvalidBlockNumber.  (Note that this failure should be unreachable
                                508                 :                :      * because of upstream checks in bufmgr.c.)
                                509                 :                :      */
 7087 tgl@sss.pgh.pa.us         510         [ -  + ]:         144822 :     if (blocknum == InvalidBlockNumber)
 7087 tgl@sss.pgh.pa.us         511         [ #  # ]:UBC           0 :         ereport(ERROR,
                                512                 :                :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                                513                 :                :                  errmsg("cannot extend file \"%s\" beyond %u blocks",
                                514                 :                :                         relpath(reln->smgr_rlocator, forknum).str,
                                515                 :                :                         InvalidBlockNumber)));
                                516                 :                : 
 5769 rhaas@postgresql.org      517                 :CBC      144822 :     v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
                                518                 :                : 
  198 michael@paquier.xyz       519                 :GNC      144822 :     seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
                                520                 :                : 
                                521         [ -  + ]:         144822 :     Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
                                522                 :                : 
 2761 tmunro@postgresql.or      523         [ -  + ]:CBC      144822 :     if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
                                524                 :                :     {
 7087 tgl@sss.pgh.pa.us         525         [ #  # ]:UBC           0 :         if (nbytes < 0)
                                526         [ #  # ]:              0 :             ereport(ERROR,
                                527                 :                :                     (errcode_for_file_access(),
                                528                 :                :                      errmsg("could not extend file \"%s\": %m",
                                529                 :                :                             FilePathName(v->mdfd_vfd)),
                                530                 :                :                      errhint("Check free disk space.")));
                                531                 :                :         /* short write: complain appropriately */
                                532         [ #  # ]:              0 :         ereport(ERROR,
                                533                 :                :                 (errcode(ERRCODE_DISK_FULL),
                                534                 :                :                  errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
                                535                 :                :                         FilePathName(v->mdfd_vfd),
                                536                 :                :                         nbytes, BLCKSZ, blocknum),
                                537                 :                :                  errhint("Check free disk space.")));
                                538                 :                :     }
                                539                 :                : 
 5769 rhaas@postgresql.org      540   [ +  +  +  - ]:CBC      144822 :     if (!skipFsync && !SmgrIsTemp(reln))
 6501 heikki.linnakangas@i      541                 :             39 :         register_dirty_segment(reln, forknum, v);
                                542                 :                : 
                                543         [ -  + ]:         144822 :     Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
10917 scrappy@hub.org           544                 :         144822 : }
                                545                 :                : 
                                546                 :                : /*
                                547                 :                :  * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
                                548                 :                :  *
                                549                 :                :  * Similar to mdextend(), except the relation can be extended by multiple
                                550                 :                :  * blocks at once and the added blocks will be filled with zeroes.
                                551                 :                :  */
                                552                 :                : void
 1151 andres@anarazel.de        553                 :         268944 : mdzeroextend(SMgrRelation reln, ForkNumber forknum,
                                554                 :                :              BlockNumber blocknum, int nblocks, bool skipFsync)
                                555                 :                : {
                                556                 :                :     MdfdVec    *v;
                                557                 :         268944 :     BlockNumber curblocknum = blocknum;
                                558                 :         268944 :     int         remblocks = nblocks;
                                559                 :                : 
                                560         [ -  + ]:         268944 :     Assert(nblocks > 0);
                                561                 :                : 
                                562                 :                :     /* This assert is too expensive to have on normally ... */
                                563                 :                : #ifdef CHECK_WRITE_VS_EXTEND
                                564                 :                :     Assert(blocknum >= mdnblocks(reln, forknum));
                                565                 :                : #endif
                                566                 :                : 
                                567                 :                :     /*
                                568                 :                :      * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
                                569                 :                :      * more --- we mustn't create a block whose number actually is
                                570                 :                :      * InvalidBlockNumber or larger.
                                571                 :                :      */
                                572         [ -  + ]:         268944 :     if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
 1151 andres@anarazel.de        573         [ #  # ]:UBC           0 :         ereport(ERROR,
                                574                 :                :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                                575                 :                :                  errmsg("cannot extend file \"%s\" beyond %u blocks",
                                576                 :                :                         relpath(reln->smgr_rlocator, forknum).str,
                                577                 :                :                         InvalidBlockNumber)));
                                578                 :                : 
 1151 andres@anarazel.de        579         [ +  + ]:CBC      537888 :     while (remblocks > 0)
                                580                 :                :     {
 1107 tgl@sss.pgh.pa.us         581                 :         268944 :         BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
  198 michael@paquier.xyz       582                 :GNC      268944 :         pgoff_t     seekpos = (pgoff_t) BLCKSZ * segstartblock;
                                583                 :                :         int         numblocks;
                                584                 :                : 
 1151 andres@anarazel.de        585         [ -  + ]:CBC      268944 :         if (segstartblock + remblocks > RELSEG_SIZE)
 1151 andres@anarazel.de        586                 :UBC           0 :             numblocks = RELSEG_SIZE - segstartblock;
                                587                 :                :         else
 1151 andres@anarazel.de        588                 :CBC      268944 :             numblocks = remblocks;
                                589                 :                : 
                                590                 :         268944 :         v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
                                591                 :                : 
                                592         [ -  + ]:         268944 :         Assert(segstartblock < RELSEG_SIZE);
                                593         [ -  + ]:         268944 :         Assert(segstartblock + numblocks <= RELSEG_SIZE);
                                594                 :                : 
                                595                 :                :         /*
                                596                 :                :          * If available and useful, use posix_fallocate() (via
                                597                 :                :          * FileFallocate()) to extend the relation. That's often more
                                598                 :                :          * efficient than using write(), as it commonly won't cause the kernel
                                599                 :                :          * to allocate page cache space for the extended pages.
                                600                 :                :          *
                                601                 :                :          * However, we don't use FileFallocate() for small extensions, as it
                                602                 :                :          * defeats delayed allocation on some filesystems. Not clear where
                                603                 :                :          * that decision should be made though? For now just use a cutoff of
                                604                 :                :          * 8, anything between 4 and 8 worked OK in some local testing.
                                605                 :                :          */
  364 tmunro@postgresql.or      606         [ +  + ]:         268944 :         if (numblocks > 8 &&
                                607         [ +  - ]:            716 :             file_extend_method != FILE_EXTEND_METHOD_WRITE_ZEROS)
 1151 andres@anarazel.de        608                 :            716 :         {
  364 tmunro@postgresql.or      609                 :            716 :             int         ret = 0;
                                610                 :                : 
                                611                 :                : #ifdef HAVE_POSIX_FALLOCATE
                                612         [ +  - ]:            716 :             if (file_extend_method == FILE_EXTEND_METHOD_POSIX_FALLOCATE)
                                613                 :                :             {
                                614                 :            716 :                 ret = FileFallocate(v->mdfd_vfd,
                                615                 :                :                                     seekpos, (pgoff_t) BLCKSZ * numblocks,
                                616                 :                :                                     WAIT_EVENT_DATA_FILE_EXTEND);
                                617                 :                :             }
                                618                 :                :             else
                                619                 :                : #endif
                                620                 :                :             {
  364 tmunro@postgresql.or      621         [ #  # ]:UBC           0 :                 elog(ERROR, "unsupported file_extend_method: %d",
                                622                 :                :                      file_extend_method);
                                623                 :                :             }
 1151 andres@anarazel.de        624         [ -  + ]:CBC         716 :             if (ret != 0)
                                625                 :                :             {
 1151 andres@anarazel.de        626         [ #  # ]:UBC           0 :                 ereport(ERROR,
                                627                 :                :                         errcode_for_file_access(),
                                628                 :                :                         errmsg("could not extend file \"%s\" with FileFallocate(): %m",
                                629                 :                :                                FilePathName(v->mdfd_vfd)),
                                630                 :                :                         errhint("Check free disk space."));
                                631                 :                :             }
                                632                 :                :         }
                                633                 :                :         else
                                634                 :                :         {
                                635                 :                :             int         ret;
                                636                 :                : 
                                637                 :                :             /*
                                638                 :                :              * Even if we don't want to use fallocate, we can still extend a
                                639                 :                :              * bit more efficiently than writing each 8kB block individually.
                                640                 :                :              * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
                                641                 :                :              * to avoid multiple writes or needing a zeroed buffer for the
                                642                 :                :              * whole length of the extension.
                                643                 :                :              */
 1151 andres@anarazel.de        644                 :CBC      268228 :             ret = FileZero(v->mdfd_vfd,
                                645                 :                :                            seekpos, (pgoff_t) BLCKSZ * numblocks,
                                646                 :                :                            WAIT_EVENT_DATA_FILE_EXTEND);
                                647         [ -  + ]:         268228 :             if (ret < 0)
 1151 andres@anarazel.de        648         [ #  # ]:UBC           0 :                 ereport(ERROR,
                                649                 :                :                         errcode_for_file_access(),
                                650                 :                :                         errmsg("could not extend file \"%s\": %m",
                                651                 :                :                                FilePathName(v->mdfd_vfd)),
                                652                 :                :                         errhint("Check free disk space."));
                                653                 :                :         }
                                654                 :                : 
 1151 andres@anarazel.de        655   [ +  -  +  + ]:CBC      268944 :         if (!skipFsync && !SmgrIsTemp(reln))
                                656                 :         252928 :             register_dirty_segment(reln, forknum, v);
                                657                 :                : 
                                658         [ -  + ]:         268944 :         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
                                659                 :                : 
                                660                 :         268944 :         remblocks -= numblocks;
                                661                 :         268944 :         curblocknum += numblocks;
                                662                 :                :     }
                                663                 :         268944 : }
                                664                 :                : 
                                665                 :                : /*
                                666                 :                :  * mdopenfork() -- Open one fork of the specified relation.
                                667                 :                :  *
                                668                 :                :  * Note we only open the first segment, when there are multiple segments.
                                669                 :                :  *
                                670                 :                :  * If first segment is not present, either ereport or return NULL according
                                671                 :                :  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
                                672                 :                :  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
                                673                 :                :  * invent one out of whole cloth.
                                674                 :                :  */
                                675                 :                : static MdfdVec *
 2509 tmunro@postgresql.or      676                 :        4443534 : mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
                                677                 :                : {
                                678                 :                :     MdfdVec    *mdfd;
                                679                 :                :     RelPathStr  path;
                                680                 :                :     File        fd;
                                681                 :                : 
                                682                 :                :     /* No work if already open */
 3551 andres@anarazel.de        683         [ +  + ]:        4443534 :     if (reln->md_num_open_segs[forknum] > 0)
                                684                 :        3137627 :         return &reln->md_seg_fds[forknum][0];
                                685                 :                : 
 1424 rhaas@postgresql.org      686                 :        1305907 :     path = relpath(reln->smgr_rlocator, forknum);
                                687                 :                : 
  459 andres@anarazel.de        688                 :        1305907 :     fd = PathNameOpenFile(path.str, _mdfd_open_flags());
                                689                 :                : 
10492 bruce@momjian.us          690         [ +  + ]:        1305907 :     if (fd < 0)
                                691                 :                :     {
 2679 akapila@postgresql.o      692         [ +  + ]:         418102 :         if ((behavior & EXTENSION_RETURN_NULL) &&
                                693         [ +  - ]:         418080 :             FILE_POSSIBLY_DELETED(errno))
                                694                 :         418080 :             return NULL;
                                695         [ +  - ]:             22 :         ereport(ERROR,
                                696                 :                :                 (errcode_for_file_access(),
                                697                 :                :                  errmsg("could not open file \"%s\": %m", path.str)));
                                698                 :                :     }
                                699                 :                : 
 3551 andres@anarazel.de        700                 :         887805 :     _fdvec_resize(reln, forknum, 1);
                                701                 :         887805 :     mdfd = &reln->md_seg_fds[forknum][0];
 8034 tgl@sss.pgh.pa.us         702                 :         887805 :     mdfd->mdfd_vfd = fd;
                                703                 :         887805 :     mdfd->mdfd_segno = 0;
                                704                 :                : 
 6501 heikki.linnakangas@i      705         [ -  + ]:         887805 :     Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
                                706                 :                : 
 8034 tgl@sss.pgh.pa.us         707                 :         887805 :     return mdfd;
                                708                 :                : }
                                709                 :                : 
                                710                 :                : /*
                                711                 :                :  * mdopen() -- Initialize newly-opened relation.
                                712                 :                :  */
                                713                 :                : void
 2509 tmunro@postgresql.or      714                 :        1208449 : mdopen(SMgrRelation reln)
                                715                 :                : {
                                716                 :                :     /* mark it not open */
                                717         [ +  + ]:        6042245 :     for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
                                718                 :        4833796 :         reln->md_num_open_segs[forknum] = 0;
                                719                 :        1208449 : }
                                720                 :                : 
                                721                 :                : /*
                                722                 :                :  * mdclose() -- Close the specified relation, if it isn't closed already.
                                723                 :                :  */
                                724                 :                : void
 6501 heikki.linnakangas@i      725                 :        4408744 : mdclose(SMgrRelation reln, ForkNumber forknum)
                                726                 :                : {
 3551 andres@anarazel.de        727                 :        4408744 :     int         nopensegs = reln->md_num_open_segs[forknum];
                                728                 :                : 
                                729                 :                :     /* No work if already closed */
                                730         [ +  + ]:        4408744 :     if (nopensegs == 0)
 7087 tgl@sss.pgh.pa.us         731                 :        3764828 :         return;
                                732                 :                : 
                                733                 :                :     /* close segments starting from the end */
 3551 andres@anarazel.de        734         [ +  + ]:        1287832 :     while (nopensegs > 0)
                                735                 :                :     {
                                736                 :         643916 :         MdfdVec    *v = &reln->md_seg_fds[forknum][nopensegs - 1];
                                737                 :                : 
 2332 noah@leadboat.com         738                 :         643916 :         FileClose(v->mdfd_vfd);
                                739                 :         643916 :         _fdvec_resize(reln, forknum, nopensegs - 1);
 3551 andres@anarazel.de        740                 :         643916 :         nopensegs--;
                                741                 :                :     }
                                742                 :                : }
                                743                 :                : 
                                744                 :                : /*
                                745                 :                :  * mdprefetch() -- Initiate asynchronous read of the specified blocks of a relation
                                746                 :                :  */
                                747                 :                : bool
  896 tmunro@postgresql.or      748                 :           9520 : mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                                749                 :                :            int nblocks)
                                750                 :                : {
                                751                 :                : #ifdef USE_PREFETCH
                                752                 :                : 
 1148                           753         [ -  + ]:           9520 :     Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
                                754                 :                : 
  896                           755         [ -  + ]:           9520 :     if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
 2243 tmunro@postgresql.or      756                 :UBC           0 :         return false;
                                757                 :                : 
  896 tmunro@postgresql.or      758         [ +  + ]:CBC       19040 :     while (nblocks > 0)
                                759                 :                :     {
                                760                 :                :         pgoff_t     seekpos;
                                761                 :                :         MdfdVec    *v;
                                762                 :                :         int         nblocks_this_segment;
                                763                 :                : 
                                764                 :           9520 :         v = _mdfd_getseg(reln, forknum, blocknum, false,
                                765         [ +  + ]:           9520 :                          InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
                                766         [ -  + ]:           9520 :         if (v == NULL)
  896 tmunro@postgresql.or      767                 :UBC           0 :             return false;
                                768                 :                : 
  198 michael@paquier.xyz       769                 :GNC        9520 :         seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
                                770                 :                : 
                                771         [ -  + ]:           9520 :         Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
                                772                 :                : 
  896 tmunro@postgresql.or      773                 :CBC        9520 :         nblocks_this_segment =
                                774                 :           9520 :             Min(nblocks,
                                775                 :                :                 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
                                776                 :                : 
                                777                 :           9520 :         (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ * nblocks_this_segment,
                                778                 :                :                             WAIT_EVENT_DATA_FILE_PREFETCH);
                                779                 :                : 
                                780                 :           9520 :         blocknum += nblocks_this_segment;
                                781                 :           9520 :         nblocks -= nblocks_this_segment;
                                782                 :                :     }
                                783                 :                : #endif                          /* USE_PREFETCH */
                                784                 :                : 
 2243                           785                 :           9520 :     return true;
                                786                 :                : }
                                787                 :                : 
                                788                 :                : /*
                                789                 :                :  * Convert an array of buffer addresses into an array of iovec objects, and
                                790                 :                :  * return the number that were required.  'iov' must have enough space for up
                                791                 :                :  * to 'nblocks' elements, but the number used may be less depending on
                                792                 :                :  * merging.  In the case of a run of fully contiguous buffers, a single iovec
                                793                 :                :  * will be populated that can be handled as a plain non-vectored I/O.
                                794                 :                :  */
                                795                 :                : static int
  894                           796                 :        2215316 : buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
                                797                 :                : {
                                798                 :                :     struct iovec *iovp;
                                799                 :                :     int         iovcnt;
                                800                 :                : 
                                801         [ -  + ]:        2215316 :     Assert(nblocks >= 1);
                                802                 :                : 
                                803                 :                :     /* If this build supports direct I/O, buffers must be I/O aligned. */
                                804         [ +  + ]:        4613009 :     for (int i = 0; i < nblocks; ++i)
                                805                 :                :     {
                                806                 :                :         if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
                                807         [ -  + ]:        2397693 :             Assert((uintptr_t) buffers[i] ==
                                808                 :                :                    TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[i]));
                                809                 :                :     }
                                810                 :                : 
                                811                 :                :     /* Start the first iovec off with the first buffer. */
                                812                 :        2215316 :     iovp = &iov[0];
                                813                 :        2215316 :     iovp->iov_base = buffers[0];
                                814                 :        2215316 :     iovp->iov_len = BLCKSZ;
                                815                 :        2215316 :     iovcnt = 1;
                                816                 :                : 
                                817                 :                :     /* Try to merge the rest. */
                                818         [ +  + ]:        2397693 :     for (int i = 1; i < nblocks; ++i)
                                819                 :                :     {
                                820                 :         182377 :         void       *buffer = buffers[i];
                                821                 :                : 
                                822         [ +  + ]:         182377 :         if (((char *) iovp->iov_base + iovp->iov_len) == buffer)
                                823                 :                :         {
                                824                 :                :             /* Contiguous with the last iovec. */
                                825                 :         181102 :             iovp->iov_len += BLCKSZ;
                                826                 :                :         }
                                827                 :                :         else
                                828                 :                :         {
                                829                 :                :             /* Need a new iovec. */
                                830                 :           1275 :             iovp++;
                                831                 :           1275 :             iovp->iov_base = buffer;
                                832                 :           1275 :             iovp->iov_len = BLCKSZ;
                                833                 :           1275 :             iovcnt++;
                                834                 :                :         }
                                835                 :                :     }
                                836                 :                : 
                                837                 :        2215316 :     return iovcnt;
                                838                 :                : }
                                839                 :                : 
                                840                 :                : /*
                                841                 :                :  * mdmaxcombine() -- Return the maximum number of total blocks that can be
                                842                 :                :  *               combined with an IO starting at blocknum.
                                843                 :                :  */
                                844                 :                : uint32
  599 andres@anarazel.de        845                 :          37177 : mdmaxcombine(SMgrRelation reln, ForkNumber forknum,
                                846                 :                :              BlockNumber blocknum)
                                847                 :                : {
                                848                 :                :     BlockNumber segoff;
                                849                 :                : 
                                850                 :          37177 :     segoff = blocknum % ((BlockNumber) RELSEG_SIZE);
                                851                 :                : 
                                852                 :          37177 :     return RELSEG_SIZE - segoff;
                                853                 :                : }
                                854                 :                : 
                                855                 :                : /*
                                856                 :                :  * mdreadv() -- Read the specified blocks from a relation.
                                857                 :                :  */
                                858                 :                : void
  894 tmunro@postgresql.or      859                 :            766 : mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                                860                 :                :         void **buffers, BlockNumber nblocks)
                                861                 :                : {
                                862         [ +  + ]:           1532 :     while (nblocks > 0)
                                863                 :                :     {
                                864                 :                :         struct iovec iov[PG_IOV_MAX];
                                865                 :                :         int         iovcnt;
                                866                 :                :         pgoff_t     seekpos;
                                867                 :                :         int         nbytes;
                                868                 :                :         MdfdVec    *v;
                                869                 :                :         BlockNumber nblocks_this_segment;
                                870                 :                :         size_t      transferred_this_segment;
                                871                 :                :         size_t      size_this_segment;
                                872                 :                : 
                                873                 :            766 :         v = _mdfd_getseg(reln, forknum, blocknum, false,
                                874                 :                :                          EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
                                875                 :                : 
  198 michael@paquier.xyz       876                 :GNC         766 :         seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
                                877                 :                : 
                                878         [ -  + ]:            766 :         Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
                                879                 :                : 
  894 tmunro@postgresql.or      880                 :CBC         766 :         nblocks_this_segment =
                                881                 :            766 :             Min(nblocks,
                                882                 :                :                 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
                                883                 :            766 :         nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
                                884                 :                : 
  599 andres@anarazel.de        885         [ -  + ]:            766 :         if (nblocks_this_segment != nblocks)
  599 andres@anarazel.de        886         [ #  # ]:UBC           0 :             elog(ERROR, "read crosses segment boundary");
                                887                 :                : 
  894 tmunro@postgresql.or      888                 :CBC         766 :         iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
                                889                 :            766 :         size_this_segment = nblocks_this_segment * BLCKSZ;
                                890                 :            766 :         transferred_this_segment = 0;
                                891                 :                : 
                                892                 :                :         /*
                                893                 :                :          * Inner loop to continue after a short read.  We'll keep going until
                                894                 :                :          * we hit EOF rather than assuming that a short read means we hit the
                                895                 :                :          * end.
                                896                 :                :          */
                                897                 :                :         for (;;)
                                898                 :                :         {
  894 tmunro@postgresql.or      899                 :UBC           0 :             TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
                                900                 :                :                                                 reln->smgr_rlocator.locator.spcOid,
                                901                 :                :                                                 reln->smgr_rlocator.locator.dbOid,
                                902                 :                :                                                 reln->smgr_rlocator.locator.relNumber,
                                903                 :                :                                                 reln->smgr_rlocator.backend);
  894 tmunro@postgresql.or      904                 :CBC         766 :             nbytes = FileReadV(v->mdfd_vfd, iov, iovcnt, seekpos,
                                905                 :                :                                WAIT_EVENT_DATA_FILE_READ);
                                906                 :                :             TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
                                907                 :                :                                                reln->smgr_rlocator.locator.spcOid,
                                908                 :                :                                                reln->smgr_rlocator.locator.dbOid,
                                909                 :                :                                                reln->smgr_rlocator.locator.relNumber,
                                910                 :                :                                                reln->smgr_rlocator.backend,
                                911                 :                :                                                nbytes,
                                912                 :                :                                                size_this_segment - transferred_this_segment);
                                913                 :                : 
                                914                 :                : #ifdef SIMULATE_SHORT_READ
                                915                 :                :             nbytes = Min(nbytes, 4096);
                                916                 :                : #endif
                                917                 :                : 
                                918         [ -  + ]:            766 :             if (nbytes < 0)
  894 tmunro@postgresql.or      919         [ #  # ]:UBC           0 :                 ereport(ERROR,
                                920                 :                :                         (errcode_for_file_access(),
                                921                 :                :                          errmsg("could not read blocks %u..%u in file \"%s\": %m",
                                922                 :                :                                 blocknum,
                                923                 :                :                                 blocknum + nblocks_this_segment - 1,
                                924                 :                :                                 FilePathName(v->mdfd_vfd))));
                                925                 :                : 
  894 tmunro@postgresql.or      926         [ -  + ]:CBC         766 :             if (nbytes == 0)
                                927                 :                :             {
                                928                 :                :                 /*
                                929                 :                :                  * We are at or past EOF, or we read a partial block at EOF.
                                930                 :                :                  * Normally this is an error; upper levels should never try to
                                931                 :                :                  * read a nonexistent block.  However, if zero_damaged_pages
                                932                 :                :                  * is ON or we are InRecovery, we should instead return zeroes
                                933                 :                :                  * without complaining.  This allows, for example, the case of
                                934                 :                :                  * trying to update a block that was later truncated away.
                                935                 :                :                  *
                                936                 :                :                  * NB: We think that this codepath is unreachable in recovery
                                937                 :                :                  * and incomplete with zero_damaged_pages, as missing segments
                                938                 :                :                  * are not created. Putting blocks into the buffer-pool that
                                939                 :                :                  * do not exist on disk is rather problematic: being beyond
                                940                 :                :                  * EOF, they will not be found by scans that rely on
                                941                 :                :                  * smgrnblocks(). It can also cause weird problems with
                                942                 :                :                  * relation extension, which does not expect blocks beyond EOF
                                943                 :                :                  * to exist.
                                944                 :                :                  *
                                945                 :                :                  * Therefore we do not want to copy the logic into
                                946                 :                :                  * mdstartreadv(), where it would have to be more complicated
                                947                 :                :                  * due to potential differences in the zero_damaged_pages
                                948                 :                :                  * setting between the definer and completer of IO.
                                949                 :                :                  *
                                950                 :                :                  * For PG 18, we are putting an Assert(false) in mdreadv()
                                951                 :                :                  * (triggering failures in assertion-enabled builds, but
                                952                 :                :                  * continuing to work in production builds). Afterwards we
                                953                 :                :                  * plan to remove this code entirely.
                                954                 :                :                  */
  894 tmunro@postgresql.or      955   [ #  #  #  # ]:UBC           0 :                 if (zero_damaged_pages || InRecovery)
                                956                 :                :                 {
  424 andres@anarazel.de        957                 :              0 :                     Assert(false);  /* see comment above */
                                958                 :                : 
                                959                 :                :                     for (BlockNumber i = transferred_this_segment / BLCKSZ;
                                960                 :                :                          i < nblocks_this_segment;
                                961                 :                :                          ++i)
                                962                 :                :                         memset(buffers[i], 0, BLCKSZ);
                                963                 :                :                     break;
                                964                 :                :                 }
                                965                 :                :                 else
  894 tmunro@postgresql.or      966         [ #  # ]:              0 :                     ereport(ERROR,
                                967                 :                :                             (errcode(ERRCODE_DATA_CORRUPTED),
                                968                 :                :                              errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
                                969                 :                :                                     blocknum,
                                970                 :                :                                     blocknum + nblocks_this_segment - 1,
                                971                 :                :                                     FilePathName(v->mdfd_vfd),
                                972                 :                :                                     transferred_this_segment,
                                973                 :                :                                     size_this_segment)));
                                974                 :                :             }
                                975                 :                : 
                                976                 :                :             /* One loop should usually be enough. */
  894 tmunro@postgresql.or      977                 :CBC         766 :             transferred_this_segment += nbytes;
                                978         [ -  + ]:            766 :             Assert(transferred_this_segment <= size_this_segment);
                                979         [ +  - ]:            766 :             if (transferred_this_segment == size_this_segment)
                                980                 :            766 :                 break;
                                981                 :                : 
                                982                 :                :             /* Adjust position and vectors after a short read. */
  894 tmunro@postgresql.or      983                 :UBC           0 :             seekpos += nbytes;
                                984                 :              0 :             iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
                                985                 :                :         }
                                986                 :                : 
  894 tmunro@postgresql.or      987                 :CBC         766 :         nblocks -= nblocks_this_segment;
                                988                 :            766 :         buffers += nblocks_this_segment;
                                989                 :            766 :         blocknum += nblocks_this_segment;
                                990                 :                :     }
10917 scrappy@hub.org           991                 :            766 : }
                                992                 :                : 
                                993                 :                : /*
                                994                 :                :  * mdstartreadv() -- Asynchronous version of mdreadv().
                                995                 :                :  */
                                996                 :                : void
  427 andres@anarazel.de        997                 :        1486091 : mdstartreadv(PgAioHandle *ioh,
                                998                 :                :              SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                                999                 :                :              void **buffers, BlockNumber nblocks)
                               1000                 :                : {
                               1001                 :                :     pgoff_t     seekpos;
                               1002                 :                :     MdfdVec    *v;
                               1003                 :                :     BlockNumber nblocks_this_segment;
                               1004                 :                :     struct iovec *iov;
                               1005                 :                :     int         iovcnt;
                               1006                 :                :     int         ret;
                               1007                 :                : 
                               1008                 :        1486091 :     v = _mdfd_getseg(reln, forknum, blocknum, false,
                               1009                 :                :                      EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
                               1010                 :                : 
  198 michael@paquier.xyz      1011                 :GNC     1486076 :     seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
                               1012                 :                : 
                               1013         [ -  + ]:        1486076 :     Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
                               1014                 :                : 
  427 andres@anarazel.de       1015                 :CBC     1486076 :     nblocks_this_segment =
                               1016                 :        1486076 :         Min(nblocks,
                               1017                 :                :             RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
                               1018                 :                : 
                               1019         [ -  + ]:        1486076 :     if (nblocks_this_segment != nblocks)
  427 andres@anarazel.de       1020         [ #  # ]:UBC           0 :         elog(ERROR, "read crossing segment boundary");
                               1021                 :                : 
  427 andres@anarazel.de       1022                 :CBC     1486076 :     iovcnt = pgaio_io_get_iovec(ioh, &iov);
                               1023                 :                : 
                               1024         [ -  + ]:        1486076 :     Assert(nblocks <= iovcnt);
                               1025                 :                : 
                               1026                 :        1486076 :     iovcnt = buffers_to_iovec(iov, buffers, nblocks_this_segment);
                               1027                 :                : 
                               1028         [ -  + ]:        1486076 :     Assert(iovcnt <= nblocks_this_segment);
                               1029                 :                : 
                               1030         [ +  + ]:        1486076 :     if (!(io_direct_flags & IO_DIRECT_DATA))
                               1031                 :        1484581 :         pgaio_io_set_flag(ioh, PGAIO_HF_BUFFERED);
                               1032                 :                : 
                               1033                 :        1486076 :     pgaio_io_set_target_smgr(ioh,
                               1034                 :                :                              reln,
                               1035                 :                :                              forknum,
                               1036                 :                :                              blocknum,
                               1037                 :                :                              nblocks,
                               1038                 :                :                              false);
                               1039                 :        1486076 :     pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_READV, 0);
                               1040                 :                : 
                               1041                 :        1486076 :     ret = FileStartReadV(ioh, v->mdfd_vfd, iovcnt, seekpos, WAIT_EVENT_DATA_FILE_READ);
                               1042         [ -  + ]:        1486076 :     if (ret != 0)
  427 andres@anarazel.de       1043         [ #  # ]:UBC           0 :         ereport(ERROR,
                               1044                 :                :                 (errcode_for_file_access(),
                               1045                 :                :                  errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
                               1046                 :                :                         blocknum,
                               1047                 :                :                         blocknum + nblocks_this_segment - 1,
                               1048                 :                :                         FilePathName(v->mdfd_vfd))));
                               1049                 :                : 
                               1050                 :                :     /*
                               1051                 :                :      * The error checks corresponding to the post-read checks in mdreadv() are
                               1052                 :                :      * in md_readv_complete().
                               1053                 :                :      *
                               1054                 :                :      * However, we chose, at least for now, not to implement the
                               1055                 :                :      * zero_damaged_pages logic present in mdreadv(). As outlined in mdreadv(),
                               1056                 :                :      * that logic is rather problematic, and we want to get rid of it. Here,
                               1057                 :                :      * equivalent logic would have to be more complicated due to potential
                               1058                 :                :      * differences in the zero_damaged_pages setting between the definer and
                               1059                 :                :      * completer of IO.
                               1060                 :                :      */
  427 andres@anarazel.de       1061                 :CBC     1486076 : }
                               1062                 :                : 
                               1063                 :                : /*
                               1064                 :                :  * mdwritev() -- Write the supplied blocks at the appropriate location.
                               1065                 :                :  *
                               1066                 :                :  * This is to be used only for updating already-existing blocks of a
                               1067                 :                :  * relation (ie, those before the current EOF).  To extend a relation,
                               1068                 :                :  * use mdextend().
                               1069                 :                :  */
                               1070                 :                : void
  894 tmunro@postgresql.or     1071                 :         728474 : mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                               1072                 :                :          const void **buffers, BlockNumber nblocks, bool skipFsync)
                               1073                 :                : {
                               1074                 :                :     /* This assert is too expensive to have on normally ... */
                               1075                 :                : #ifdef CHECK_WRITE_VS_EXTEND
                               1076                 :                :     Assert((uint64) blocknum + (uint64) nblocks <= (uint64) mdnblocks(reln, forknum));
                               1077                 :                : #endif
                               1078                 :                : 
                               1079         [ +  + ]:        1456948 :     while (nblocks > 0)
                               1080                 :                :     {
                               1081                 :                :         struct iovec iov[PG_IOV_MAX];
                               1082                 :                :         int         iovcnt;
                               1083                 :                :         pgoff_t     seekpos;
                               1084                 :                :         int         nbytes;
                               1085                 :                :         MdfdVec    *v;
                               1086                 :                :         BlockNumber nblocks_this_segment;
                               1087                 :                :         size_t      transferred_this_segment;
                               1088                 :                :         size_t      size_this_segment;
                               1089                 :                : 
                               1090                 :         728474 :         v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
                               1091                 :                :                          EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
                               1092                 :                : 
  198 michael@paquier.xyz      1093                 :GNC      728474 :         seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
                               1094                 :                : 
                               1095         [ -  + ]:         728474 :         Assert(seekpos < (pgoff_t) BLCKSZ * RELSEG_SIZE);
                               1096                 :                : 
  894 tmunro@postgresql.or     1097                 :CBC      728474 :         nblocks_this_segment =
                               1098                 :         728474 :             Min(nblocks,
                               1099                 :                :                 RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
                               1100                 :         728474 :         nblocks_this_segment = Min(nblocks_this_segment, lengthof(iov));
                               1101                 :                : 
  599 andres@anarazel.de       1102         [ -  + ]:         728474 :         if (nblocks_this_segment != nblocks)
  599 andres@anarazel.de       1103         [ #  # ]:UBC           0 :             elog(ERROR, "write crosses segment boundary");
                               1104                 :                : 
  894 tmunro@postgresql.or     1105                 :CBC      728474 :         iovcnt = buffers_to_iovec(iov, (void **) buffers, nblocks_this_segment);
                               1106                 :         728474 :         size_this_segment = nblocks_this_segment * BLCKSZ;
                               1107                 :         728474 :         transferred_this_segment = 0;
                               1108                 :                : 
                               1109                 :                :         /*
                               1110                 :                :          * Inner loop to continue after a short write.  If the reason is that
                               1111                 :                :          * we're out of disk space, a future attempt should get an ENOSPC
                               1112                 :                :          * error from the kernel.
                               1113                 :                :          */
                               1114                 :                :         for (;;)
                               1115                 :                :         {
  894 tmunro@postgresql.or     1116                 :UBC           0 :             TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
                               1117                 :                :                                                  reln->smgr_rlocator.locator.spcOid,
                               1118                 :                :                                                  reln->smgr_rlocator.locator.dbOid,
                               1119                 :                :                                                  reln->smgr_rlocator.locator.relNumber,
                               1120                 :                :                                                  reln->smgr_rlocator.backend);
  894 tmunro@postgresql.or     1121                 :CBC      728474 :             nbytes = FileWriteV(v->mdfd_vfd, iov, iovcnt, seekpos,
                               1122                 :                :                                 WAIT_EVENT_DATA_FILE_WRITE);
                               1123                 :                :             TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
                               1124                 :                :                                                 reln->smgr_rlocator.locator.spcOid,
                               1125                 :                :                                                 reln->smgr_rlocator.locator.dbOid,
                               1126                 :                :                                                 reln->smgr_rlocator.locator.relNumber,
                               1127                 :                :                                                 reln->smgr_rlocator.backend,
                               1128                 :                :                                                 nbytes,
                               1129                 :                :                                                 size_this_segment - transferred_this_segment);
                               1130                 :                : 
                               1131                 :                : #ifdef SIMULATE_SHORT_WRITE
                               1132                 :                :             nbytes = Min(nbytes, 4096);
                               1133                 :                : #endif
                               1134                 :                : 
                               1135         [ -  + ]:         728474 :             if (nbytes < 0)
                               1136                 :                :             {
  894 tmunro@postgresql.or     1137                 :UBC           0 :                 bool        enospc = errno == ENOSPC;
                               1138                 :                : 
                               1139   [ #  #  #  # ]:              0 :                 ereport(ERROR,
                               1140                 :                :                         (errcode_for_file_access(),
                               1141                 :                :                          errmsg("could not write blocks %u..%u in file \"%s\": %m",
                               1142                 :                :                                 blocknum,
                               1143                 :                :                                 blocknum + nblocks_this_segment - 1,
                               1144                 :                :                                 FilePathName(v->mdfd_vfd)),
                               1145                 :                :                          enospc ? errhint("Check free disk space.") : 0));
                               1146                 :                :             }
                               1147                 :                : 
                               1148                 :                :             /* One loop should usually be enough. */
  894 tmunro@postgresql.or     1149                 :CBC      728474 :             transferred_this_segment += nbytes;
                               1150         [ -  + ]:         728474 :             Assert(transferred_this_segment <= size_this_segment);
                               1151         [ +  - ]:         728474 :             if (transferred_this_segment == size_this_segment)
                               1152                 :         728474 :                 break;
                               1153                 :                : 
                               1154                 :                :             /* Adjust position and iovecs after a short write. */
  894 tmunro@postgresql.or     1155                 :UBC           0 :             seekpos += nbytes;
                               1156                 :              0 :             iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
                               1157                 :                :         }
                               1158                 :                : 
  894 tmunro@postgresql.or     1159   [ +  +  +  + ]:CBC      728474 :         if (!skipFsync && !SmgrIsTemp(reln))
                               1160                 :         723159 :             register_dirty_segment(reln, forknum, v);
                               1161                 :                : 
                               1162                 :         728474 :         nblocks -= nblocks_this_segment;
                               1163                 :         728474 :         buffers += nblocks_this_segment;
                               1164                 :         728474 :         blocknum += nblocks_this_segment;
                               1165                 :                :     }
 9547 tgl@sss.pgh.pa.us        1166                 :         728474 : }
                               1167                 :                : 
                               1168                 :                : 
                               1169                 :                : /*
                               1170                 :                :  * mdwriteback() -- Tell the kernel to write pages back to storage.
                               1171                 :                :  *
                               1172                 :                :  * This accepts a range of blocks because flushing several pages at once is
                               1173                 :                :  * considerably more efficient than doing so individually.
                               1174                 :                :  */
                               1175                 :                : void
 1107 peter@eisentraut.org     1176                 :UBC           0 : mdwriteback(SMgrRelation reln, ForkNumber forknum,
                               1177                 :                :             BlockNumber blocknum, BlockNumber nblocks)
                               1178                 :                : {
                               1179         [ #  # ]:              0 :     Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
                               1180                 :                : 
                               1181                 :                :     /*
                               1182                 :                :      * Issue flush requests in as few requests as possible; have to split at
                               1183                 :                :      * segment boundaries though, since those are actually separate files.
                               1184                 :                :      */
                               1185         [ #  # ]:              0 :     while (nblocks > 0)
                               1186                 :                :     {
                               1187                 :              0 :         BlockNumber nflush = nblocks;
                               1188                 :                :         pgoff_t     seekpos;
                               1189                 :                :         MdfdVec    *v;
                               1190                 :                :         int         segnum_start,
                               1191                 :                :                     segnum_end;
                               1192                 :                : 
                               1193                 :              0 :         v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
                               1194                 :                :                          EXTENSION_DONT_OPEN);
                               1195                 :                : 
                               1196                 :                :         /*
                               1197                 :                :          * We might be flushing buffers of already removed relations, that's
                               1198                 :                :          * ok, just ignore that case.  If the segment file wasn't open already
                               1199                 :                :          * (ie from a recent mdwrite()), then we don't want to re-open it, to
                               1200                 :                :          * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
                               1201                 :                :          * us with a descriptor to a file that is about to be unlinked.
                               1202                 :                :          */
                               1203         [ #  # ]:              0 :         if (!v)
                               1204                 :              0 :             return;
                               1205                 :                : 
                               1206                 :                :         /* compute offset inside the current segment */
                               1207                 :              0 :         segnum_start = blocknum / RELSEG_SIZE;
                               1208                 :                : 
                               1209                 :                :         /* compute number of desired writes within the current segment */
                               1210                 :              0 :         segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
                               1211         [ #  # ]:              0 :         if (segnum_start != segnum_end)
                               1212                 :              0 :             nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
                               1213                 :                : 
                               1214         [ #  # ]:              0 :         Assert(nflush >= 1);
                               1215         [ #  # ]:              0 :         Assert(nflush <= nblocks);
                               1216                 :                : 
  198 michael@paquier.xyz      1217                 :UNC           0 :         seekpos = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
                               1218                 :                : 
                               1219                 :              0 :         FileWriteback(v->mdfd_vfd, seekpos, (pgoff_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
                               1220                 :                : 
 1107 peter@eisentraut.org     1221                 :UBC           0 :         nblocks -= nflush;
                               1222                 :              0 :         blocknum += nflush;
                               1223                 :                :     }
                               1224                 :                : }
                               1225                 :                : 
                               1226                 :                : /*
                               1227                 :                :  * mdnblocks() -- Get the number of blocks stored in a relation.
                               1228                 :                :  *
                               1229                 :                :  * Important side effect: all active segments of the relation are opened
                               1230                 :                :  * and added to the md_seg_fds array.  If this routine has not been
                               1231                 :                :  * called, then only segments up to the last one actually touched
                               1232                 :                :  * are present in the array.
                               1233                 :                :  */
                               1234                 :                : BlockNumber
 6501 heikki.linnakangas@i     1235                 :CBC     3041217 : mdnblocks(SMgrRelation reln, ForkNumber forknum)
                               1236                 :                : {
                               1237                 :                :     MdfdVec    *v;
                               1238                 :                :     BlockNumber nblocks;
                               1239                 :                :     BlockNumber segno;
                               1240                 :                : 
 2095 bruce@momjian.us         1241                 :        3041217 :     mdopenfork(reln, forknum, EXTENSION_FAIL);
                               1242                 :                : 
                               1243                 :                :     /* mdopen has opened the first segment */
 3551 andres@anarazel.de       1244         [ -  + ]:        3041198 :     Assert(reln->md_num_open_segs[forknum] > 0);
                               1245                 :                : 
                               1246                 :                :     /*
                               1247                 :                :      * Start from the last open segments, to avoid redundant seeks.  We have
                               1248                 :                :      * previously verified that these segments are exactly RELSEG_SIZE long,
                               1249                 :                :      * and it's useless to recheck that each time.
                               1250                 :                :      *
                               1251                 :                :      * NOTE: this assumption could only be wrong if another backend has
                               1252                 :                :      * truncated the relation.  We rely on higher code levels to handle that
                               1253                 :                :      * scenario by closing and re-opening the md fd, which is handled via
                               1254                 :                :      * relcache flush.  (Since the checkpointer doesn't participate in
                               1255                 :                :      * relcache flush, it could have segment entries for inactive segments;
                               1256                 :                :      * that's OK because the checkpointer never needs to compute relation
                               1257                 :                :      * size.)
                               1258                 :                :      */
                               1259                 :        3041198 :     segno = reln->md_num_open_segs[forknum] - 1;
                               1260                 :        3041198 :     v = &reln->md_seg_fds[forknum][segno];
                               1261                 :                : 
                               1262                 :                :     for (;;)
                               1263                 :                :     {
 6501 heikki.linnakangas@i     1264                 :        3041198 :         nblocks = _mdnblocks(reln, forknum, v);
 9103 tgl@sss.pgh.pa.us        1265         [ -  + ]:        3041198 :         if (nblocks > ((BlockNumber) RELSEG_SIZE))
 8346 tgl@sss.pgh.pa.us        1266         [ #  # ]:UBC           0 :             elog(FATAL, "segment too big");
 9103 tgl@sss.pgh.pa.us        1267         [ +  - ]:CBC     3041198 :         if (nblocks < ((BlockNumber) RELSEG_SIZE))
                               1268                 :        3041198 :             return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
                               1269                 :                : 
                               1270                 :                :         /*
                               1271                 :                :          * If segment is exactly RELSEG_SIZE, advance to next one.
                               1272                 :                :          */
 9151 tgl@sss.pgh.pa.us        1273                 :UBC           0 :         segno++;
                               1274                 :                : 
                               1275                 :                :         /*
                               1276                 :                :          * We used to pass O_CREAT here, but that has the disadvantage that it
                               1277                 :                :          * might create a segment which has vanished through some operating
                               1278                 :                :          * system misadventure.  In such a case, creating the segment here
                               1279                 :                :          * undermines _mdfd_getseg's attempts to notice and report an error
                               1280                 :                :          * upon access to a missing segment.
                               1281                 :                :          */
 3551 andres@anarazel.de       1282                 :              0 :         v = _mdfd_openseg(reln, forknum, segno, 0);
                               1283         [ #  # ]:              0 :         if (v == NULL)
                               1284                 :              0 :             return segno * ((BlockNumber) RELSEG_SIZE);
                               1285                 :                :     }
                               1286                 :                : }
                               1287                 :                : 
                               1288                 :                : /*
                               1289                 :                :  * mdtruncate() -- Truncate relation to specified number of blocks.
                               1290                 :                :  *
                               1291                 :                :  * Guaranteed not to allocate memory, so it can be used in a critical section.
                               1292                 :                :  * Caller must have called smgrnblocks() to obtain curnblk while holding a
                               1293                 :                :  * sufficient lock to prevent a change in relation size, and not used any smgr
                               1294                 :                :  * functions for this relation or handled interrupts in between.  This makes
                               1295                 :                :  * sure we have opened all active segments, so that truncate loop will get
                               1296                 :                :  * them all!
                               1297                 :                :  *
                               1298                 :                :  * If nblocks > curnblk, the request is ignored when we are InRecovery,
                               1299                 :                :  * otherwise, an error is raised.
                               1300                 :                :  */
                               1301                 :                : void
  526 tmunro@postgresql.or     1302                 :CBC        1144 : mdtruncate(SMgrRelation reln, ForkNumber forknum,
                               1303                 :                :            BlockNumber curnblk, BlockNumber nblocks)
                               1304                 :                : {
                               1305                 :                :     BlockNumber priorblocks;
                               1306                 :                :     int         curopensegs;
                               1307                 :                : 
 9103 tgl@sss.pgh.pa.us        1308         [ -  + ]:           1144 :     if (nblocks > curnblk)
                               1309                 :                :     {
                               1310                 :                :         /* Bogus request ... but no complaint if InRecovery */
 7087 tgl@sss.pgh.pa.us        1311         [ #  # ]:UBC           0 :         if (InRecovery)
                               1312                 :              0 :             return;
                               1313         [ #  # ]:              0 :         ereport(ERROR,
                               1314                 :                :                 (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
                               1315                 :                :                         relpath(reln->smgr_rlocator, forknum).str,
                               1316                 :                :                         nblocks, curnblk)));
                               1317                 :                :     }
 9767 tgl@sss.pgh.pa.us        1318         [ +  + ]:CBC        1144 :     if (nblocks == curnblk)
 7087                          1319                 :            472 :         return;                 /* no work */
                               1320                 :                : 
                               1321                 :                :     /*
                               1322                 :                :      * Truncate segments, starting at the last one. Starting at the end makes
                               1323                 :                :      * managing the memory for the fd array easier, should there be errors.
                               1324                 :                :      */
 3551 andres@anarazel.de       1325                 :            672 :     curopensegs = reln->md_num_open_segs[forknum];
                               1326         [ +  + ]:           1344 :     while (curopensegs > 0)
                               1327                 :                :     {
                               1328                 :                :         MdfdVec    *v;
                               1329                 :                : 
                               1330                 :            672 :         priorblocks = (curopensegs - 1) * RELSEG_SIZE;
                               1331                 :                : 
                               1332                 :            672 :         v = &reln->md_seg_fds[forknum][curopensegs - 1];
                               1333                 :                : 
 9767 tgl@sss.pgh.pa.us        1334         [ -  + ]:            672 :         if (priorblocks > nblocks)
                               1335                 :                :         {
                               1336                 :                :             /*
                               1337                 :                :              * This segment is no longer active. We truncate the file, but do
                               1338                 :                :              * not delete it, for reasons explained in the header comments.
                               1339                 :                :              */
 3360 rhaas@postgresql.org     1340         [ #  # ]:UBC           0 :             if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
 7087 tgl@sss.pgh.pa.us        1341         [ #  # ]:              0 :                 ereport(ERROR,
                               1342                 :                :                         (errcode_for_file_access(),
                               1343                 :                :                          errmsg("could not truncate file \"%s\": %m",
                               1344                 :                :                                 FilePathName(v->mdfd_vfd))));
                               1345                 :                : 
 5769 rhaas@postgresql.org     1346         [ #  # ]:              0 :             if (!SmgrIsTemp(reln))
 6501 heikki.linnakangas@i     1347                 :              0 :                 register_dirty_segment(reln, forknum, v);
                               1348                 :                : 
                               1349                 :                :             /* we never drop the 1st segment */
 3551 andres@anarazel.de       1350         [ #  # ]:              0 :             Assert(v != &reln->md_seg_fds[forknum][0]);
                               1351                 :                : 
                               1352                 :              0 :             FileClose(v->mdfd_vfd);
                               1353                 :              0 :             _fdvec_resize(reln, forknum, curopensegs - 1);
                               1354                 :                :         }
 9103 tgl@sss.pgh.pa.us        1355         [ +  - ]:CBC         672 :         else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
                               1356                 :                :         {
                               1357                 :                :             /*
                               1358                 :                :              * This is the last segment we want to keep. Truncate the file to
                               1359                 :                :              * the right length. NOTE: if nblocks is exactly a multiple K of
                               1360                 :                :              * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
                               1361                 :                :              * keep it. This adheres to the invariant given in the header
                               1362                 :                :              * comments.
                               1363                 :                :              */
 8983 bruce@momjian.us         1364                 :            672 :             BlockNumber lastsegblocks = nblocks - priorblocks;
                               1365                 :                : 
  198 michael@paquier.xyz      1366         [ -  + ]:GNC         672 :             if (FileTruncate(v->mdfd_vfd, (pgoff_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
 7087 tgl@sss.pgh.pa.us        1367         [ #  # ]:UBC           0 :                 ereport(ERROR,
                               1368                 :                :                         (errcode_for_file_access(),
                               1369                 :                :                          errmsg("could not truncate file \"%s\" to %u blocks: %m",
                               1370                 :                :                                 FilePathName(v->mdfd_vfd),
                               1371                 :                :                                 nblocks)));
 5769 rhaas@postgresql.org     1372         [ +  + ]:CBC         672 :             if (!SmgrIsTemp(reln))
 6501 heikki.linnakangas@i     1373                 :            454 :                 register_dirty_segment(reln, forknum, v);
                               1374                 :                :         }
                               1375                 :                :         else
                               1376                 :                :         {
                               1377                 :                :             /*
                               1378                 :                :              * We still need this segment, so nothing to do for this and any
                               1379                 :                :              * earlier segment.
                               1380                 :                :              */
 3551 andres@anarazel.de       1381                 :UBC           0 :             break;
                               1382                 :                :         }
 3551 andres@anarazel.de       1383                 :CBC         672 :         curopensegs--;
                               1384                 :                :     }
                               1385                 :                : }
                               1386                 :                : 
                               1387                 :                : /*
                               1388                 :                :  * mdregistersync() -- Mark whole relation as needing fsync
                               1389                 :                :  */
                               1390                 :                : void
  827 heikki.linnakangas@i     1391                 :          32148 : mdregistersync(SMgrRelation reln, ForkNumber forknum)
                               1392                 :                : {
                               1393                 :                :     int         segno;
                               1394                 :                :     int         min_inactive_seg;
                               1395                 :                : 
                               1396                 :                :     /*
                               1397                 :                :      * NOTE: mdnblocks makes sure we have opened all active segments, so that
                               1398                 :                :      * the loop below will get them all!
                               1399                 :                :      */
                               1400                 :          32148 :     mdnblocks(reln, forknum);
                               1401                 :                : 
                               1402                 :          32148 :     min_inactive_seg = segno = reln->md_num_open_segs[forknum];
                               1403                 :                : 
                               1404                 :                :     /*
                               1405                 :                :      * Temporarily open inactive segments, then close them after sync.  There
                               1406                 :                :      * may be some inactive segments left opened after error, but that is
                               1407                 :                :      * harmless.  We don't bother to clean them up and take a risk of further
                               1408                 :                :      * trouble.  The next mdclose() will soon close them.
                               1409                 :                :      */
                               1410         [ -  + ]:          32148 :     while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
  827 heikki.linnakangas@i     1411                 :UBC           0 :         segno++;
                               1412                 :                : 
  827 heikki.linnakangas@i     1413         [ +  + ]:CBC       64296 :     while (segno > 0)
                               1414                 :                :     {
                               1415                 :          32148 :         MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
                               1416                 :                : 
                               1417                 :          32148 :         register_dirty_segment(reln, forknum, v);
                               1418                 :                : 
                               1419                 :                :         /* Close inactive segments immediately */
                               1420         [ -  + ]:          32148 :         if (segno > min_inactive_seg)
                               1421                 :                :         {
  827 heikki.linnakangas@i     1422                 :UBC           0 :             FileClose(v->mdfd_vfd);
                               1423                 :              0 :             _fdvec_resize(reln, forknum, segno - 1);
                               1424                 :                :         }
                               1425                 :                : 
  827 heikki.linnakangas@i     1426                 :CBC       32148 :         segno--;
                               1427                 :                :     }
                               1428                 :          32148 : }
                               1429                 :                : 
                               1430                 :                : /*
                               1431                 :                :  * mdimmedsync() -- Immediately sync a relation to stable storage.
                               1432                 :                :  *
                               1433                 :                :  * Note that only writes already issued are synced; this routine knows
                               1434                 :                :  * nothing of dirty buffers that may exist inside the buffer manager.  We
                               1435                 :                :  * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
                               1436                 :                :  * Consider a relation skipping WAL.  Suppose a checkpoint syncs blocks of
                               1437                 :                :  * some segment, then mdtruncate() renders that segment inactive.  If we
                               1438                 :                :  * crash before the next checkpoint syncs the newly-inactive segment, that
                               1439                 :                :  * segment may survive recovery, reintroducing unwanted data into the table.
                               1440                 :                :  */
                               1441                 :                : void
 6501                          1442                 :             14 : mdimmedsync(SMgrRelation reln, ForkNumber forknum)
                               1443                 :                : {
                               1444                 :                :     int         segno;
                               1445                 :                :     int         min_inactive_seg;
                               1446                 :                : 
                               1447                 :                :     /*
                               1448                 :                :      * NOTE: mdnblocks makes sure we have opened all active segments, so that
                               1449                 :                :      * the loop below will get them all!
                               1450                 :                :      */
 5528 peter_e@gmx.net          1451                 :             14 :     mdnblocks(reln, forknum);
                               1452                 :                : 
 2247 noah@leadboat.com        1453                 :             14 :     min_inactive_seg = segno = reln->md_num_open_segs[forknum];
                               1454                 :                : 
                               1455                 :                :     /*
                               1456                 :                :      * Temporarily open inactive segments, then close them after sync.  There
                               1457                 :                :      * may be some inactive segments left opened after fsync() error, but that
                               1458                 :                :      * is harmless.  We don't bother to clean them up and take a risk of
                               1459                 :                :      * further trouble.  The next mdclose() will soon close them.
                               1460                 :                :      */
                               1461         [ -  + ]:             14 :     while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
 2247 noah@leadboat.com        1462                 :UBC           0 :         segno++;
                               1463                 :                : 
 3551 andres@anarazel.de       1464         [ +  + ]:CBC          28 :     while (segno > 0)
                               1465                 :                :     {
                               1466                 :             14 :         MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
                               1467                 :                : 
                               1468                 :                :         /*
                               1469                 :                :          * fsyncs done through mdimmedsync() should be tracked in a separate
                               1470                 :                :          * IOContext than those done through mdsyncfiletag() to differentiate
                               1471                 :                :          * between unavoidable client backend fsyncs (e.g. those done during
                               1472                 :                :          * index build) and those which ideally would have been done by the
                               1473                 :                :          * checkpointer. Since other IO operations bypassing the buffer
                               1474                 :                :          * manager could also be tracked in such an IOContext, wait until
                               1475                 :                :          * these are also tracked to track immediate fsyncs.
                               1476                 :                :          */
 3360 rhaas@postgresql.org     1477         [ -  + ]:             14 :         if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
 2749 tmunro@postgresql.or     1478         [ #  # ]:UBC           0 :             ereport(data_sync_elevel(ERROR),
                               1479                 :                :                     (errcode_for_file_access(),
                               1480                 :                :                      errmsg("could not fsync file \"%s\": %m",
                               1481                 :                :                             FilePathName(v->mdfd_vfd))));
                               1482                 :                : 
                               1483                 :                :         /* Close inactive segments immediately */
 2247 noah@leadboat.com        1484         [ -  + ]:CBC          14 :         if (segno > min_inactive_seg)
                               1485                 :                :         {
 2247 noah@leadboat.com        1486                 :UBC           0 :             FileClose(v->mdfd_vfd);
                               1487                 :              0 :             _fdvec_resize(reln, forknum, segno - 1);
                               1488                 :                :         }
                               1489                 :                : 
 3551 andres@anarazel.de       1490                 :CBC          14 :         segno--;
                               1491                 :                :     }
 8032 tgl@sss.pgh.pa.us        1492                 :             14 : }
                               1493                 :                : 
                               1494                 :                : int
  427 andres@anarazel.de       1495                 :         536124 : mdfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
                               1496                 :                : {
                               1497                 :         536124 :     MdfdVec    *v = mdopenfork(reln, forknum, EXTENSION_FAIL);
                               1498                 :                : 
                               1499                 :         536124 :     v = _mdfd_getseg(reln, forknum, blocknum, false,
                               1500                 :                :                      EXTENSION_FAIL);
                               1501                 :                : 
  198 michael@paquier.xyz      1502                 :GNC      536124 :     *off = (pgoff_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
                               1503                 :                : 
                               1504         [ -  + ]:         536124 :     Assert(*off < (pgoff_t) BLCKSZ * RELSEG_SIZE);
                               1505                 :                : 
  427 andres@anarazel.de       1506                 :CBC      536124 :     return FileGetRawDesc(v->mdfd_vfd);
                               1507                 :                : }
                               1508                 :                : 
                               1509                 :                : /*
                               1510                 :                :  * register_dirty_segment() -- Mark a relation segment as needing fsync
                               1511                 :                :  *
                               1512                 :                :  * If there is a local pending-ops table, just make an entry in it for
                               1513                 :                :  * ProcessSyncRequests to process later.  Otherwise, try to pass off the
                               1514                 :                :  * fsync request to the checkpointer process.  If that fails, just do the
                               1515                 :                :  * fsync locally before returning (we hope this will not happen often
                               1516                 :                :  * enough to be a performance problem).
                               1517                 :                :  */
                               1518                 :                : static void
 6501 heikki.linnakangas@i     1519                 :        1211171 : register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
                               1520                 :                : {
                               1521                 :                :     FileTag     tag;
                               1522                 :                : 
 1424 rhaas@postgresql.org     1523                 :        1211171 :     INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
                               1524                 :                : 
                               1525                 :                :     /* Temp relations should never be fsync'd */
 5065 tgl@sss.pgh.pa.us        1526         [ -  + ]:        1211171 :     Assert(!SmgrIsTemp(reln));
                               1527                 :                : 
 2613 tmunro@postgresql.or     1528         [ +  + ]:        1211171 :     if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
                               1529                 :                :     {
                               1530                 :                :         instr_time  io_start;
                               1531                 :                : 
 1149 andres@anarazel.de       1532         [ -  + ]:             80 :         ereport(DEBUG1,
                               1533                 :                :                 (errmsg_internal("could not forward fsync request because request queue is full")));
                               1534                 :                : 
  458 michael@paquier.xyz      1535                 :             80 :         io_start = pgstat_prepare_io_time(track_io_timing);
                               1536                 :                : 
 1149 andres@anarazel.de       1537         [ -  + ]:             80 :         if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
 1149 andres@anarazel.de       1538         [ #  # ]:UBC           0 :             ereport(data_sync_elevel(ERROR),
                               1539                 :                :                     (errcode_for_file_access(),
                               1540                 :                :                      errmsg("could not fsync file \"%s\": %m",
                               1541                 :                :                             FilePathName(seg->mdfd_vfd))));
                               1542                 :                : 
                               1543                 :                :         /*
                               1544                 :                :          * We have no way of knowing if the current IOContext is
                               1545                 :                :          * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
                               1546                 :                :          * point, so count the fsync as being in the IOCONTEXT_NORMAL
                               1547                 :                :          * IOContext. This is probably okay, because the number of backend
                               1548                 :                :          * fsyncs doesn't say anything about the efficacy of the
                               1549                 :                :          * BufferAccessStrategy. And counting both fsyncs done in
                               1550                 :                :          * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
                               1551                 :                :          * IOCONTEXT_NORMAL is likely clearer when investigating the number of
                               1552                 :                :          * backend fsyncs.
                               1553                 :                :          */
 1149 andres@anarazel.de       1554                 :CBC          80 :         pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
                               1555                 :                :                                 IOOP_FSYNC, io_start, 1, 0);
                               1556                 :                :     }
10917 scrappy@hub.org          1557                 :        1211171 : }
                               1558                 :                : 
                               1559                 :                : /*
                               1560                 :                :  * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
                               1561                 :                :  */
                               1562                 :                : static void
 1424 rhaas@postgresql.org     1563                 :          47667 : register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
                               1564                 :                :                         BlockNumber segno)
                               1565                 :                : {
                               1566                 :                :     FileTag     tag;
                               1567                 :                : 
                               1568                 :          47667 :     INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
                               1569                 :                : 
                               1570                 :                :     /* Should never be used with temp relations */
                               1571         [ -  + ]:          47667 :     Assert(!RelFileLocatorBackendIsTemp(rlocator));
                               1572                 :                : 
 2613 tmunro@postgresql.or     1573                 :          47667 :     RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
 6771 tgl@sss.pgh.pa.us        1574                 :          47667 : }
                               1575                 :                : 
                               1576                 :                : /*
                               1577                 :                :  * register_forget_request() -- forget any fsyncs for a relation fork's segment
                               1578                 :                :  */
                               1579                 :                : static void
 1424 rhaas@postgresql.org     1580                 :         177705 : register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
                               1581                 :                :                         BlockNumber segno)
                               1582                 :                : {
                               1583                 :                :     FileTag     tag;
                               1584                 :                : 
                               1585                 :         177705 :     INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
                               1586                 :                : 
 2613 tmunro@postgresql.or     1587                 :         177705 :     RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
 7073 tgl@sss.pgh.pa.us        1588                 :         177705 : }
                               1589                 :                : 
                               1590                 :                : /*
                               1591                 :                :  * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
                               1592                 :                :  */
                               1593                 :                : void
 2613 tmunro@postgresql.or     1594                 :             67 : ForgetDatabaseSyncRequests(Oid dbid)
                               1595                 :                : {
                               1596                 :                :     FileTag     tag;
                               1597                 :                :     RelFileLocator rlocator;
                               1598                 :                : 
 1424 rhaas@postgresql.org     1599                 :             67 :     rlocator.dbOid = dbid;
                               1600                 :             67 :     rlocator.spcOid = 0;
                               1601                 :             67 :     rlocator.relNumber = 0;
                               1602                 :                : 
                               1603                 :             67 :     INIT_MD_FILETAG(tag, rlocator, InvalidForkNumber, InvalidBlockNumber);
                               1604                 :                : 
 2613 tmunro@postgresql.or     1605                 :             67 :     RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
 9345 vadim4o@yahoo.com        1606                 :             67 : }
                               1607                 :                : 
                               1608                 :                : /*
                               1609                 :                :  * DropRelationFiles -- drop files of all given relations
                               1610                 :                :  */
                               1611                 :                : void
 1424 rhaas@postgresql.org     1612                 :           2969 : DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
                               1613                 :                : {
                               1614                 :                :     SMgrRelation *srels;
                               1615                 :                :     int         i;
                               1616                 :                : 
  171 michael@paquier.xyz      1617                 :GNC        2969 :     srels = palloc_array(SMgrRelation, ndelrels);
 2886 fujii@postgresql.org     1618         [ +  + ]:CBC       11555 :     for (i = 0; i < ndelrels; i++)
                               1619                 :                :     {
  818 heikki.linnakangas@i     1620                 :           8586 :         SMgrRelation srel = smgropen(delrels[i], INVALID_PROC_NUMBER);
                               1621                 :                : 
 2886 fujii@postgresql.org     1622         [ +  + ]:           8586 :         if (isRedo)
                               1623                 :                :         {
                               1624                 :                :             ForkNumber  fork;
                               1625                 :                : 
                               1626         [ +  + ]:          42740 :             for (fork = 0; fork <= MAX_FORKNUM; fork++)
                               1627                 :          34192 :                 XLogDropRelation(delrels[i], fork);
                               1628                 :                :         }
                               1629                 :           8586 :         srels[i] = srel;
                               1630                 :                :     }
                               1631                 :                : 
                               1632                 :           2969 :     smgrdounlinkall(srels, ndelrels, isRedo);
                               1633                 :                : 
 2621 tomas.vondra@postgre     1634         [ +  + ]:          11555 :     for (i = 0; i < ndelrels; i++)
 2886 fujii@postgresql.org     1635                 :           8586 :         smgrclose(srels[i]);
                               1636                 :           2969 :     pfree(srels);
                               1637                 :           2969 : }
                               1638                 :                : 
                               1639                 :                : 
                               1640                 :                : /*
                               1641                 :                :  * _fdvec_resize() -- Resize the fork's open segments array
                               1642                 :                :  */
                               1643                 :                : static void
 3551 andres@anarazel.de       1644                 :        1738987 : _fdvec_resize(SMgrRelation reln,
                               1645                 :                :               ForkNumber forknum,
                               1646                 :                :               int nseg)
                               1647                 :                : {
                               1648         [ +  + ]:        1738987 :     if (nseg == 0)
                               1649                 :                :     {
                               1650         [ +  - ]:         643916 :         if (reln->md_num_open_segs[forknum] > 0)
                               1651                 :                :         {
                               1652                 :         643916 :             pfree(reln->md_seg_fds[forknum]);
                               1653                 :         643916 :             reln->md_seg_fds[forknum] = NULL;
                               1654                 :                :         }
                               1655                 :                :     }
                               1656         [ +  - ]:        1095071 :     else if (reln->md_num_open_segs[forknum] == 0)
                               1657                 :                :     {
                               1658                 :        1095071 :         reln->md_seg_fds[forknum] =
                               1659                 :        1095071 :             MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
                               1660                 :                :     }
  526 tmunro@postgresql.or     1661         [ #  # ]:UBC           0 :     else if (nseg > reln->md_num_open_segs[forknum])
                               1662                 :                :     {
                               1663                 :                :         /*
                               1664                 :                :          * It doesn't seem worthwhile complicating the code to amortize
                               1665                 :                :          * repalloc() calls.  Those are far faster than PathNameOpenFile() or
                               1666                 :                :          * FileClose(), and the memory context internally will sometimes avoid
                               1667                 :                :          * doing an actual reallocation.
                               1668                 :                :          */
 3551 andres@anarazel.de       1669                 :              0 :         reln->md_seg_fds[forknum] =
                               1670                 :              0 :             repalloc(reln->md_seg_fds[forknum],
                               1671                 :                :                      sizeof(MdfdVec) * nseg);
                               1672                 :                :     }
                               1673                 :                :     else
                               1674                 :                :     {
                               1675                 :                :         /*
                               1676                 :                :          * We don't reallocate a smaller array, because we want mdtruncate()
                               1677                 :                :          * to be able to promise that it won't allocate memory, so that it is
                               1678                 :                :          * allowed in a critical section.  This means that a bit of space in
                               1679                 :                :          * the array is now wasted, until the next time we add a segment and
                               1680                 :                :          * reallocate.
                               1681                 :                :          */
                               1682                 :                :     }
                               1683                 :                : 
 3551 andres@anarazel.de       1684                 :CBC     1738987 :     reln->md_num_open_segs[forknum] = nseg;
10600 vadim4o@yahoo.com        1685                 :        1738987 : }
                               1686                 :                : 
                               1687                 :                : /*
                               1688                 :                :  * Return the filename for the specified segment of the relation.  The path
                               1689                 :                :  * is returned by value in an MdPathStr, so nothing is palloc'd here.
                               1690                 :                :  */
                               1691                 :                : static MdPathStr
 6142 heikki.linnakangas@i     1692                 :          32174 : _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
                               1693                 :                : {
                               1694                 :                :     RelPathStr  path;
                               1695                 :                :     MdPathStr   fullpath;
                               1696                 :                : 
 1424 rhaas@postgresql.org     1697                 :          32174 :     path = relpath(reln->smgr_rlocator, forknum);
                               1698                 :                : 
10492 bruce@momjian.us         1699         [ +  - ]:          32174 :     if (segno > 0)
  459 andres@anarazel.de       1700                 :          32174 :         sprintf(fullpath.str, "%s.%u", path.str, segno);
                               1701                 :                :     else
  459 andres@anarazel.de       1702                 :UBC           0 :         strcpy(fullpath.str, path.str);
                               1703                 :                : 
 6142 heikki.linnakangas@i     1704                 :CBC       32174 :     return fullpath;
                               1705                 :                : }
                               1706                 :                : 
                               1707                 :                : /*
                               1708                 :                :  * Open the specified segment of the relation,
                               1709                 :                :  * and make a MdfdVec object for it.  Returns NULL on failure.
                               1710                 :                :  */
                               1711                 :                : static MdfdVec *
                               1712                 :          32162 : _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
                               1713                 :                :               int oflags)
                               1714                 :                : {
                               1715                 :                :     MdfdVec    *v;
                               1716                 :                :     File        fd;
                               1717                 :                :     MdPathStr   fullpath;
                               1718                 :                : 
                               1719                 :          32162 :     fullpath = _mdfd_segpath(reln, forknum, segno);
                               1720                 :                : 
                               1721                 :                :     /* open the file */
  459 andres@anarazel.de       1722                 :          32162 :     fd = PathNameOpenFile(fullpath.str, _mdfd_open_flags() | oflags);
                               1723                 :                : 
10492 bruce@momjian.us         1724         [ +  - ]:          32162 :     if (fd < 0)
 8179 neilc@samurai.com        1725                 :          32162 :         return NULL;
                               1726                 :                : 
                               1727                 :                :     /*
                               1728                 :                :      * Segments are always opened in order from lowest to highest, so we must
                               1729                 :                :      * be adding a new one at the end.
                               1730                 :                :      */
 2315 tmunro@postgresql.or     1731         [ #  # ]:UBC           0 :     Assert(segno == reln->md_num_open_segs[forknum]);
                               1732                 :                : 
                               1733                 :              0 :     _fdvec_resize(reln, forknum, segno + 1);
                               1734                 :                : 
                               1735                 :                :     /* fill the entry */
 3551 andres@anarazel.de       1736                 :              0 :     v = &reln->md_seg_fds[forknum][segno];
10492 bruce@momjian.us         1737                 :              0 :     v->mdfd_vfd = fd;
 8034 tgl@sss.pgh.pa.us        1738                 :              0 :     v->mdfd_segno = segno;
                               1739                 :                : 
 6501 heikki.linnakangas@i     1740         [ #  # ]:              0 :     Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
                               1741                 :                : 
                               1742                 :                :     /* all done */
10133 bruce@momjian.us         1743                 :              0 :     return v;
                               1744                 :                : }
                               1745                 :                : 
                               1746                 :                : /*
                               1747                 :                :  * _mdfd_getseg() -- Find the segment of the relation holding the
                               1748                 :                :  *                   specified block.
                               1749                 :                :  *
                               1750                 :                :  * If the segment doesn't exist, we ereport, return NULL, or create the
                               1751                 :                :  * segment, according to "behavior".  Note: skipFsync is only used in the
                               1752                 :                :  * EXTENSION_CREATE case.
                               1753                 :                :  */
                               1754                 :                : static MdfdVec *
 6501 heikki.linnakangas@i     1755                 :CBC     3174741 : _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
                               1756                 :                :              bool skipFsync, int behavior)
                               1757                 :                : {
                               1758                 :                :     MdfdVec    *v;
                               1759                 :                :     BlockNumber targetseg;
                               1760                 :                :     BlockNumber nextsegno;
                               1761                 :                : 
                               1762                 :                :     /* some way to handle non-existent segments needs to be specified */
 3678 andres@anarazel.de       1763         [ -  + ]:        3174741 :     Assert(behavior &
                               1764                 :                :            (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL |
                               1765                 :                :             EXTENSION_DONT_OPEN));
                               1766                 :                : 
 7087 tgl@sss.pgh.pa.us        1767                 :        3174741 :     targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
                               1768                 :                : 
                               1769                 :                :     /* if an existing and opened segment, we're done */
 3551 andres@anarazel.de       1770         [ +  + ]:        3174741 :     if (targetseg < reln->md_num_open_segs[forknum])
                               1771                 :                :     {
                               1772                 :        2910195 :         v = &reln->md_seg_fds[forknum][targetseg];
                               1773                 :        2910195 :         return v;
                               1774                 :                :     }
                               1775                 :                : 
                               1776                 :                :     /* The caller only wants the segment if we already had it open. */
 1484 tmunro@postgresql.or     1777         [ -  + ]:         264546 :     if (behavior & EXTENSION_DONT_OPEN)
 1484 tmunro@postgresql.or     1778                 :UBC           0 :         return NULL;
                               1779                 :                : 
                               1780                 :                :     /*
                               1781                 :                :      * The target segment is not yet open. Iterate over all the segments
                               1782                 :                :      * between the last opened and the target segment. This way missing
                               1783                 :                :      * segments either raise an error, or get created (according to
                               1784                 :                :      * 'behavior'). Start with either the last opened, or the first segment if
                               1785                 :                :      * none was opened before.
                               1786                 :                :      */
 3551 andres@anarazel.de       1787         [ +  + ]:CBC      264546 :     if (reln->md_num_open_segs[forknum] > 0)
                               1788                 :             12 :         v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
                               1789                 :                :     else
                               1790                 :                :     {
 2509 tmunro@postgresql.or     1791                 :         264534 :         v = mdopenfork(reln, forknum, behavior);
 3551 andres@anarazel.de       1792         [ -  + ]:         264531 :         if (!v)
 3551 andres@anarazel.de       1793                 :UBC           0 :             return NULL;        /* if behavior & EXTENSION_RETURN_NULL */
                               1794                 :                :     }
                               1795                 :                : 
 3551 andres@anarazel.de       1796                 :CBC      264543 :     for (nextsegno = reln->md_num_open_segs[forknum];
                               1797         [ +  + ]:         264543 :          nextsegno <= targetseg; nextsegno++)
                               1798                 :                :     {
                               1799                 :             12 :         BlockNumber nblocks = _mdnblocks(reln, forknum, v);
                               1800                 :             12 :         int         flags = 0;
                               1801                 :                : 
                               1802         [ -  + ]:             12 :         Assert(nextsegno == v->mdfd_segno + 1);
                               1803                 :                : 
                               1804         [ -  + ]:             12 :         if (nblocks > ((BlockNumber) RELSEG_SIZE))
 3551 andres@anarazel.de       1805         [ #  # ]:UBC           0 :             elog(FATAL, "segment too big");
                               1806                 :                : 
 3551 andres@anarazel.de       1807         [ +  - ]:CBC          12 :         if ((behavior & EXTENSION_CREATE) ||
                               1808   [ -  +  -  - ]:             12 :             (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
                               1809                 :                :         {
                               1810                 :                :             /*
                               1811                 :                :              * Normally we will create new segments only if authorized by the
                               1812                 :                :              * caller (i.e., we are doing mdextend()).  But when doing WAL
                               1813                 :                :              * recovery, create segments anyway; this allows cases such as
                               1814                 :                :              * replaying WAL data that has a write into a high-numbered
                               1815                 :                :              * segment of a relation that was later deleted. We want to go
                               1816                 :                :              * ahead and create the segments so we can finish out the replay.
                               1817                 :                :              *
                               1818                 :                :              * We have to maintain the invariant that segments before the last
                               1819                 :                :              * active segment are of size RELSEG_SIZE; therefore, if
                               1820                 :                :              * extending, pad them out with zeroes if needed.  (This only
                               1821                 :                :              * matters if in recovery, or if the caller is extending the
                               1822                 :                :              * relation discontiguously, but that can happen in hash indexes.)
                               1823                 :                :              */
 3551 andres@anarazel.de       1824         [ #  # ]:UBC           0 :             if (nblocks < ((BlockNumber) RELSEG_SIZE))
                               1825                 :                :             {
 1148 tmunro@postgresql.or     1826                 :              0 :                 char       *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
                               1827                 :                :                                                      MCXT_ALLOC_ZERO);
                               1828                 :                : 
 3551 andres@anarazel.de       1829                 :              0 :                 mdextend(reln, forknum,
                               1830                 :              0 :                          nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
                               1831                 :                :                          zerobuf, skipFsync);
                               1832                 :              0 :                 pfree(zerobuf);
                               1833                 :                :             }
                               1834                 :              0 :             flags = O_CREAT;
                               1835                 :                :         }
  532 tmunro@postgresql.or     1836         [ +  - ]:CBC          12 :         else if (nblocks < ((BlockNumber) RELSEG_SIZE))
                               1837                 :                :         {
                               1838                 :                :             /*
                               1839                 :                :              * When not extending, only open the next segment if the current
                               1840                 :                :              * one is exactly RELSEG_SIZE.  If not (this branch), either
                               1841                 :                :              * return NULL or fail.
                               1842                 :                :              */
 3551 andres@anarazel.de       1843         [ -  + ]:             12 :             if (behavior & EXTENSION_RETURN_NULL)
                               1844                 :                :             {
                               1845                 :                :                 /*
                               1846                 :                :                  * Some callers discern between reasons for _mdfd_getseg()
                               1847                 :                :                  * returning NULL based on errno. As there's no failing
                               1848                 :                :                  * syscall involved in this case, explicitly set errno to
                               1849                 :                :                  * ENOENT, as that seems the closest interpretation.
                               1850                 :                :                  */
 3551 andres@anarazel.de       1851                 :UBC           0 :                 errno = ENOENT;
                               1852                 :              0 :                 return NULL;
                               1853                 :                :             }
                               1854                 :                : 
 3551 andres@anarazel.de       1855         [ +  - ]:CBC          12 :             ereport(ERROR,
                               1856                 :                :                     (errcode_for_file_access(),
                               1857                 :                :                      errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
                               1858                 :                :                             _mdfd_segpath(reln, forknum, nextsegno).str,
                               1859                 :                :                             blkno, nblocks)));
                               1860                 :                :         }
                               1861                 :                : 
 3551 andres@anarazel.de       1862                 :UBC           0 :         v = _mdfd_openseg(reln, forknum, nextsegno, flags);
                               1863                 :                : 
                               1864         [ #  # ]:              0 :         if (v == NULL)
                               1865                 :                :         {
                               1866         [ #  # ]:              0 :             if ((behavior & EXTENSION_RETURN_NULL) &&
                               1867         [ #  # ]:              0 :                 FILE_POSSIBLY_DELETED(errno))
                               1868                 :              0 :                 return NULL;
                               1869         [ #  # ]:              0 :             ereport(ERROR,
                               1870                 :                :                     (errcode_for_file_access(),
                               1871                 :                :                      errmsg("could not open file \"%s\" (target block %u): %m",
                               1872                 :                :                             _mdfd_segpath(reln, forknum, nextsegno).str,
                               1873                 :                :                             blkno)));
                               1874                 :                :         }
                               1875                 :                :     }
                               1876                 :                : 
10133 bruce@momjian.us         1877                 :CBC      264531 :     return v;
                               1878                 :                : }
                               1879                 :                : 
                               1880                 :                : /*
                               1881                 :                :  * Get number of blocks present in a single disk file
                               1882                 :                :  */
                               1883                 :                : static BlockNumber
 6501 heikki.linnakangas@i     1884                 :        4342781 : _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
                               1885                 :                : {
                               1886                 :                :     pgoff_t     len;
                               1887                 :                : 
 2761 tmunro@postgresql.or     1888                 :        4342781 :     len = FileSize(seg->mdfd_vfd);
 9544 bruce@momjian.us         1889         [ -  + ]:        4342781 :     if (len < 0)
 7087 tgl@sss.pgh.pa.us        1890         [ #  # ]:UBC           0 :         ereport(ERROR,
                               1891                 :                :                 (errcode_for_file_access(),
                               1892                 :                :                  errmsg("could not seek to end of file \"%s\": %m",
                               1893                 :                :                         FilePathName(seg->mdfd_vfd))));
                               1894                 :                :     /* note that this calculation will ignore any partial block at EOF */
 7087 tgl@sss.pgh.pa.us        1895                 :CBC     4342781 :     return (BlockNumber) (len / BLCKSZ);
                               1896                 :                : }
                               1897                 :                : 
                               1898                 :                : /*
                               1899                 :                :  * Sync a file to disk, given a file tag.  Write the path into an output
                               1900                 :                :  * buffer so the caller can use it in error messages.
                               1901                 :                :  *
                               1902                 :                :  * Return 0 on success, -1 on failure, with errno set.
                               1903                 :                :  */
                               1904                 :                : int
 2613 tmunro@postgresql.or     1905                 :UBC           0 : mdsyncfiletag(const FileTag *ftag, char *path)
                               1906                 :                : {
  818 heikki.linnakangas@i     1907                 :              0 :     SMgrRelation reln = smgropen(ftag->rlocator, INVALID_PROC_NUMBER);
                               1908                 :                :     File        file;
                               1909                 :                :     instr_time  io_start;
                               1910                 :                :     bool        need_to_close;
                               1911                 :                :     int         result,
                               1912                 :                :                 save_errno;
                               1913                 :                : 
                               1914                 :                :     /* See if we already have the file open, or need to open it. */
 2359 tmunro@postgresql.or     1915         [ #  # ]:              0 :     if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
                               1916                 :                :     {
                               1917                 :              0 :         file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
                               1918                 :              0 :         strlcpy(path, FilePathName(file), MAXPGPATH);
                               1919                 :              0 :         need_to_close = false;
                               1920                 :                :     }
                               1921                 :                :     else
                               1922                 :                :     {
                               1923                 :                :         MdPathStr   p;
                               1924                 :                : 
                               1925                 :              0 :         p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
  459 andres@anarazel.de       1926                 :              0 :         strlcpy(path, p.str, MD_PATH_STR_MAXLEN);
                               1927                 :                : 
 1148 tmunro@postgresql.or     1928                 :              0 :         file = PathNameOpenFile(path, _mdfd_open_flags());
 2359                          1929         [ #  # ]:              0 :         if (file < 0)
                               1930                 :              0 :             return -1;
                               1931                 :              0 :         need_to_close = true;
                               1932                 :                :     }
                               1933                 :                : 
  458 michael@paquier.xyz      1934                 :              0 :     io_start = pgstat_prepare_io_time(track_io_timing);
                               1935                 :                : 
                               1936                 :                :     /* Sync the file. */
 2359 tmunro@postgresql.or     1937                 :              0 :     result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
                               1938                 :              0 :     save_errno = errno;
                               1939                 :                : 
                               1940         [ #  # ]:              0 :     if (need_to_close)
                               1941                 :              0 :         FileClose(file);
                               1942                 :                : 
 1149 andres@anarazel.de       1943                 :              0 :     pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
                               1944                 :                :                             IOOP_FSYNC, io_start, 1, 0);
                               1945                 :                : 
 2359 tmunro@postgresql.or     1946                 :              0 :     errno = save_errno;
                               1947                 :              0 :     return result;
                               1948                 :                : }
                               1949                 :                : 
                               1950                 :                : /*
                               1951                 :                :  * Unlink a file, given a file tag.  Write the path into an output
                               1952                 :                :  * buffer so the caller can use it in error messages.
                               1953                 :                :  *
                               1954                 :                :  * Return 0 on success, -1 on failure, with errno set.
                               1955                 :                :  */
                               1956                 :                : int
 2613 tmunro@postgresql.or     1957                 :CBC       37137 : mdunlinkfiletag(const FileTag *ftag, char *path)
                               1958                 :                : {
                               1959                 :                :     RelPathStr  p;
                               1960                 :                : 
                               1961                 :                :     /* Compute the path. */
 1424 rhaas@postgresql.org     1962                 :          37137 :     p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
  459 andres@anarazel.de       1963                 :          37137 :     strlcpy(path, p.str, MAXPGPATH);
                               1964                 :                : 
                               1965                 :                :     /* Try to unlink the file. */
 2613 tmunro@postgresql.or     1966                 :          37137 :     return unlink(path);
                               1967                 :                : }
                               1968                 :                : 
                               1969                 :                : /*
                               1970                 :                :  * Check if a given candidate request matches a given tag, when processing
                               1971                 :                :  * a SYNC_FILTER_REQUEST request.  This will be called for all pending
                               1972                 :                :  * requests to find out whether to forget them.
                               1973                 :                :  */
                               1974                 :                : bool
                               1975                 :           7883 : mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
                               1976                 :                : {
                               1977                 :                :     /*
                               1978                 :                :      * For now we only use filter requests as a way to drop all scheduled
                               1979                 :                :      * callbacks relating to a given database, when dropping the database.
                               1980                 :                :      * We'll return true for all candidates that have the same database OID as
                               1981                 :                :      * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
                               1982                 :                :      */
 1424 rhaas@postgresql.org     1983                 :           7883 :     return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
                               1984                 :                : }
                               1985                 :                : 
                               1986                 :                : /*
                               1987                 :                :  * AIO completion callback for mdstartreadv().
                               1988                 :                :  */
                               1989                 :                : static PgAioResult
  427 andres@anarazel.de       1990                 :        1349995 : md_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
                               1991                 :                : {
                               1992                 :        1349995 :     PgAioTargetData *td = pgaio_io_get_target_data(ioh);
                               1993                 :        1349995 :     PgAioResult result = prior_result;
                               1994                 :                : 
                               1995         [ +  + ]:        1349995 :     if (prior_result.result < 0)
                               1996                 :                :     {
                               1997                 :             13 :         result.status = PGAIO_RS_ERROR;
                               1998                 :             13 :         result.id = PGAIO_HCB_MD_READV;
                               1999                 :                :         /* For "hard" errors, track the error number in error_data */
                               2000                 :             13 :         result.error_data = -prior_result.result;
                               2001                 :             13 :         result.result = 0;
                               2002                 :                : 
                               2003                 :                :         /*
                               2004                 :                :          * Immediately log a message about the IO error, but only to the
                               2005                 :                :          * server log. The reason to do so immediately is that the originator
                               2006                 :                :          * might not process the query result immediately (because it is busy
                               2007                 :                :          * doing another part of query processing) or at all (e.g. if it was
                               2008                 :                :          * cancelled or errored out due to another IO also failing).  The
                               2009                 :                :          * definer of the IO will emit an ERROR when processing the IO's
                               2010                 :                :          * results
                               2011                 :                :          */
                               2012                 :             13 :         pgaio_result_report(result, td, LOG_SERVER_ONLY);
                               2013                 :                : 
                               2014                 :             13 :         return result;
                               2015                 :                :     }
                               2016                 :                : 
                               2017                 :                :     /*
                               2018                 :                :      * As explained above smgrstartreadv(), the smgr API operates on the level
                               2019                 :                :      * of blocks, rather than bytes. Convert.
                               2020                 :                :      */
                               2021                 :        1349982 :     result.result /= BLCKSZ;
                               2022                 :                : 
                               2023         [ -  + ]:        1349982 :     Assert(result.result <= td->smgr.nblocks);
                               2024                 :                : 
                               2025         [ +  + ]:        1349982 :     if (result.result == 0)
                               2026                 :                :     {
                               2027                 :                :         /* consider 0 blocks read a failure */
                               2028                 :              3 :         result.status = PGAIO_RS_ERROR;
                               2029                 :              3 :         result.id = PGAIO_HCB_MD_READV;
                               2030                 :              3 :         result.error_data = 0;
                               2031                 :                : 
                               2032                 :                :         /* see comment above the "hard error" case */
                               2033                 :              3 :         pgaio_result_report(result, td, LOG_SERVER_ONLY);
                               2034                 :                : 
                               2035                 :              3 :         return result;
                               2036                 :                :     }
                               2037                 :                : 
                               2038         [ +  - ]:        1349979 :     if (result.status != PGAIO_RS_ERROR &&
                               2039         [ +  + ]:        1349979 :         result.result < td->smgr.nblocks)
                               2040                 :                :     {
                               2041                 :                :         /* partial reads should be retried at upper level */
                               2042                 :            166 :         result.status = PGAIO_RS_PARTIAL;
                               2043                 :            166 :         result.id = PGAIO_HCB_MD_READV;
                               2044                 :                :     }
                               2045                 :                : 
                               2046                 :        1349979 :     return result;
                               2047                 :                : }
                               2048                 :                : 
                               2049                 :                : /*
                               2050                 :                :  * AIO error reporting callback for mdstartreadv().
                               2051                 :                :  *
                               2052                 :                :  * Errors are encoded as follows:
                               2053                 :                :  * - PgAioResult.error_data != 0 encodes IO that failed with that errno
                               2054                 :                :  * - PgAioResult.error_data == 0 encodes IO that didn't read all data
                               2055                 :                :  */
                               2056                 :                : static void
                               2057                 :            195 : md_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
                               2058                 :                : {
                               2059                 :                :     RelPathStr  path;
                               2060                 :                : 
                               2061         [ -  + ]:            195 :     path = relpathbackend(td->smgr.rlocator,
                               2062                 :                :                           td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
                               2063                 :                :                           td->smgr.forkNum);
                               2064                 :                : 
                               2065         [ +  + ]:            195 :     if (result.error_data != 0)
                               2066                 :                :     {
                               2067                 :                :         /* for errcode_for_file_access() and %m */
                               2068                 :             26 :         errno = result.error_data;
                               2069                 :                : 
                               2070         [ +  - ]:             26 :         ereport(elevel,
                               2071                 :                :                 errcode_for_file_access(),
                               2072                 :                :                 errmsg("could not read blocks %u..%u in file \"%s\": %m",
                               2073                 :                :                        td->smgr.blockNum,
                               2074                 :                :                        td->smgr.blockNum + td->smgr.nblocks - 1,
                               2075                 :                :                        path.str));
                               2076                 :                :     }
                               2077                 :                :     else
                               2078                 :                :     {
                               2079                 :                :         /*
                               2080                 :                :          * NB: This will typically only be output in debug messages, while
                               2081                 :                :          * retrying a partial IO.
                               2082                 :                :          */
                               2083         [ +  - ]:            169 :         ereport(elevel,
                               2084                 :                :                 errcode(ERRCODE_DATA_CORRUPTED),
                               2085                 :                :                 errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
                               2086                 :                :                        td->smgr.blockNum,
                               2087                 :                :                        td->smgr.blockNum + td->smgr.nblocks - 1,
                               2088                 :                :                        path.str,
                               2089                 :                :                        result.result * (size_t) BLCKSZ,
                               2090                 :                :                        td->smgr.nblocks * (size_t) BLCKSZ));
                               2091                 :                :     }
                               2092                 :            179 : }

Generated by: LCOV version 2.5.0-beta